In [None]:
import numpy as np
import pandas as pd
from pyproj import Geod
import datetime
from matplotlib import pyplot as plt
import seaborn as snss

In [None]:
# Read the training and test tables from the input directory (paths are relative to the notebook).
train_data = pd.read_csv("../input/train.csv")
test_data = pd.read_csv("../input/test.csv")

In [None]:
# Utility functions

# Geodesic distance between paired lat/lon points, solved on the WGS84 ellipsoid
wgs84_geod = Geod(ellps='WGS84')
def get_distance(lat1,lon1,lat2,lon2):
    """Return the distance between point 1 and point 2.

    The arguments are handed to ``wgs84_geod.inv`` in (lon1, lat1, lon2, lat2)
    order, i.e. longitude first, even though this function takes latitude
    first. ``inv`` yields three values (two azimuths and the distance); only
    the distance is returned. Scalars or equal-length sequences are passed
    straight through to ``inv``.
    """
    _azimuth_fwd, _azimuth_back, separation = wgs84_geod.inv(lon1, lat1, lon2, lat2)
    return separation

# Seconds elapsed since midnight for a time-like object
def to_seconds(time):
    """Convert an object exposing ``hour``, ``minute`` and ``second`` to seconds.

    The result is hours and minutes folded into whole minutes, multiplied by
    60, plus the seconds component.
    """
    whole_minutes = time.hour * 60 + time.minute
    return whole_minutes * 60 + time.second

In [None]:
def normalize(data):
    """Min-max scale ``data`` so its minimum maps to 0 and its maximum to 1.

    ``data`` must provide ``min()``/``max()`` and support arithmetic with their
    results (e.g. a NumPy array or pandas Series/DataFrame). If the maximum
    equals the minimum the division is by zero; that case is not handled here.
    """
    lowest = data.min()
    value_range = data.max() - lowest
    return (data - lowest) / value_range

def process_data(data):
    """Turn a raw trips table into the feature table used by the model.

    The input frame is copied first, so the caller's DataFrame is left
    untouched (previously the 'dist' column was written into it).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns read or dropped here: 'pickup_latitude',
        'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude',
        'pickup_datetime' and 'id'. Any other columns are passed through.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'dist', 'pickup_weekday', 'pickup_month',
        'pickup_hour', 'pickup_min', 'pickup_second' and 'is_weekend' added,
        and the coordinate, 'pickup_datetime' and 'id' columns removed.
    """
    # Work on a private copy: the column assignments below would otherwise
    # modify the DataFrame object owned by the caller.
    data = data.copy()

    # Distance between pickup and dropoff from longitudes/latitudes, stored in 'dist'.
    # get_distance's result is divided by 1000 to express it in km.
    data['dist'] = get_distance(data['pickup_latitude'].tolist(), data['pickup_longitude'].tolist(),
                                data['dropoff_latitude'].tolist(), data['dropoff_longitude'].tolist())
    data['dist'] = data['dist'] / 1000

    # Encode store_and_fwd_flag: N -> 0, Y -> 1.
    # Note: this replace is applied to every column of the frame, not only the flag.
    data = data.replace({'N': 0, 'Y': 1})

    # Break the pickup timestamp into numeric calendar/clock features.
    data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
    pickup = data['pickup_datetime'].dt
    data['pickup_weekday'] = pickup.weekday
    data['pickup_month'] = pickup.month
    data['pickup_hour'] = pickup.hour
    data['pickup_min'] = pickup.minute
    data['pickup_second'] = pickup.second
    # weekday is 0 (Monday) .. 6 (Sunday), so values >= 5 are Saturday/Sunday.
    data['is_weekend'] = (data['pickup_weekday'] >= 5).astype(int)

    # Dropping columns no longer required
    data = data.drop(['pickup_latitude','pickup_longitude', 'pickup_datetime',
                      'dropoff_latitude','dropoff_longitude', 'id'], axis=1)

    return data

In [None]:
# Linear hypothesis: predictions are the product of the features and the weights
def hyp(theta, X):
    """Return ``np.dot(X, theta.T)``, the model output for every row of ``X``.

    ``theta`` must expose ``.T`` (e.g. a NumPy array); for a 1-D weight vector
    the transpose is a no-op and the result has one value per row of ``X``.
    """
    weights = theta.T
    return np.dot(X, weights)

# Loss: squared prediction errors, each scaled by 1 / (2 * number of rows), then summed
def loss_func(theta, X, Y):
    """Return the cost of weights ``theta`` on features ``X`` and targets ``Y``.

    Each squared difference between ``hyp(theta, X)`` and ``Y`` is divided by
    twice the number of rows of ``X`` before the terms are added up.
    """
    n_rows = X.shape[0]
    residuals = hyp(theta, X) - Y
    return np.sum((residuals ** 2) / (2 * n_rows))

def get_gradient(theta, X, Y):
    """Return the partial derivative of the loss with respect to each weight.

    Parameters
    ----------
    theta : numpy.ndarray
        Current weights, one per column of ``X``.
    X : numpy.ndarray
        2-D feature matrix (rows are samples, columns are features).
    Y : numpy.ndarray
        Target values, one per row of ``X``.

    Returns
    -------
    numpy.ndarray
        One entry per column ``i`` of ``X``:
        ``sum((hyp(theta, X) - Y) * X[:, i]) / X.shape[0]``.
    """
    # The residuals are the same for every feature, so evaluate the model once
    # instead of once per column.
    residuals = hyp(theta, X) - Y

    # dot(residuals, X) sums residuals * X[:, i] over the rows for all columns
    # at once, replacing the explicit per-feature loop.
    return np.dot(residuals, X) / X.shape[0]

def gradient_descent(X, Y, maxniter=20000, alpha=0.001, seed=None):
    """Fit the weights by batch gradient descent from a random start.

    Parameters
    ----------
    X : numpy.ndarray
        2-D feature matrix; one weight is fitted per column.
    Y : numpy.ndarray
        Target values, one per row of ``X``.
    maxniter : int, optional
        Number of update steps to run (there is no early stopping).
    alpha : float, optional
        Learning rate applied to the gradient at every step. Defaults to
        0.001, the value that used to be hard-coded.
    seed : int or None, optional
        When given, the initial weights are drawn from a dedicated
        ``RandomState(seed)`` so a run can be repeated exactly. When ``None``
        (the default) NumPy's global random state is used, as before.

    Returns
    -------
    tuple
        ``(thetas, costs)``: the final weight vector and a list holding the
        loss measured after each update.
    """
    # Both objects expose .rand(); choosing one here keeps the default path
    # identical to drawing from the global generator.
    rng = np.random if seed is None else np.random.RandomState(seed)
    thetas = rng.rand(X.shape[1])
    costs = []

    for _ in range(maxniter):
        thetas = thetas - alpha * get_gradient(thetas, X, Y)
        costs.append(loss_func(thetas, X, Y))

    return thetas, costs

In [None]:
# Build the training table, then split it into the feature matrix and the target vector.
proc_data = process_data(train_data)
# Optional outlier filters (currently disabled):
# proc_data = proc_data[(proc_data['trip_duration'] >= 120) & (proc_data['trip_duration'] <= 3000)]
# proc_data = proc_data[proc_data.trip_duration <= np.percentile(proc_data.trip_duration, 99)]
# proc_data = proc_data[proc_data.dist <= np.percentile(proc_data.dist, 98)]
Y = proc_data['trip_duration'].values
X = proc_data.drop('trip_duration', axis=1).values

# Fit the weights; one loss value is recorded per iteration.
max_iters = 5000
thetas, costs = gradient_descent(X, Y, max_iters)

# Loss curve over the iterations.
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.plot(np.arange(max_iters), costs)

In [None]:
# Predict trip durations for the test set with the fitted weights.
pred = hyp(thetas, process_data(test_data))

# Pair every prediction with its trip id and write the submission file.
submission = pd.DataFrame({
    'id': test_data['id'],
    'trip_duration': pred,
})
submission.to_csv("submission.csv", index=False)

# Summary statistics of the predictions, followed by the submission table itself.
print(pd.DataFrame(pred).describe())
print(submission)

count  30000.000000
mean     951.616440
std      551.389681
min      266.693274
25%      634.700283
50%      808.378487
75%     1046.484861
max     8028.482600

In [None]:
# Removing outliers

# 1-
# temp = train_data[train_data.trip_duration <= np.percentile(train_data.trip_duration, 99)]

# 2-
# temp = process_data(train_data)
# temp = temp[(temp['trip_duration'] >= 120) & (temp['trip_duration'] <= 3000)]
# temp = temp[(temp['dist'] > 0)]
# temp = temp[temp.dist <= np.percentile(temp.dist, 98)]

# print(temp.passenger_count.describe())

# print(temp.trip_duration.describe())

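# NOTE: seaborn is imported as `snss` in the first cell, so the `sns` calls below
# need that alias (or a corrected import) before they are uncommented.
# fig, ax = plt.subplots()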
# sns.distplot(temp['trip_duration'], hist=False, rug=True)
# sns.distplot(temp['dist'], hist=False, rug=True)