In [150]:
import numpy as np
import pandas as pd
from pyproj import Geod
import datetime
from matplotlib import pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("error")

In [148]:
train_data = pd.read_csv( "../input/train.csv" )
test_data = pd.read_csv( "../input/test.csv" )


In [59]:
# Utiliy functions

#Get distance between pairs of lat-lon points
wgs84_geod = Geod(ellps='WGS84')
def get_distance(lat1,lon1,lat2,lon2):
    az12,az21,dist = wgs84_geod.inv(lon1,lat1,lon2,lat2)
    return dist

# Convert time object to seconds
def to_seconds(time):
    return (time.hour * 60 + time.minute) * 60 + time.second

In [262]:
def normalize(data):
    return ((data - data.min()) / (data.max() - data.min()))

def process_data(data):
    # Calculating distance (km) based on longitude/latituides and adding it in a new column 'dist'
    data['dist'] = get_distance(data['pickup_latitude'].tolist(), data['pickup_longitude'].tolist(),
                                data['dropoff_latitude'].tolist(), data['dropoff_longitude'].tolist())
    
    data['dist'] = data['dist'] / 1000
#     data['dist_sqr'] = data['dist']**2
    
    # Replacing N of store_and_fwd_flag with 0 and Y with 1
    data = data.replace({'N': 0, 'Y': 1})

#     data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])    
#     data['pickup_day_of_week'] = data.pickup_datetime.dt.weekday
#     data['pickup_day_of_month'] = data.pickup_datetime.dt.days_in_month
#     data['pickup_month'] = data.pickup_datetime.dt.month
#     data['pickup_hour'] = data.pickup_datetime.dt.hour
#     data['pickup_min'] = data.pickup_datetime.dt.minute
#     data['is_weekend'] = data.pickup_day_of_week.map(lambda x: 1 if x >= 5 else 0)
#     data['is_night_time'] = [1 if (i==0 or i>=19)  else 0 for i in data['pickup_hour']]
#     data['pickup_latitude'] = data['pickup_latitude']**2
#     data['pickup_longitude'] = data['pickup_longitude']**2
#     data['dropoff_longitude'] = data['dropoff_longitude']**2
#     data['dropoff_latitude'] = data['dropoff_latitude']**2
    
    # Dropping columns no longer required
    data = data.drop([ 'pickup_latitude', 'pickup_longitude', 'pickup_datetime',
                       'dropoff_latitude','dropoff_longitude', 'id'], axis=1)


#     data = data.drop(['pickup_datetime', 'id'], axis=1)
            
    return data

In [259]:
# Hypothesis function
def hyp(theta, X):
    return np.dot(X, theta.T)  

# The loss function in our case is the sum of the squared error
def loss_func(theta, X, Y):
    return np.sum(((hyp(theta, X) - Y)**2) / (2 * X.shape[0]))

def get_gradient(theta, X, Y):
    derivatives = []
  
    for i in range(0, X.shape[1]):
        derivatives.append(np.sum((hyp(theta, X) - Y) * X[:, i]) / X.shape[0])

    return np.array(derivatives)

def gradient_descent(X, Y, maxniter):
    thetas = np.random.rand(X.shape[1],)
    alpha = 0.001
    costs = []
    
    for i in range(0, maxniter):
        thetas = thetas - (alpha * get_gradient(thetas, X, Y))
        costs.append(loss_func(thetas, X, Y))
        
    return thetas, costs

In [265]:
proc_data = process_data(train_data)
X = proc_data.drop('trip_duration', axis=1).values
Y = proc_data['trip_duration'].values

max_iters = 3500
thetas, costs = gradient_descent(X, Y, max_iters)
plt.ylabel('Loss')
plt.xlabel('Iterations')
plt.plot(np.arange(0, max_iters), costs)

RuntimeWarning: overflow encountered in square

In [264]:
proc_data = process_data(train_data)
proc_data
# sns.boxplot(proc_data['dist_sqr'])
# proc_data['dist_sqr'].describe()

Unnamed: 0,vendor_id,passenger_count,store_and_fwd_flag,trip_duration,dist,dist_sqr
0,1,1,0,617,3.900996,15.217771
1,2,5,0,546,2.681275,7.189234
2,2,1,0,678,3.723877,13.867262
3,2,1,0,386,1.489451,2.218465
4,2,1,0,850,1.234102,1.523009
5,2,1,0,864,2.377892,5.654370
6,1,2,0,1196,2.562133,6.564524
7,2,3,0,305,0.889656,0.791488
8,1,1,0,77,0.825548,0.681529
9,2,1,0,498,2.388412,5.704514


In [224]:
pred = hyp(thetas, process_data(test_data))
pd.DataFrame(pred).describe()

# submission = pd.DataFrame({'id':test_data['id'],'trip_duration':pred})
# submission.to_csv("submission.csv", index=False)

# print(pd.DataFrame(pred).describe())
# print(submission)

Unnamed: 0,0
count,30000.0
mean,942.179629
std,562.518045
min,306.233429
25%,611.39839
50%,800.443544
75%,1043.878722
max,8183.66766


In [None]:
# Removing outliers

# 1-
# temp = train_data[train_data.trip_duration <= np.percentile(train_data.trip_duration, 99)]

# 2-
# temp = process_data(train_data)
# temp = temp[(temp['trip_duration'] >= 120) & (temp['trip_duration'] <= 3000)]
# temp = temp[(temp['dist'] > 0)]
# temp = temp[temp.dist <= np.percentile(temp.dist, 98)]

# print(temp.passenger_count.describe())

# print(temp.trip_duration.describe())

# fig, ax = plt.subplots()
# sns.distplot(temp['trip_duration'], hist=False, rug=True)
# sns.distplot(temp['dist'], hist=False, rug=True)