In [239]:
import numpy as np
import pandas as pd
from pyproj import Geod
import datetime
from matplotlib import pyplot as plt
import seaborn as snss
import haversine
from pandas.tseries.holiday import USFederalHolidayCalendar

In [240]:
# Load the raw training and test tables from CSV files in the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [241]:
# Utility functions

# Geodesic distance between pairs of lat-lon points on the WGS84 ellipsoid
wgs84_geod = Geod(ellps='WGS84')
def get_distance(lat1,lon1,lat2,lon2):
    """Return the geodesic distance between (lat1, lon1) and (lat2, lon2).

    The arguments are forwarded to ``wgs84_geod.inv`` in (lon, lat) order.
    That call yields a forward azimuth, a back azimuth and a distance; only
    the distance is returned (metres, per the pyproj documentation).
    """
    _fwd_azimuth, _back_azimuth, metres = wgs84_geod.inv(lon1, lat1, lon2, lat2)
    return metres

In [242]:
def normalize(data):
    """Min-max scale ``data`` to the [0, 1] range.

    For a DataFrame the minimum and maximum are taken per column; for a Series
    or a NumPy array they are taken over all values (that is what ``.min()`` /
    ``.max()`` return for those types).

    A constant column (max == min) would otherwise evaluate 0/0 and fill the
    column with NaN, which then propagates through every dot product in
    training. Its range is treated as 1, so the column maps to all zeros.

    Returns an object of the same type and shape as ``data``.
    """
    lowest = data.min()
    span = data.max() - lowest

    if isinstance(span, pd.Series):
        # Per-column ranges: only the zero-range columns are patched.
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1

    return (data - lowest) / span


def process_data(data):
    """Build the model's feature frame from raw trip records.

    Expects the columns ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude``, ``dropoff_longitude``, ``pickup_datetime`` and ``id``.

    Columns added:
      * ``dist`` - geodesic distance from ``get_distance``, divided by 1000 (km).
      * ``haversine_distance`` - great-circle distance from the ``haversine``
        package (its default unit; presumably km - confirm against its docs).
      * ``manhattan_distance`` - ``|d_lon| + |d_lat|`` in the units of the
        coordinate columns (not converted to a length).
      * ``log_*`` - ``log(1 + x)`` of each of the three distances.

    ``store_and_fwd_flag`` (when present) is encoded as N -> 0, Y -> 1; any
    other value becomes NaN. The raw coordinate, datetime and id columns are
    dropped from the result.

    The input frame is NOT modified; a new DataFrame is returned.
    """
    # Work on a copy so the caller's frame (train_data / test_data) is not
    # silently extended with the feature columns created below.
    data = data.copy()

    # Calculating distance (m) based on longitudes/latitudes, stored in km in 'dist'
    data['dist'] = get_distance(data['pickup_latitude'].tolist(), data['pickup_longitude'].tolist(),
                                data['dropoff_latitude'].tolist(), data['dropoff_longitude'].tolist())
    data['dist'] = data['dist'] / 1000

    data['haversine_distance'] = data.apply(
        lambda r: haversine.haversine((r['pickup_latitude'], r['pickup_longitude']),
                                      (r['dropoff_latitude'], r['dropoff_longitude'])),
        axis=1)

    data['manhattan_distance'] = ((data['dropoff_longitude'] - data['pickup_longitude']).abs() +
                                  (data['dropoff_latitude'] - data['pickup_latitude']).abs())

    # log1p(x) == log(1 + x), but stays accurate for very small distances.
    data['log_distance'] = np.log1p(data['dist'])
    data['log_haversine_distance'] = np.log1p(data['haversine_distance'])
    data['log_manhattan_distance'] = np.log1p(data['manhattan_distance'])

    # Encode store_and_fwd_flag: N -> 0, Y -> 1. Only this column is touched; a
    # frame-wide replace() would also rewrite any other cell equal to 'N'/'Y'
    # and relies on object->int downcasting that recent pandas deprecates.
    if 'store_and_fwd_flag' in data.columns:
        data['store_and_fwd_flag'] = data['store_and_fwd_flag'].map({'N': 0, 'Y': 1})

    # Dropping columns no longer required
    unused_columns = ['pickup_latitude', 'pickup_longitude', 'pickup_datetime',
                      'dropoff_latitude', 'dropoff_longitude', 'id']
    # NOTE(review): if the input carries a raw 'dropoff_datetime' column
    # (presumably only the training file does - TODO confirm against train.csv),
    # it is an unparsed string that cannot be min-max scaled and would give the
    # training matrix a column the test matrix lacks, so drop it when present.
    if 'dropoff_datetime' in data.columns:
        unused_columns.append('dropoff_datetime')

    return data.drop(unused_columns, axis=1)

In [243]:
# Hypothesis function (linear model)
def hyp(theta, X):
    """Return the linear prediction ``X . theta^T``.

    ``X`` holds one sample per row. For a 1-D ``theta`` the transpose is a
    no-op and the result has one prediction per row of ``X``.
    """
    weights = theta.T
    return np.dot(X, weights)

# The loss function in our case is half the mean of the squared errors
def loss_func(theta, X, Y):
    """Return ``sum((hyp(theta, X) - Y)**2 / (2 * m))`` with ``m = X.shape[0]``."""
    n_samples = X.shape[0]
    residuals = hyp(theta, X) - Y
    scaled_squares = (residuals ** 2) / (2 * n_samples)
    return np.sum(scaled_squares)

def get_graident(theta, X, Y):
    """Return the gradient of ``loss_func`` with respect to ``theta``.

    Component ``i`` is ``sum((hyp(theta, X) - Y) * X[:, i]) / X.shape[0]``.

    The residual vector is computed once and all components are obtained with
    a single vector-matrix product, instead of re-evaluating the full
    hypothesis inside a Python loop for every feature.

    ``X`` is a 2-D array (samples x features); ``theta`` and ``Y`` are 1-D.
    Returns a 1-D array with one entry per feature.

    (The misspelled name is kept because other cells call it by this name.)
    """
    residuals = hyp(theta, X) - Y
    return np.dot(residuals, X) / X.shape[0]


def gradient_descent(X, Y, maxniter=20000, alpha=0.01, seed=None):
    """Fit linear-model weights by batch gradient descent.

    Parameters
    ----------
    X : 2-D array, one sample per row.
    Y : 1-D array of targets.
    maxniter : number of update steps to run (there is no early stopping).
    alpha : learning rate applied to every step (default 0.01, the value that
        was previously hard-coded).
    seed : optional seed for the random initial weights. When ``None`` the
        weights are drawn from NumPy's global random state, as before, so
        results differ from run to run unless that state is seeded elsewhere.

    Returns
    -------
    (thetas, costs) : the final weight vector and a list holding the value of
        ``loss_func`` after each update step.
    """
    n_features = X.shape[1]
    if seed is None:
        thetas = np.random.rand(n_features,)
    else:
        thetas = np.random.default_rng(seed).random(n_features)

    costs = []
    for _ in range(maxniter):
        thetas = thetas - (alpha * get_graident(thetas, X, Y))
        costs.append(loss_func(thetas, X, Y))

    return thetas, costs

In [244]:
# Prepare the training matrices: min-max scaled features and the raw target.
proc_data = process_data(train_data)
Y = proc_data['trip_duration'].values
X = normalize(proc_data.drop('trip_duration', axis=1)).values

# Fit the weights, then plot the loss recorded after every iteration.
max_iters = 10000
thetas, costs = gradient_descent(X, Y, max_iters)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.plot(np.arange(max_iters), costs)

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
# Mean of the last column of the normalized training matrix X.
X[:, -1].mean()

In [None]:
# Inspect the weight vector returned by gradient_descent.
thetas

In [None]:
# Score the test set with the fitted weights and write the submission file.
train_features = proc_data.drop('trip_duration', axis=1)
# Select the test columns in the training column order so that each weight in
# thetas meets the feature it was fitted on (raises KeyError if one is missing).
test_features = process_data(test_data)[train_features.columns]

# Scale the test features with the TRAINING min/max. thetas was fitted on
# features scaled by those statistics; re-deriving min/max from the test set
# would put the two sets on different scales and skew the predictions.
train_min = train_features.min()
train_range = train_features.max() - train_min
test_scaled = (test_features - train_min) / train_range

pred = hyp(thetas, test_scaled.values)
submission = pd.DataFrame({'id':test_data['id'],'trip_duration':pred})
submission.to_csv("kaggle_submission.csv", index=False)

submission['trip_duration'].describe()

In [None]:
# Display the submission frame (columns: id, trip_duration).
submission