In [15]:
import numpy as np
import pandas as pd
from pyproj import Geod
import datetime
from matplotlib import pyplot as plt

In [16]:
# Raw training and test tables, read from CSV files in the working directory.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [17]:
# Utility functions

# Distance between pairs of lat/lon points, measured on the WGS84 ellipsoid
wgs84_geod = Geod(ellps='WGS84')
def get_distance(lat1,lon1,lat2,lon2):
    """Return the distance between point(s) 1 and point(s) 2.

    The arguments are forwarded to ``wgs84_geod.inv`` in (lon, lat) order,
    which is the reverse of this function's (lat, lon) parameter order.
    Only the third value returned by ``inv`` (the distance) is kept; the two
    azimuths are discarded.
    """
    _forward_azimuth, _back_azimuth, separation = wgs84_geod.inv(lon1, lat1, lon2, lat2)
    return separation

# Seconds elapsed since midnight for a time-of-day object
def to_seconds(time):
    """Convert an object exposing ``hour``, ``minute`` and ``second`` to seconds.

    Returns ``hour * 3600 + minute * 60 + second``.
    """
    from_hours = time.hour * 3600
    from_minutes = time.minute * 60
    return from_hours + from_minutes + time.second

In [18]:
def process_data(data):
    """Turn a raw trips frame into the feature frame used for the regression.

    A NEW DataFrame is returned and the frame passed in is left untouched
    (earlier this function wrote a ``dist`` column into the caller's frame).

    Columns read from ``data``:
        pickup_latitude, pickup_longitude, dropoff_latitude,
        dropoff_longitude, pickup_datetime, id, and store_and_fwd_flag.

    Steps:
        * ``store_and_fwd_flag``: 'N' is recoded to 0 and 'Y' to 1. Only this
          column is recoded; other columns are not scanned for 'N'/'Y'.
        * Three columns are appended after the existing ones, in this order:
          ``dist`` (pickup-to-dropoff distance from ``get_distance``),
          ``pickup_day`` (weekday of the pickup, Monday=0) and
          ``pickup_hour`` (hour of the pickup, 0-23).
        * The coordinate columns, ``pickup_datetime`` and ``id`` are dropped.

    Any other column in ``data`` passes through unchanged. Features are
    returned on their original scales; no normalisation is applied.
    """
    # Distance is computed from the raw frame and attached via ``assign`` below,
    # so the caller's frame is never written to.
    trip_dist = get_distance(
        data['pickup_latitude'].tolist(), data['pickup_longitude'].tolist(),
        data['dropoff_latitude'].tolist(), data['dropoff_longitude'].tolist())

    # Vectorised datetime parsing; the ``.dt`` accessor replaces a per-row
    # Python loop and does not fail to unpack when the frame has no rows.
    pickup_ts = pd.to_datetime(data['pickup_datetime'])

    return (
        data
        .replace({'store_and_fwd_flag': {'N': 0, 'Y': 1}})
        .assign(dist=trip_dist,
                pickup_day=pickup_ts.dt.weekday,
                pickup_hour=pickup_ts.dt.hour)
        # Columns no longer required once the features above exist.
        .drop(['pickup_latitude', 'pickup_longitude', 'pickup_datetime',
               'dropoff_latitude', 'dropoff_longitude', 'id'], axis=1)
    )

In [19]:
# Linear hypothesis: predictions are the dot product of the inputs and weights
def hyp(theta, X):
    """Return ``np.dot(X, theta.T)``, the model output for inputs ``X``."""
    weights = theta.T
    return np.dot(X, weights)

# Loss: squared errors, each scaled by 1 / (2 * number of rows of X), summed
def loss_func(theta, X, Y):
    """Return the cost of ``theta`` on ``(X, Y)``.

    Computes ``sum((hyp(theta, X) - Y)**2 / (2 * m))`` where ``m`` is
    ``X.shape[0]``.
    """
    n_rows = X.shape[0]
    squared_errors = (hyp(theta, X) - Y) ** 2
    return np.sum(squared_errors / (2 * n_rows))

def get_graident(theta, X, Y):
    """Return the gradient of ``loss_func`` with respect to ``theta``.

    Entry ``i`` of the result is ``sum((hyp(theta, X) - Y) * X[:, i]) / m``
    with ``m = X.shape[0]``; the result has one entry per column of ``X``.

    The residual ``hyp(theta, X) - Y`` does not depend on the column index, so
    it is computed once and all column sums are taken with a single
    vector-matrix product, instead of re-evaluating the hypothesis for every
    feature. Results can differ from a per-column ``np.sum`` in the last
    floating-point digits because the summation order differs.

    The function name keeps its existing spelling because callers use it.
    """
    residuals = hyp(theta, X) - Y
    # (m,) . (m, n) -> (n,): the sum over rows of residual * feature, per column.
    return np.dot(residuals, X) / X.shape[0]


def gradient_descent(X, Y, maxniter=20000, alpha=0.001, seed=None):
    """Fit weights by fixed-step batch gradient descent.

    Parameters
    ----------
    X : 2-D array; one weight is created per column (``X.shape[1]``).
    Y : target values passed through to ``get_graident`` and ``loss_func``.
    maxniter : number of update steps to run (exactly this many are run).
    alpha : step size applied to the gradient at every update. Defaults to
        0.001, the value that was previously hard-coded.
    seed : optional seed for the random initial weights. ``None`` (default)
        draws from numpy's global random state, as before; an integer makes
        the initialisation, and therefore the whole run, reproducible.

    Returns
    -------
    (thetas, costs) : the final weight vector and a list holding the loss
        after each update, so ``len(costs) == maxniter``.

    No convergence or divergence check is made. NOTE(review): with features
    on very different scales a fixed step may make the loss grow instead of
    shrink -- consider scaling inputs or lowering ``alpha``.
    """
    n_features = X.shape[1]
    if seed is None:
        thetas = np.random.rand(n_features,)
    else:
        # Private generator: reproducible without touching the global state.
        thetas = np.random.RandomState(seed).rand(n_features,)

    costs = []
    for _ in range(maxniter):
        thetas = thetas - (alpha * get_graident(thetas, X, Y))
        costs.append(loss_func(thetas, X, Y))

    return thetas, costs

In [20]:
# Build the feature matrix and target vector from the processed training set.
train_data_proc = process_data(train_data)
Y = train_data_proc['trip_duration'].values
X = train_data_proc.drop(columns=['trip_duration']).values

# Fit the weights with batch gradient descent.
max_iters = 20000
thetas, costs = gradient_descent(X, Y, maxniter=max_iters)

# Loss curve: one recorded cost per iteration.
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.plot(np.arange(max_iters), costs)

  import sys
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  del sys.path[0]


KeyboardInterrupt: 

In [None]:
# Apply the same preprocessing to the test set, then predict with the fitted weights.
test_data_proc = process_data(test_data)
test_features = test_data_proc.values
pred = hyp(thetas, test_features)
pred