In [39]:
import numpy as np
import pandas as pd
from pyproj import Geod
import datetime
from matplotlib import pyplot as plt
import seaborn as sns

In [40]:
train_data = pd.read_csv( "../input/train.csv" )
test_data = pd.read_csv( "../input/test.csv" )

In [34]:
# Utiliy functions

#Get distance between pairs of lat-lon points
wgs84_geod = Geod(ellps='WGS84')
def get_distance(lat1,lon1,lat2,lon2):
    az12,az21,dist = wgs84_geod.inv(lon1,lat1,lon2,lat2)
    return

# Convert time object to seconds
def to_seconds(time):
    return (time.hour * 60 + time.minute) * 60 + time.second

In [48]:
def normalize(data):
    return (data - data.mean()) / data.std()


def remove_outliers(data, is_test=False):
    if not is_test: 
        data = data[(data['trip_duration'] >= 120) & (data['trip_duration'] <= 3000)]
    
    data = data[(data['dist'] > 0)]
    data = data[data.dist <= np.percentile(data.dist, 98)] 

    return data


def process_data(data):
    # Calculating distance (m) based on longitude/latituides and adding it in a new column 'dist'
    data['dist'] = get_distance(data['pickup_latitude'].tolist(), data['pickup_longitude'].tolist(),
                                data['dropoff_latitude'].tolist(), data['dropoff_longitude'].tolist())
    
    # Replacing N of store_and_fwd_flag with 0 and Y with 1
    data = data.replace({'N': 0, 'Y': 1})

    data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])    
    data['pickup_weekday'] = data.pickup_datetime.dt.weekday
    data['is_weekend'] = data.pickup_weekday.map(lambda x: 1 if x >= 5 else 0)
    
    # Dropping columns no longer required
    data = data.drop(['pickup_latitude','pickup_longitude', 'pickup_datetime',
                      'dropoff_latitude','dropoff_longitude', 'id'], axis=1)
     
    return data

In [49]:
# Hypothesis function
def hyp(theta, X):
    return np.dot(X, theta.T)  

# The loss function in our case is the sum of the squared error
def loss_func(theta, X, Y):
    return np.sum(((hyp(theta, X) - Y)**2) / (2 * X.shape[0]))

def get_graident(theta, X, Y):
    derivatives = []
  
    for i in range(0, X.shape[1]):
        derivatives.append(np.sum((hyp(theta, X) - Y) * X[:, i]) / X.shape[0])

    return np.array(derivatives)


def gradient_descent(X, Y, maxniter=20000):
    thetas = np.random.rand(X.shape[1],)
    alpha = 0.001
    costs = []
    
    for i in range(0, maxniter):
        thetas = thetas - (alpha * get_graident(thetas, X, Y))
        costs.append(loss_func(thetas, X, Y))
        
    return thetas, costs

In [50]:
proc_data = process_data(train_data)
without_outliers = remove_outliers(proc_data)
X = without_outliers.drop('trip_duration', axis=1).values
Y = without_outliers['trip_duration'].values
X = normalize(X) 

max_iters = 5000
thetas, costs = gradient_descent(X, Y, max_iters)

plt.ylabel('Loss')
plt.xlabel('Iterations')
plt.plot(np.arange(0, max_iters), costs)

IndexError: cannot do a non-empty take from an empty axes.

In [None]:
# test_data_proc = process_data(test_data)
# test_wo_outliers = remove_outliers(test_data_proc, is_test=True)
# test_data_norm = normalize(test_wo_outliers)

In [None]:
# pred = np.dot(test_data_norm, thetas)
# pred = np.absolute(pred)
# pred = pred.astype(int)

# submission = pd.DataFrame({'id':test_wo_outliers['id'],'trip_duration':pred})
# submission.to_csv("submission.csv", index=False)
# submission

In [None]:
# train_data.trip_duration.describe()

In [None]:
# Removing outliers

# 1-
# temp = train_data[train_data.trip_duration <= np.percentile(train_data.trip_duration, 99)]

# 2-
# temp = process_data(train_data)
# temp = temp[(temp['trip_duration'] >= 120) & (temp['trip_duration'] <= 3000)]
# temp = temp[(temp['dist'] > 0)]
# temp = temp[temp.dist <= np.percentile(temp.dist, 98)]

# print(temp.passenger_count.describe())

# print(temp.trip_duration.describe())

# fig, ax = plt.subplots()
# sns.distplot(train_data['trip_duration'], hist=False, rug=True)
# sns.distplot(temp['dist'], hist=False, rug=True)

In [None]:
train_data['dist'].head()