In [2]:
import pandas as pd 
import time 
import numpy as np
import datetime as dt

PATH = r"C:\Users\Rj\train.csv"

start_time = time.time()

sample_cols_to_keep =['fare_amount','pickup_datetime','pickup_longitude','pickup_latitude',
                      'dropoff_longitude','dropoff_latitude','passenger_count']

# First setup dataframe iterator, ‘usecols’ parameter filters the columns, and 'chunksize' 
# sets the number of rows per chunk in the csv.
# we are not using 10M rows out of 55M rows for faster execution
df_iter = pd.read_csv(PATH, chunksize=100000, usecols=sample_cols_to_keep,nrows= 10000000)

# haversine formulate to calculate total distance between two points on earth.
def haversine(lon1, lat1, lon2, lat2):
    lat1 = np.radians(lat1)
    lat2= np.radians(lat2)
    lon1 = np.radians(lon1)
    lon2 = np.radians(lon2)
    dlat=(lat2-lat1).abs()
    dlon=(lon2-lon1).abs()
    R = 6371 #radius of Earth
    a = (np.sin(dlat/2.0))**2 + np.cos(lat1) * np.cos(lat2) * (np.sin(dlon/2.0))**2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# removing timestamp dtype from data and featching and seperating useful data from that
# using new features in the predictive model for better prediction
def add_features(df):
    df['Distance_Travelled'] = haversine(df.dropoff_longitude,df.dropoff_latitude,df.pickup_longitude,df.pickup_latitude)
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    df['hour'] = df.pickup_datetime.dt.hour
    df['day'] = df.pickup_datetime.dt.day
    df['month'] = df.pickup_datetime.dt.month
    df['weekday'] = df.pickup_datetime.dt.weekday
    df['year'] = df.pickup_datetime.dt.year 
    
# this list will store the filtered dataframes for later concatenation 
df_lst = [] 

# Iterate over the file based on the criteria and append to the list
# And removing useless data
for df_ in df_iter: 
        tmp_df = (df_.rename(columns={col: col.lower() for col in df_.columns}) # filter eg. rows where 'col_1' value grater than one
                                  .pipe(lambda x:  x[x.passenger_count > 0])
                                  .pipe(lambda x:  x[x.fare_amount > 1])
                                  .pipe(lambda x:  x[x.fare_amount < 500])
                                  .pipe(lambda x:  x[x.passenger_count < 6])
                                  .pipe(lambda x:  x[x.pickup_longitude < -72])
                                  .pipe(lambda x:  x[x.pickup_longitude > -75])
                                  .pipe(lambda x:  x[x.pickup_latitude > 40.2])
                                  .pipe(lambda x:  x[x.pickup_latitude < 42 ]) 
                                  .pipe(lambda x:  x[x.dropoff_longitude < -72])
                                  .pipe(lambda x:  x[x.dropoff_longitude > -75])
                                  .pipe(lambda x:  x[x.dropoff_latitude > 40])
                                  .pipe(lambda x:  x[x.dropoff_latitude < 42 ]))
        add_features(tmp_df)      
        df_lst += [tmp_df.copy()] 
                                     
print('%s seconds' % (time.time() - start_time))

2229.689437150955 seconds


In [3]:
# And finally combine filtered df_lst into the final lareger output say 'df_final' dataframe 
df_final = pd.concat(df_lst)
# df_final.describe()
print(df_final.shape)

(9546017, 13)


In [5]:

start_time = time.time()

test_df = pd.read_csv(r"C:\Users\Rj\test.csv")
# Also adding more features to test data
add_features(test_df)

print('%s seconds' % (time.time() - start_time))

2.513366937637329 seconds


In [6]:
# Removing rows with null values and removing column with timestamp dtype which is no longer needed.
df_final.dropna(how = 'any', axis = 'rows', inplace=True)
df_final.drop(['pickup_datetime'],axis=1,inplace=True)

In [8]:
from sklearn import preprocessing
# creating feature matrix and label vector
X = df_final[['pickup_longitude','pickup_latitude',
              'dropoff_longitude','dropoff_latitude','passenger_count'
             ,'Distance_Travelled','hour','day','month','weekday','year']]
y=  df_final['fare_amount']
X =preprocessing.scale(X)

In [9]:
start_time = time.time()
# Randomly splitting the original dataset into training set and testing set
# "test_size=0.25" means that pick 30% of data samples for testing set, and the rest (75%) for training set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

print('%s seconds' % (time.time() - start_time))

4.087108373641968 seconds


In [10]:
start_time = time.time()

# Importing the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 30 decision trees
rf = RandomForestRegressor(n_estimators = 30,bootstrap = True, random_state = 42)
# Train the model on training data
rf.fit(X_train, y_train);
print('%s seconds' % (time.time() - start_time))

4541.521301984787 seconds


In [21]:
from sklearn import metrics
# Testing on the testing set:
rf_pred= rf.predict(X_test)

# Calculating MAE MSE RMSE values 
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, rf_pred))
# Calculate mean absolute percentage error (MAPE)
# mape = 100 * (metrics.mean_absolute_error(y_test, rf_pred)/ y_test)
# Calculate and display accuracy
# accuracy = 100 - np.mean(mape)
# print('Accuracy:', round(accuracy, 2), '%.')
print('Mean Squared Error:', metrics.mean_squared_error(y_test, rf_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

Mean Absolute Error: 1.6322588837543093
Mean Squared Error: 13.676028679634898
Root Mean Squared Error: 3.6981115017850525


In [22]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (metrics.mean_absolute_error(y_test, rf_pred)/ y_test)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 79.18 %.


In [26]:
# Predicting on Test cvs data and keeping same feature as feature matrix
rf_pred= rf.predict(test_df.drop(['key','pickup_datetime'], axis=1))

In [27]:
# Creating a dataframe for our prediction and then saving it to a cvs file
result = pd.DataFrame({"key":test_df["key"], "fare_amount": rf_pred},
                         columns = ['key', 'fare_amount'])
result.to_csv (r"C:\Users\Rj\fare.csv", index = False, header=True) 
print("done")

done


In [28]:
# checking the mean of fare_amount to check the correctness of the model
# more the rows we train, model will be more accurate
df = pd.read_csv(r"C:\Users\Rj\fare.csv")
mean = df['fare_amount'].mean()
print(mean)

88.18065718512594
