### Importing the Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the first two million rows of the raw training file into a DataFrame
# and preview the first rows.
nyc_train = pd.read_csv(
    "Taxifare_train.csv",
    nrows=2000000,
)
nyc_train.head(5)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


### Data Preprocessing

In [3]:
# Print a summary of the frame: number of rows, the columns with their dtypes,
# and the approximate memory usage.
nyc_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 8 columns):
key                  object
fare_amount          float64
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(5), int64(1), object(2)
memory usage: 122.1+ MB


In [4]:
# Count the missing values in each column
nyc_train.isna().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    14
dropoff_latitude     14
passenger_count       0
dtype: int64

In [5]:
# Show every row that has at least one missing value
nyc_train.loc[nyc_train.isna().any(axis="columns")]

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
120227,2012-12-11 12:57:00.00000013,12.5,2012-12-11 12:57:00 UTC,-73.99276,40.743098,,,0
245696,2013-03-21 18:07:07.0000001,86.5,2013-03-21 18:07:07 UTC,-73.991572,40.740591,,,0
340533,2012-12-11 12:50:52.00000010,27.5,2012-12-11 12:50:52 UTC,-73.979639,40.784742,,,0
428108,2011-09-08 09:12:52.0000001,11.8,2011-09-08 09:12:52 UTC,-73.987041,40.751542,,,0
471472,2012-12-11 12:34:20.0000006,7.8,2012-12-11 12:34:20 UTC,0.0,0.0,,,0
524834,2011-09-25 23:01:12.0000005,14.76,2011-09-25 23:01:12 UTC,-73.985374,40.768518,,,0
574023,2013-11-04 20:59:15.0000001,10.2,2013-11-04 20:59:15 UTC,-73.99846,40.745406,,,0
580338,2012-12-11 12:00:53.0000002,21.0,2012-12-11 12:00:53 UTC,-73.974743,40.752057,,,0
794694,2013-11-04 20:07:59.0000006,7.2,2013-11-04 20:07:59 UTC,-73.977048,40.787565,,,0
895400,2011-06-20 11:34:44.0000001,40.0,2011-06-20 11:34:44 UTC,-73.8629,40.7689,,,0


In [6]:
# Remove, in place, every row that contains a missing value
nyc_train.dropna(axis="index", how="any", inplace=True)

In [7]:
# Remove the "key" column in place
nyc_train.drop(columns=['key'], inplace=True)

In [None]:
# Convert the "pickup_datetime" column from strings to datetimes.
# The values look like "2009-06-15 17:26:21 UTC". Passing the exact format lets
# pandas use its fast fixed-format parser instead of inferring the layout, which
# matters with two million rows. utc=True keeps the result timezone-aware (UTC).
# A value that does not match the format raises instead of being guessed at.
nyc_train['pickup_datetime'] = pd.to_datetime(
    nyc_train['pickup_datetime'],
    format='%Y-%m-%d %H:%M:%S UTC',
    utc=True,
)
nyc_train.info()

In [None]:
import datetime,datetime as dt    #from datetime import datetime as dt

In [None]:
#nyc_train['pickup_datetime'].dt.day

In [None]:
# Split 'pickup_datetime' into one numeric column per component.
# The tuple order fixes the order in which the new columns are appended
# (day, month, year, hour, minute, second); later cells rely on that order.
for date_part in ('day', 'month', 'year', 'hour', 'minute', 'second'):
    nyc_train[date_part] = getattr(nyc_train['pickup_datetime'].dt, date_part)

In [None]:
# Renumber the rows 0..n-1 after the earlier row removal. drop=True discards
# the old index directly instead of first turning it into an "index" column
# that then has to be removed again.
nyc_train.reset_index(drop=True, inplace=True)

In [None]:
# Empty list that will hold one distance value per trip
temp = list()

In [None]:
# Calculating the Haversine (great-circle) distance, in kilometres, between the
# pickup and drop-off points; it is used as a feature to predict the taxi fare.
#
# The formula is applied to whole columns with NumPy instead of calling a
# function once per row in a Python loop, which is far faster on millions of
# rows. `temp` is rebuilt from scratch, so re-running this cell cannot append
# a second copy of the distances.
# NOTE: coordinates are not range-checked here; out-of-range latitudes or
# longitudes still produce a number.
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius used by the `haversine` package

pickup_lat = np.radians(nyc_train['pickup_latitude'].values)
pickup_lon = np.radians(nyc_train['pickup_longitude'].values)
dropoff_lat = np.radians(nyc_train['dropoff_latitude'].values)
dropoff_lon = np.radians(nyc_train['dropoff_longitude'].values)

hav = (
    np.sin((dropoff_lat - pickup_lat) / 2.0) ** 2
    + np.cos(pickup_lat) * np.cos(dropoff_lat)
    * np.sin((dropoff_lon - pickup_lon) / 2.0) ** 2
)
# Clip guards against floating-point drift pushing the value just outside [0, 1],
# which would make arcsin return NaN.
hav = np.clip(hav, 0.0, 1.0)

# Kept as a plain list of floats, one entry per row, in row order.
temp = (2.0 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(hav))).tolist()


In [None]:
# Preview only the first few distances; displaying the whole sequence would
# print one value per row of the dataset.
temp[:5]

In [None]:
# Creating a new column called 'Distance' from the computed values.
# The sequence is assigned directly, so values are matched to rows by position
# and a length mismatch raises an error. Wrapping it in a DataFrame would align
# on index labels instead and silently produce NaN wherever labels differ.
nyc_train['Distance'] = temp
nyc_train.head()

#### Dropping the columns that are not required anymore

In [None]:
# Remove the raw timestamp and coordinate columns in a single call.
# The relative order of the remaining columns is unchanged.
nyc_train.drop(
    columns=[
        'pickup_datetime',
        'pickup_longitude',
        'pickup_latitude',
        'dropoff_longitude',
        'dropoff_latitude',
    ],
    inplace=True,
)

#### Converting the dataframe to csv file

In [None]:
# Write the cleaned frame to disk. The row index is written as well, as the
# first (unnamed) column, so the file has one more column than the frame.
nyc_train.to_csv("NYC.csv", index=True)

### Training the model using the cleaned dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Reload the cleaned dataset from disk and preview the first rows
nyc_train = pd.read_csv(filepath_or_buffer='NYC.csv')
nyc_train.head(5)

#### Separating the feature columns (x) from the target column (y)

In [None]:
# Feature matrix: every column except the target ('fare_amount') and the
# 'Unnamed: 0' column that holds the index saved in the CSV file.
# Selecting by name rather than position keeps this correct even if the CSV
# has no saved index column; errors='ignore' skips a name that is absent.
x = nyc_train.drop(columns=['Unnamed: 0', 'fare_amount'], errors='ignore')

In [None]:
# Target vector: the fare to predict. Selected by name so it does not depend
# on which position the column occupies after the CSV round trip.
y = nyc_train['fare_amount']

In [None]:
# Preview the first five target values
y.head(5)

#### Splitting the data into training data and test data 

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)
x_train.head(5)

#### Creating an Object for the model training

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest limited to depth-5 trees. random_state is fixed (same seed as
# the train/test split) so that the fitted model, and therefore the reported
# score, is the same on every run.
fare = RandomForestRegressor(max_depth=5, random_state=0)

#### Fit the model

In [None]:
# Train the regressor on the training split
fare.fit(X=x_train, y=y_train)

#### Predicting fares for the test data

In [None]:
# Predicted fares for the held-out test rows
y_pred = fare.predict(X=x_test)

#### Calculating the r2_score of the trained model

In [None]:
from sklearn.metrics import r2_score

In [None]:
# R² of the predictions against the true test fares
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Inspect the first training row (all feature values)
x_train.iloc[0]

In [None]:
# List the feature names in the order the model was trained on
x_train.columns

In [None]:
# Predict the fare for one hand-written trip. The values are given in the
# training column order: passenger_count, day, month, year, hour, minute,
# second, Distance. Wrapping them in a DataFrame with the training column
# names keeps the input consistent with the data the model was fitted on;
# recent scikit-learn versions warn when feature names are missing at
# prediction time.
sample_trip = pd.DataFrame(
    [[1, 5, 1, 2010, 16, 52, 16, 8.450145]],
    columns=x_train.columns,
)
fare.predict(sample_trip)