# Reading data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.shape

(1458644, 11)

In [4]:
data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [6]:
data.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

# Understand Data With Descriptive Statistics

In [5]:
types = data.dtypes
print(types)

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object


In [9]:
#statistical summary
pd.set_option("display.width", 130)
pd.set_option('precision', 3)
description = data.describe()
print(description)

       vendor_id  passenger_count  pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  trip_duration
count  1.459e+06        1.459e+06         1.459e+06        1.459e+06          1.459e+06         1.459e+06      1.459e+06
mean   1.535e+00        1.665e+00        -7.397e+01        4.075e+01         -7.397e+01         4.075e+01      9.595e+02
std    4.988e-01        1.314e+00         7.090e-02        3.288e-02          7.064e-02         3.589e-02      5.237e+03
min    1.000e+00        0.000e+00        -1.219e+02        3.436e+01         -1.219e+02         3.218e+01      1.000e+00
25%    1.000e+00        1.000e+00        -7.399e+01        4.074e+01         -7.399e+01         4.074e+01      3.970e+02
50%    2.000e+00        1.000e+00        -7.398e+01        4.075e+01         -7.398e+01         4.075e+01      6.620e+02
75%    2.000e+00        2.000e+00        -7.397e+01        4.077e+01         -7.396e+01         4.077e+01      1.075e+03
max    2.000e+00        9.000e+0

## remove unwanted columns from the dataframe

In [14]:
fields_to_drop = ['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime']
data_without_unwanted_fields = data.drop(fields_to_drop, axis=1)

In [15]:
data_without_unwanted_fields.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

## change the 'store_and_fwd_flag' field values from Y and N to 1 and 0

In [29]:
data_without_unwanted_fields['store_and_fwd_flag'] = data_without_unwanted_fields['store_and_fwd_flag'].map({'Y': 1, 'N': 0})

In [30]:
data_without_unwanted_fields['store_and_fwd_flag'][0:10]

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: store_and_fwd_flag, dtype: int64

## convert the values of data frame into numpy array to make it work better with the model

In [31]:
import numpy as np

In [32]:
array = data_without_unwanted_fields.values

In [33]:
array.shape

(1458644, 7)

## separate data into input and output components

In [36]:
X = array[:,0:6]
Y = array[:,6]

In [37]:
print(X.shape)
print(Y.shape)
print(type(X))
print(type(Y))

(1458644, 6)
(1458644,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## rescale data between 0 and 1

In [38]:
from sklearn.preprocessing import MinMaxScaler

In [39]:
scaler = MinMaxScaler(feature_range=(0,1))
rescaledX = scaler.fit_transform(X)

In [41]:
print(rescaledX[:9])

[[0.11111111 0.79130228 0.3657382  0.79159134 0.73122178 0.        ]
 [0.11111111 0.79133098 0.36406178 0.79101622 0.72828729 0.        ]
 [0.11111111 0.7913539  0.36551003 0.79091965 0.726493   0.        ]
 [0.11111111 0.7908421  0.36300063 0.79080521 0.72620608 0.        ]
 [0.11111111 0.79145248 0.36718057 0.79145449 0.73266286 0.        ]
 [0.66666667 0.79129069 0.36426905 0.79113835 0.72982326 0.        ]
 [0.44444444 0.79151908 0.3651619  0.79171057 0.7312468  0.        ]
 [0.11111111 0.7915148  0.3674414  0.79228708 0.73079221 0.        ]
 [0.11111111 0.79101635 0.36405242 0.79124222 0.72842896 0.        ]]


## standardize data (0 mean, 1 stdev)

In [42]:
from sklearn.preprocessing import StandardScaler

In [43]:
std_scaler = StandardScaler().fit(rescaledX)
standardizedX = std_scaler.transform(rescaledX)

In [45]:
print(standardizedX[0:5])

[[-0.50563722 -0.12226117  0.51749362  0.12436886  0.38457476 -0.07447137]
 [-0.50563722 -0.09772722 -0.37581901 -0.36897023 -0.57530291 -0.07447137]
 [-0.50563722 -0.07814311  0.39591029 -0.45180541 -1.16221959 -0.07447137]
 [-0.50563722 -0.51555758 -0.94127431 -0.54997644 -1.25607098 -0.07447137]
 [-0.50563722  0.00611164  1.28609052  0.00697403  0.85595749 -0.07447137]]


## normalize data:
#### Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 (called a unit norm or a vector with the length of 1 in linear algebra).

In [49]:
from sklearn.preprocessing import Normalizer

In [50]:
norm_scaler = Normalizer().fit(standardizedX)
normalizedX = norm_scaler.transform(standardizedX)

In [51]:
print(normalizedX[0:5])

[[-0.60121343 -0.14537114  0.61531095  0.14787723  0.45726759 -0.08854805]
 [-0.53928141 -0.10422982 -0.40082534 -0.39352085 -0.61358253 -0.07942656]
 [-0.35943615 -0.05554864  0.28143591 -0.32116939 -0.82617284 -0.05293855]
 [-0.27863134 -0.28409795 -0.51868911 -0.30306446 -0.69215779 -0.04103744]
 [-0.31072816  0.00375577  0.79033847  0.00428574  0.52600973 -0.04576473]]


## run algorithm

In [52]:
#imports
from sklearn.linear_model import LinearRegression

In [54]:
# Create linear regression object
regr = LinearRegression()

In [57]:
# Train the model using the training sets
import time
start_time = time.time()
regr.fit(normalizedX, Y)
print("Time taken to train the model: {0} seconds".format(time.time() - start_time))

Time taken to train the model: 0.3767664432525635 seconds


In [72]:
# get test data

def prepare_test_data():
    test_data = pd.read_csv("test.csv")
    fields_to_drop = ['id', 'vendor_id', 'pickup_datetime']
    test_data_without_unwanted_fields = test_data.drop(fields_to_drop, axis=1)
    test_data_without_unwanted_fields['store_and_fwd_flag'] = test_data_without_unwanted_fields['store_and_fwd_flag'].map({'Y': 1, 'N': 0})
    array = test_data_without_unwanted_fields.values

    X = array[:,0:6]
    
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(0,1))
    rescaledX = scaler.fit_transform(X)

    from sklearn.preprocessing import StandardScaler
    std_scaler = StandardScaler().fit(rescaledX)
    standardizedX = std_scaler.transform(rescaledX)

    from sklearn.preprocessing import Normalizer
    norm_scaler = Normalizer().fit(standardizedX)
    normalizedX = norm_scaler.transform(standardizedX)
    
    return normalizedX

In [73]:
test_data = prepare_test_data()

In [74]:
# Make predictions using the testing set
y_pred = regr.predict(test_data)

In [76]:
print('Coefficients: \n', regr.coef_)

Coefficients: 
 [  79.23224744  803.86679351 -320.8432865   391.29797885 -193.50972533
  400.72556269]


In [77]:
print(len(y_pred))
print(y_pred[:10])

625134
[ 934.86327628 1479.10705546  960.04265103 1043.83981884  938.21584609
  673.07850427 1336.16516267 1196.11627718 1061.09054555  562.03547077]


# submission

In [78]:
submission_file = pd.read_csv('sample_submission_test.csv')

In [79]:
submission_file.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [82]:
submission_file['trip_duration'] = y_pred

In [84]:
submission_file['trip_duration'][:10]

0     934.863
1    1479.107
2     960.043
3    1043.840
4     938.216
5     673.079
6    1336.165
7    1196.116
8    1061.091
9     562.035
Name: trip_duration, dtype: float64

In [87]:
submission_file.to_csv('AmanpreetSingh1.csv', index=False)