# Data Science Challenge - Model Creation
By Shyam Balagurumurthy Viswanathan

## Q4 - Build a predictive model for tip as a percentage of the total fare

As part of the challenge, we will use data collected by the New York City Taxi and Limousine Commission about "Green" Taxis, taken from the NYC Taxi and Limousine trip record data (http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml).

We will build a model to predict the 'Tip_percentage' provided on each trip.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import Imputer, StandardScaler
from sklearn.pipeline import Pipeline
import dill
import os
import geopy.distance as geo
from sklearn.metrics import mean_squared_error, r2_score
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import ElasticNet
import json
import ast

In [2]:
def calculate_distance(x):
    """Compute the great-circle distance, in miles, between pickup and dropoff.

    ``x`` is any record exposing ``Pickup_latitude``, ``Pickup_longitude``,
    ``Dropoff_latitude`` and ``Dropoff_longitude`` attributes, e.g. a row
    handed over by ``DataFrame.apply(..., axis=1)``.
    """
    pickup_point = (x.Pickup_latitude, x.Pickup_longitude)
    dropoff_point = (x.Dropoff_latitude, x.Dropoff_longitude)
    return geo.great_circle(pickup_point, dropoff_point).miles

In [3]:
class DataPrep(BaseEstimator, TransformerMixin):
    """Pipeline step that derives pickup-time features and casts categoricals.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new DataFrame. The list of columns to discard is read from the
    module-level ``configstore['drop_cols']``.
    """

    # Allowed levels for each categorical column. Any value outside its list
    # becomes NaN when the column is cast, which is how unexpected codes get
    # flagged as missing for the later imputation step.
    _CATEGORY_LEVELS = (
        ('VendorID', [1, 2]),
        ('RateCodeID', [1, 2, 3, 4, 5, 6]),
        ('Store_and_fwd_flag', ['Y', 'N']),
        ('Payment_type', [1, 2, 3, 4, 5, 6]),
        ('Trip_type', [1, 2]),
    )

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X):
        """Perform feature engineering and mark invalid categories as NaN.

        Returns a new DataFrame; the frame passed in is left untouched
        because ``drop`` without ``inplace`` produces a new object.
        """
        # Drop the column as it is NA
        X = X.drop('Ehail_fee', axis=1)

        X['dates'] = pd.to_datetime(X.lpep_pickup_datetime)
        # Day of ride
        X['day'] = X['dates'].dt.day
        # Hour of ride
        X['hour'] = X['dates'].dt.hour
        # Day of week of ride
        X['dayofweek'] = X['dates'].dt.dayofweek

        # Convert to categorical variables and convert unknown values to NaN.
        # astype('category', categories=...) is no longer accepted by pandas,
        # so an explicit CategoricalDtype is built for each column instead.
        for column, levels in self._CATEGORY_LEVELS:
            dtype = pd.api.types.CategoricalDtype(categories=levels, ordered=False)
            X[column] = X[column].astype(dtype)

        # Drop unnecessary columns
        X.drop(configstore['drop_cols'], axis=1, inplace=True)

        print("Data preparation completed successfully")

        return X


In [4]:
class DataFrameImputer(BaseEstimator, TransformerMixin):
    """Median-impute the configured categorical columns, then one-hot encode.

    The columns to impute are read from the module-level
    ``configstore['categorical_cols']``. Inheriting from ``BaseEstimator``
    supplies ``get_params``/``set_params`` so the step can be cloned when the
    pipeline is wrapped in ``GridSearchCV``.
    """

    def __init__(self):
        """Create the underlying median imputer."""
        # NOTE(review): ``Imputer`` was removed in scikit-learn 0.22 in favour
        # of ``sklearn.impute.SimpleImputer``; switching requires changing the
        # file-level import as well.
        self.model = Imputer(strategy='median')

    def fit(self, X, y=None):
        """Fit the imputer on the configured categorical columns of ``X``."""
        self.model.fit(X[configstore['categorical_cols']])
        return self

    def transform(self, X, y=None):
        """Return a new frame with imputed categoricals expanded to dummies."""
        categorical_cols = configstore['categorical_cols']

        # Work on a copy so the caller's DataFrame is not modified in place.
        X = X.copy()
        X[categorical_cols] = self.model.transform(X[categorical_cols])

        X_new = pd.get_dummies(
            X,
            drop_first=True,
            prefix_sep='_',
            columns=['VendorID', 'Store_and_fwd_flag', 'RateCodeID', 'Payment_type', 'Trip_type'],
        )

        print("Data Imputation on categorical variables completed successfully")
        return X_new


In [5]:
# Script entry point: load the dataset and config, clean the training split,
# fit the pipeline, then persist the model and report test metrics.
if __name__ == "__main__":

    warnings.filterwarnings('ignore')
    # Get year, month
    year = str(input("Enter the year(4 digits): ")).zfill(4)
    month = str(input("Enter the month number: ")).zfill(2)

    url = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_' + year + '-' + month + '.csv'

    try:
        df = pd.read_csv(url, sep=',')
    except Exception:
        # Any download/parse failure falls back to the default month.
        # ``Exception`` (not a bare except) lets Ctrl-C still abort the run.
        print("Error in inputs. Downloading default dataset of year 2015 and September month.")
        df = pd.read_csv('https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2015-09.csv', sep=',')

    # ``configstore`` must stay a module-level name: DataPrep and
    # DataFrameImputer read it as a global.
    with open('./config/config.json', 'r') as f:
        configstore = json.load(f)

    print("Dataset and Config loaded successfully")

    try:
        df.columns = configstore['all_cols']

        tip = ((df['Tip_amount'] / df['Total_amount']).round(2)) * 100
        # 0/0 yields NaN and x/0 yields +/-inf; neither is a meaningful tip
        # percentage, so both are recorded as 0.
        tip = tip.replace([np.inf, -np.inf], np.nan).fillna(0)

        df['Tip_percent'] = tip

        # Split the data between train and test datasets, either randomly or
        # in file order. ``split_size`` is the TEST fraction in both cases.
        features = df.drop(['Tip_percent'], axis=1)
        target = df['Tip_percent']

        if ast.literal_eval(configstore['normal_split']):
            X_train, X_test, y_train, y_test = train_test_split(
                features, target, test_size=configstore['split_size'], random_state=40)
        else:
            # Sequential split: the leading (1 - split_size) share of rows is
            # used for training and the trailing split_size share for testing.
            cut = int(df.shape[0] * (1 - configstore['split_size']))
            # Copy the training slice because it is modified below.
            X_train, X_test = features.iloc[:cut].copy(), features.iloc[cut:]
            y_train, y_test = target.iloc[:cut], target.iloc[cut:]

        # Release the extra references; the split results keep what they need.
        del features, target

        print("Train/Test split performed successfully")

        # Keep the target next to the features so row filters apply to both.
        X_train['Tip_percent'] = y_train
        print("Tip percentage calculated")

        if ast.literal_eval(configstore['less_totals']):
            # Totals that are not positive are removed. Tip % can't be calculated
            X_train = X_train[X_train.Total_amount > 0]

        if ast.literal_eval(configstore['less_distance']):
            # Distance less than or equal to 0 are removed. Need valid distance for a trip
            X_train = X_train[X_train.Trip_distance > 0]

        if ast.literal_eval(configstore['outliers']):
            # Distance calculated from latitude/longitude for trips whose
            # recorded distance is 0 and whose coordinates are present.
            missing_coords = (X_train.Pickup_longitude == 0) | (X_train.Dropoff_longitude == 0)
            needs_distance = (X_train.Trip_distance == 0) & ~missing_coords
            # Skip when nothing matches: apply() on an empty selection does
            # not reliably return a Series.
            if needs_distance.any():
                computed = X_train[needs_distance].apply(calculate_distance, axis=1)
                new_trip_distance = pd.Series(computed.round(2), name='Trip_distance')
                X_train.update(new_trip_distance)

        if ast.literal_eval(configstore['correct_distance']):
            # Removing the outliers which are wrong data for this dataset.
            # NOTE(review): the threshold statistics were never defined in
            # this script; they are assumed to be the mean and standard
            # deviation of the training Trip_distance -- confirm.
            trip_distance = X_train['Trip_distance']
            threshold = trip_distance.mean() + 100 * trip_distance.std()
            outlier_index = X_train.index[(trip_distance >= threshold).values]
            if len(outlier_index) > 0:
                X_train = X_train.drop(outlier_index)
                print('Records more than thresold miles are dropped')
            else:
                print('No records more than threshold miles')

        # Separate the target again after cleaning the training dataset
        y_train = X_train['Tip_percent']
        X_train = X_train.drop(['Tip_percent'], axis=1)

        # Steps to be performed for pipeline
        steps = [('clean_dataset', DataPrep()),
                 ('imputing', DataFrameImputer()),
                 ('scaler', StandardScaler()),
                 ('regr', RandomForestRegressor())]

        # Grid search parameters. They are kept for cross-validated tuning,
        # which is not run here due to memory limitations; to enable it, fit
        # GridSearchCV(pl, parameters, cv=configstore['CV'], n_jobs=-1)
        # instead of the bare pipeline.
        parameters = [{'regr': [RandomForestRegressor()], 'regr__n_estimators': [30], 'regr__random_state': [40]}]

        # Sklearn pipeline
        pl = Pipeline(steps)

        # Fitting the model using pipeline
        pl.fit(X_train, y_train)

        # Performing prediction using the pipeline
        y_pred = pl.predict(X_test)

        # Store the model for future usage
        os.makedirs('./generated_files', exist_ok=True)

        with open("./generated_files/model.pkl", 'wb') as model_file:
            dill.dump(pl, model_file)
        print("Model output file stored as Pickle!")
        print("Mean Squared Error: {}".format(mean_squared_error(y_test, y_pred)))
        print("R-squared: {}".format(r2_score(y_test, y_pred)))
    except Exception as exc:
        # Still best-effort, but report what went wrong instead of hiding it.
        print("Sorry, error while execution.")
        print("Details: {}: {}".format(type(exc).__name__, exc))


Enter the year(4 digits): 2015
Enter the month number: 9
Dataset and Config loaded successfully
Train/Test split performed successfully
Tip percentage calculated
No records more than threshold miles
Data preparation completed successfully
Data Imputation on categorical variables completed successfully
Data preparation completed successfully
Data Imputation on categorical variables completed successfully
Model output file stored as Pickle!
Mean Squared Error: 0.01212023305439051
R-squared: 0.9998478634304127


### Summary of Question 4:

1. Derived variable `tip_percentage` has been built and used as the dependent variable.
2. A sklearn pipeline has been created, and all the data cleaning/transforming is performed inside it.
3. Pipeline predictive model is created using sklearn algorithms.
4. Pipeline is also created in a way to use different algorithms and run CV.
5. As the dependent variable is derived, the model provides higher weight to some of the independent variables.
6. A better way to model is to remove the tip and total amounts from the dataset after calculating the tip percentage. This can easily be performed in the above code.
7. Code has been completely modularized. It contains a config file where all the model parameters can be changed without changing the code.
8. As the dependent variable is derived, the model provides very good performance.