In [14]:
# import needed libraries
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to
import numpy as np
import datetime as dt # handle datetime
import matplotlib,math
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb
import sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.isotonic import IsotonicRegression
from sklearn.preprocessing import scale 
from collections import Counter 
from matplotlib.collections import LineCollection
from sklearn.utils import check_random_state
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.datasets import make_regression

In [15]:
#read the first few lines of the training set, prepare testing set for submission, and validation set for make shift test  
# Set the number of rows for training here:
training_size = 5000000

train_df =  pd.read_csv('../all/train.csv',nrows = training_size) #100,000 # 5000000 data points
test_df = pd.read_csv('../all/test.csv')
validation_df = pd.read_csv('../all/train.csv',skiprows=training_size,nrows=len(test_df))
validation_df.columns = ["key","fare_amount","pickup_datetime","pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude","passenger_count"]

# change 'pickup_datetime' to pandas DF for future processing
train_df['pickup_datetime'] = pd.to_datetime(train_df.pickup_datetime)
test_df['pickup_datetime'] = pd.to_datetime(test_df.pickup_datetime)
validation_df['pickup_datetime'] = pd.to_datetime(validation_df.pickup_datetime)

In [16]:
# create new features from the provided dataframe
def feature_generator(df):
    df['year'] = df.pickup_datetime.dt.year
    df['month'] = df.pickup_datetime.dt.month
    df['day'] = df.pickup_datetime.dt.day
    df['hour'] = df.pickup_datetime.dt.hour
    df['minute'] = df.pickup_datetime.dt.minute
    df['week'] = df.pickup_datetime.dt.week
    df['weekofyear'] = df.pickup_datetime.dt.weekofyear
    df['dayofyear'] = df.pickup_datetime.dt.dayofyear
    df['quarter'] = df.pickup_datetime.dt.quarter
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()
    df['initial_fair'] = 3.5 # add a column for initial_fair

# Update the train dataframe with the new features
feature_generator(train_df)

# create new features from the provided dataframe
def normalize_features(df):
    df['month'] = (df['month'] - df['month'].mean()) / (df['month'].max() - df['month'].min())
    df['day'] = (df['day'] - df['day'].mean()) / (df['day'].max() - df['day'].min())
    df['hour'] = (df['hour'] - df['hour'].mean()) / (df['hour'].max() - df['hour'].min())
    df['minute'] = (df['minute'] - df['minute'].mean()) / (df['minute'].max() - df['minute'].min())  
    df['abs_diff_longitude'] = (df['abs_diff_longitude'] - df['abs_diff_longitude'].mean()) / (df['abs_diff_longitude'].max() - df['abs_diff_longitude'].min())
    df['abs_diff_latitude'] = (df['abs_diff_latitude'] - df['abs_diff_latitude'].mean()) / (df['abs_diff_latitude'].max() - df['abs_diff_latitude'].min())

# Update the train dataframe with the new features
feature_generator(train_df)

In [17]:
### clean the data: ###
train_df.isnull().sum()
train_df.dropna(axis=0, how='any', inplace=True)
train_df.passenger_count.value_counts()
train_df = train_df[(train_df.passenger_count>0)&(train_df.passenger_count<10)]
train_df = train_df[(train_df.abs_diff_longitude < 5.0) & (train_df.abs_diff_latitude < 5.0)]
train_df.reset_index(drop=True, inplace=True)

# Outlier elimination: 

outliers = []
# For each feature find the data points with extreme high or low values
for feature in ['abs_diff_longitude','abs_diff_latitude']:
    Q1 = np.percentile(train_df[feature],25,axis=0)
    Q3 = np.percentile(train_df[feature],75,axis=0)
    step = 10*(Q3-Q1)
    feature_outlier = train_df[~((train_df[feature] >= Q1 - step) & (train_df[feature] <= Q3 + step))]
    outliers += feature_outlier.index.tolist()


# Drop outliers
train_df = train_df.drop(train_df.index[outliers]).reset_index(drop = True)

# drop unwanted features now after pre-processing: 
# drop the pickup_datetime as it is no longer needed
train_df=train_df.drop(['pickup_datetime'],axis=1)
# drop the key as it is no longer needed
train_df=train_df.drop(['key'],axis=1)

# normalize features
normalize_features(train_df)

In [18]:
# create predictors feature set and the target outcome:
RegPredictors = train_df.iloc[:,1:18].values # Predictors are simply the features
RegTarget = train_df.iloc[:,0].values # Target is the fare amount

# create testing features and load test_RegPredictors
feature_generator(test_df)
normalize_features(test_df)
test_RegPredictors = test_df.iloc[:,2:19].values

# prepare validation set and make it look like and load test_RegPredictors
# save fare amount for validation
validation_fare_amount = validation_df.iloc[:,1].values
validation_df=validation_df.drop(['fare_amount'],axis=1)
feature_generator(validation_df)
normalize_features(validation_df)
validation_RegPredictors = test_df.iloc[:,2:19].values

In [19]:
# # test regression models here: 

# SVR_Reg = SVR(C=1.0, epsilon=0.2)
# SVR_Reg.fit(RegPredictors, RegTarget)
# RegTarget_predictions = SVR_Reg.predict(test_RegPredictors).round(decimals = 2)

RF_Reg = RandomForestRegressor(n_jobs=-1)
RF_Reg.fit(RegPredictors, RegTarget)
RegTarget_predictions = RF_Reg.predict(test_RegPredictors)
m = RF_Reg

# regr = PassiveAggressiveRegressor()
# regr.fit(RegPredictors, RegTarget)
# PassiveAggressiveRegressor(C=1.0, average=False, epsilon=0.1,fit_intercept=True, loss='epsilon_insensitive',max_iter=10000, n_iter=None, random_state=0, shuffle=True,tol=None, verbose=0, warm_start=True)
# RegTarget_predictions = regr.predict(test_RegPredictors)
# m = regr


In [20]:
# define RMSE for testing
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m):
    res = [rmse(m.predict(RegPredictors), RegTarget), rmse(m.predict(validation_RegPredictors), validation_fare_amount),
                m.score(RegPredictors, RegTarget), m.score(validation_RegPredictors, validation_fare_amount)]
    print(res)

print_score(m)

[1.8014543651386732, 10.76678118192561, 0.960093678798785, -0.1792379658135923]


In [21]:
# [4.619374837151818, 13.321280085846855, 0.737601364373413, -0.8051845116119909] 3.71
# [1.8171518549456913, 13.21201823904891, 0.9603583437562129, -0.934987861504124] 3.85
# [1.9403859091404618, 13.084358576159007, 0.9547992571764596, -0.8977753035973798] 4.084
# [9.49306218252571, 9.844027629785927, -0.08188724075898235, -0.07420102842762133] 3.92
# [10.279379265104573, 11.238026320440355, -0.29935835655106113, -0.2847237986374469] 9.80594
# [9.785665834727306, 10.416795439651933, -0.14336128593079978, -0.15707856825155386] 0.08105
# [1.8014543651386732, 10.76678118192561, 0.960093678798785, -0.1792379658135923] 6.89398

In [22]:
# Write the predictions to a CSV file which we can submit to the competition.
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': RegTarget_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv('aLogictest_RFsubmission.csv', index = False)

print(os.listdir('.'))

['.DS_Store', 'test.csv', 'submission.csv', 'code.py', 'aLogictest_RFsubmission.csv', '__pycache__', 'test.py', 'GCP-Coupons-Instructions.rtf', 'test_RFsubmission.csv', 'train.csv', '.ipynb_checkpoints', 'Taxi.ipynb', 'sample_submission.csv', 'DataClean.ipynb']
