In [2]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
import sklearn.metrics as metrics
from scipy.stats import sem

# Classifiers/Regressors
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression

In [3]:
# Load the cleaned trip data, capped at the first two million rows.
# NOTE: the preview shows an `Unnamed: 0` column (looks like a row index that
# was written into the CSV -- TODO confirm); it is not part of the feature list
# used for modelling further down.
csv_path = 'dataset/clean/cleaned_data3.csv'
row_limit = 2000000
df = pd.read_csv(csv_path, nrows=row_limit)
df.head()

Unnamed: 0,passenger_count,trip_distance,fare_amount,trip_time,day_of_week,pickup_hour,month,year,season,tip_percentage,PRCP,SNWD,SNOW,TMAX,TMIN,pickup_area,dropoff_area
0,1,6.11,18.0,12,2,0,7,0,2,0,1,0,0,82,69,848420,882916
1,1,0.96,5.0,3,2,0,7,0,2,0,1,0,0,82,69,693643,688712
2,1,4.17,15.0,15,2,0,7,0,2,5,1,0,0,82,69,637025,585371
3,2,1.45,7.0,6,2,0,7,0,2,3,1,0,0,82,69,860706,902497
4,1,0.55,5.0,5,2,0,7,0,2,6,1,0,0,82,69,868097,868097


In [4]:
# Frequency of each tip_percentage code as loaded, before it is binarised.
df['tip_percentage'].value_counts()

3    903047
0    300004
4    294245
2    179600
5    145261
1    116592
6     38306
7     20341
8      2604
Name: tip_percentage, dtype: int64

In [5]:
def change_tip_grouping(row):
    """Map a single row's ``tip_percentage`` code to a binary label.

    Returns 1 when ``row.tip_percentage`` is 4 or higher and 0 otherwise.
    According to the original author's note, 0 means a tip below 20% and 1 a
    tip above 20% (the mapping from codes to percentages is defined outside
    this notebook -- TODO confirm).

    Kept for row-wise use; the column itself is converted below with the
    equivalent vectorised comparison.
    """
    return int(row.tip_percentage >= 4)

# Vectorised equivalent of df.apply(change_tip_grouping, axis=1): a single
# column-wide comparison instead of one Python call per row.
# NOTE: the column is overwritten in place, so re-running this cell on the
# already binarised values (0/1, all below 4) turns every label into 0.
# Re-run the load cell first if this cell has to be executed again.
df.tip_percentage = (df.tip_percentage >= 4).astype(int)

In [6]:
# Class balance of the binarised target (0 vs. 1).
df['tip_percentage'].value_counts()

0    1499243
1     500757
Name: tip_percentage, dtype: int64

In [8]:
# Columns used as model inputs: pickup/drop-off areas, trip attributes,
# calendar fields and the weather columns.
features = [
    'pickup_area', 'dropoff_area',
    'passenger_count', 'trip_distance', 'fare_amount', 'trip_time',
    'day_of_week', 'pickup_hour', 'month', 'season',
    'PRCP', 'SNWD', 'SNOW', 'TMAX', 'TMIN',
]
# Column the models are trained to predict.
target = 'tip_percentage'

x = df[features]
y = df[target]

# Hold-out split with the library's default proportions and a fixed seed so
# the same rows land in train/test on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)

In [19]:
# Metrics

# Train and gather reporting data for Classification
def train_and_evaluate_classification(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print a classification summary.

    Prints, in order: accuracy on the training split, accuracy on the test
    split, the per-class classification report and the confusion matrix.

    Parameters:
        clf: estimator exposing ``fit``, ``predict`` and ``score``; it is
            fitted in place.
        X_train, y_train: features and labels used for fitting.
        X_test, y_test: held-out features and labels used for reporting.

    Returns:
        None. All results are written to standard output.
    """
    clf.fit(X_train, y_train)
    # Predict the test split once and reuse it for every test-set metric.
    y_pred = clf.predict(X_test)

    print("Accuracy on training set: %s" % (clf.score(X_train, y_train),))
    # accuracy_score on the cached predictions yields the same number that
    # clf.score(X_test, y_test) reports for a classifier, without running a
    # second prediction pass over the test split.
    print("Accuracy on testing set_: %s" % (metrics.accuracy_score(y_test, y_pred),))

    print("Classification Report:")
    print(metrics.classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(metrics.confusion_matrix(y_test, y_pred))
    
# Train and gather reporting data for Regression
def train_and_evaluate_regression(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and print regression metrics.

    Prints, in order: the estimator's own ``score`` on the test split, the
    mean of the squared residuals, the mean absolute error, the mean squared
    error and its square root.

    Parameters:
        clf: estimator exposing ``fit``, ``predict`` and ``score``; it is
            fitted in place.
        X_train, y_train: features and targets used for fitting.
        X_test, y_test: held-out features and targets used for reporting.

    Returns:
        None. All results are written to standard output.
    """
    clf.fit(X_train, y_train)
    # Predict the test split once; every metric below reuses these values.
    y_pred = clf.predict(X_test)

    # Whatever the estimator's score() reports (scikit-learn documents this as
    # R^2 for its regressors); 1 is a perfect prediction.
    print('Variance score: %.2f' % clf.score(X_test, y_test))
    # NOTE: despite the label this is the MEAN of the squared residuals, i.e.
    # the same quantity as the "Mean Squared Error" line below. The label is
    # kept unchanged so output stays comparable with earlier runs.
    print('Residual sum of squares: %.2f' % np.mean((y_pred - y_test) ** 2))

    # Mean Absolute Error
    print('Mean Absolute Error: %.2f' % metrics.mean_absolute_error(y_test, y_pred))
    # Mean Squared Error, computed once and reused for the root below
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('Mean Squared Error: %.2f' % mse)
    # Root Mean Squared Error
    print('Root Mean Squared Error: %.2f' % np.sqrt(mse))

# Perform K fold cross validation
def evaluate_cross_validation(clf, X, y, K):
    """Run shuffled K-fold cross-validation for ``clf`` and print the scores.

    Folds are built with ``KFold(len(y), K, shuffle=True, random_state=0)``,
    so the fold assignment is the same on every call. Each fold is scored by
    ``cross_val_score`` with its default scoring.

    Parameters:
        clf: estimator passed to ``cross_val_score``.
        X: feature matrix covering all samples.
        y: targets; ``len(y)`` determines how many samples are split.
        K: number of folds.

    Returns:
        None. Prints a header, the per-fold scores, and their mean together
        with the standard error of the mean.
    """
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)

    # The header reports the fold count actually used instead of a fixed "5".
    print("{0} cross validation".format(K))
    print(scores)
    # Format first, then print: calling .format on the result of print(...)
    # fails on Python 3, where print returns None.
    print("Mean score: {0:.3f} (+/-{1:.3f})".format(np.mean(scores), sem(scores)))
    
    
# Perform analysis on the classifier
def perform_classification_analysis(clf):
    """Evaluate ``clf`` on the hold-out split, then with 5-fold cross-validation.

    Reads the notebook-level ``x_train``/``x_test``/``y_train``/``y_test``
    split for the hold-out report and the full ``x``/``y`` for the
    cross-validation run.
    """
    train_and_evaluate_classification(
        clf,
        X_train=x_train,
        X_test=x_test,
        y_train=y_train,
        y_test=y_test,
    )
    evaluate_cross_validation(clf, X=x, y=y, K=5)

def perform_regression_analysis(clf):
    """Evaluate ``clf`` on the hold-out split, then with 5-fold cross-validation.

    Reads the notebook-level ``x_train``/``x_test``/``y_train``/``y_test``
    split for the hold-out report and the full ``x``/``y`` for the
    cross-validation run.
    """
    train_and_evaluate_regression(
        clf,
        X_train=x_train,
        X_test=x_test,
        y_train=y_train,
        y_test=y_test,
    )
    evaluate_cross_validation(clf, X=x, y=y, K=5)

In [13]:
# RANDOM FOREST CLASSIFIER
# 50 trees. random_state is pinned (same seed as the train/test split and the
# cross-validation folds) so the scores below can be reproduced on a re-run;
# without it every run grows a different forest.
forest = RandomForestClassifier(n_estimators=50, random_state=0)

perform_classification_analysis(forest)

Accuracy on training set: 0.999661333333
Accuracy on testing set_: 0.755232
Classification Report:
             precision    recall  f1-score   support

          0       0.77      0.97      0.86    375005
          1       0.55      0.12      0.20    124995

avg / total       0.71      0.76      0.69    500000

Confusion Matrix:
[[362375  12630]
 [109754  15241]]
5 cross validation
[ 0.755385  0.754005  0.75461   0.75521   0.754105]
Mean score: 0.755 (+/-0.000)


In [18]:
# DECISION TREE CLASSIFIER
# Entropy splits, capped at 1000 leaves. Uses the DecisionTreeClassifier name
# imported at the top of the notebook (like the other model cells) and pins
# random_state so the fitted tree and its scores are reproducible.
dt = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=1000, random_state=0)

perform_classification_analysis(dt)

Accuracy on training set: 0.765200666667
Accuracy on testing set_: 0.76339
Classification Report:
             precision    recall  f1-score   support

          0       0.77      0.97      0.86    375005
          1       0.60      0.16      0.25    124995

avg / total       0.73      0.76      0.71    500000

Confusion Matrix:
[[362268  12737]
 [105568  19427]]
5 cross validation
[ 0.7642725  0.76297    0.763835   0.7631     0.763465 ]
Mean score: 0.764 (+/-0.000)


In [15]:
# K NEAREST NEIGHBORS CLASSIFIER
# Vote among the 20 nearest neighbours, every neighbour weighted equally.
knnclf = KNeighborsClassifier(
    weights='uniform',
    n_neighbors=20,
)

perform_classification_analysis(knnclf)

Accuracy on training set: 0.756902
Accuracy on testing set_: 0.747704
Classification Report:
             precision    recall  f1-score   support

          0       0.76      0.98      0.85    375005
          1       0.47      0.06      0.11    124995

avg / total       0.68      0.75      0.67    500000

Confusion Matrix:
[[366054   8951]
 [117197   7798]]
5 cross validation
[ 0.7479925  0.7466725  0.7470375  0.7469425  0.746235 ]
Mean score: 0.747 (+/-0.000)


In [16]:
# NAIVE BAYES CLASSIFIER
# Gaussian naive Bayes with its default settings, run through the same
# hold-out report and 5-fold cross-validation as the other classifiers.
bayes = GaussianNB()

perform_classification_analysis(bayes)

Accuracy on training set: 0.747794
Accuracy on testing set_: 0.748254
Classification Report:
             precision    recall  f1-score   support

          0       0.75      1.00      0.86    375005
          1       0.26      0.00      0.01    124995

avg / total       0.63      0.75      0.64    500000

Confusion Matrix:
[[373638   1367]
 [124506    489]]
5 cross validation
[ 0.7485275  0.747995   0.7478725  0.7486525  0.7471675]
Mean score: 0.748 (+/-0.000)


In [20]:
# LINEAR REGRESSION
# Ordinary linear regression with default settings. The target passed in by
# perform_regression_analysis is the binarised 0/1 tip_percentage column, so
# the regression metrics below are measured against that binary label.
linreg = LinearRegression()

perform_regression_analysis(linreg)

Variance score: 0.01
Residual sum of squares: 0.19
Mean Absolute Error: 0.37
Mean Squared Error: 0.19
Root Mean Squared Error: 0.43
5 cross validation
[ 0.0051262   0.00521547  0.0052033   0.00493357  0.00525286]
Mean score: 0.005 (+/-0.000)


In [21]:
# KNN REGRESSION
# Average the target over the 20 nearest neighbours, every neighbour weighted
# equally.
knnreg = KNeighborsRegressor(
    weights='uniform',
    n_neighbors=20,
)

perform_regression_analysis(knnreg)

Variance score: 0.00
Residual sum of squares: 0.19
Mean Absolute Error: 0.36
Mean Squared Error: 0.19
Root Mean Squared Error: 0.43
5 cross validation
[ 0.00340432  0.0023052   0.00173741  0.00164286  0.00162464]
Mean score: 0.002 (+/-0.000)
