In [80]:
import tkinter
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import metrics
from sklearn import datasets
import pandas as pd
import numpy as np
#import pandas_profiling as pp
import re

## Load and Clean Data

In [183]:
# Load the raw project data from disk and report its (rows, columns) shape
csv_path = "final_project.csv"
df = pd.read_csv(csv_path)
print(df.shape)

(160000, 51)


In [184]:
# Give the three label-like columns descriptive names
label_names = {'x24': 'continent', 'x29': 'month', 'x30': 'day'}
df = df.rename(columns=label_names)

In [185]:
# Correct misspellings and standardize values in labeled columns.
# Each column is reassigned from Series.replace() instead of being mutated via
# df[col].replace(..., inplace=True): the in-place form acts on a selected
# column, which newer pandas versions warn about and which leaves df unchanged
# when Copy-on-Write is enabled.
# 'Feb' is intentionally not expanded here; the numeric recoding that follows
# maps the literal value 'Feb'.
month_fixes = {
    'Dev': 'December',
    'Aug': 'August',
    'Jun': 'June',
    'Apr': 'April',
    'Nov': 'November',
    'sept.': 'September',
    'Oct': 'October',
    'Mar': 'March',
}
df['continent'] = df['continent'].replace('euorpe', 'europe')
df['month'] = df['month'].replace(month_fixes)
df['day'] = df['day'].replace('thurday', 'thursday')

# Fill NA with 'other' in labeled columns
for label_col in ('continent', 'month', 'day'):
    df[label_col] = df[label_col].fillna('other')

# check unique values in labeled columns
print (df['continent'].unique())
print (df['month'].unique())
print (df['day'].unique())

['europe' 'asia' 'america' 'other']
['July' 'August' 'June' 'May' 'September' 'April' 'November' 'October'
 'other' 'March' 'Feb' 'December' 'January']
['tuesday' 'wednesday' 'thursday' 'monday' 'friday' 'other']


In [186]:
# Recode month labels as numeric strings ('1'..'12'; '0' for missing/'other')
# and re-apply the continent/day spelling fixes.
# A single mapping replaces the long run of df['month'].replace(..., inplace=True)
# calls: in-place replace on a selected column is deprecated in newer pandas and
# does not update df when Copy-on-Write is enabled, so every column is
# reassigned explicitly instead.
# Both abbreviated and spelled-out month names are mapped so the cell gives the
# same result whether or not the names were expanded beforehand.
month_to_number = {
    'January': '1',
    'Feb': '2',
    'Mar': '3', 'March': '3',
    'Apr': '4', 'April': '4',
    'May': '5',
    'Jun': '6', 'June': '6',
    'July': '7',
    'Aug': '8', 'August': '8',
    'sept.': '9', 'September': '9',
    'Oct': '10', 'October': '10',
    'Nov': '11', 'November': '11',
    'Dev': '12', 'December': '12',
    'other': '0',
}

# Missing labels become 'other' first; for month, 'other' is then recoded to '0'
df['continent'] = df['continent'].replace('euorpe', 'europe').fillna('other')
df['month'] = df['month'].fillna('other').replace(month_to_number)
df['day'] = df['day'].replace('thurday', 'thursday').fillna('other')

# check unique values in labeled columns
print (df['continent'].unique())
print (df['month'].unique())
print (df['day'].unique())

['europe' 'asia' 'america' 'other']
['7' '8' '6' '5' '9' '4' '11' '10' '0' '3' '2' '12' '1']
['tuesday' 'wednesday' 'thursday' 'monday' 'friday' 'other']


In [187]:
def _clean_x37(value):
    """Return one x37 entry as plain numeric text.

    Removes '$', ',' and ')' and turns '(' into '-'. Entries that are not
    strings (e.g. missing values) become '' so the resulting list stays
    aligned with df row for row.
    """
    if not isinstance(value, str):
        return ''
    # The character class lists only the symbols to strip; a '|' inside [...]
    # is a literal pipe, not alternation, so it is not included.
    return re.sub(r'[$,)]', '', value).replace('(', '-')


# Iterate over the column's values directly rather than looking up
# df['x37'][i] by label inside a bare try/except: this does not depend on the
# index being 0..len-1 and does not hide unrelated errors.
temp_x37 = [_clean_x37(value) for value in df['x37']]

In [188]:
# Confirm the cleaned list has as many entries as the column it replaces
for values in (df['x37'], temp_x37):
    print(len(values))

# Swap in the cleaned text and parse it as numbers in one step
cleaned_x37 = pd.Series(temp_x37, index=df.index)
df['x37'] = pd.to_numeric(cleaned_x37)
df['x37']

160000
160000


0         1313.96
1         1962.78
2          430.47
3        -2366.29
4         -620.66
           ...   
159995    -891.96
159996    1588.65
159997     687.46
159998     439.21
159999   -1229.34
Name: x37, Length: 160000, dtype: float64

In [189]:
# Report how many rows fall under each continent label
cont = ['asia', 'america', 'europe', 'other']

for region in cont:
    region_rows = df.loc[df['continent'] == region]
    print (region, 'length is', len(region_rows))

# One boolean mask and one filtered frame per continent
is_asia = df['continent'].eq('asia')
is_europe = df['continent'].eq('europe')
is_america = df['continent'].eq('america')
is_other = df['continent'].eq('other')

df_asia = df.loc[is_asia]
df_europe = df.loc[is_europe]
df_america = df.loc[is_america]
df_other = df.loc[is_other]

asia length is 138965
america length is 4469
europe length is 16538
other length is 28


## Explore Data

In [None]:
# Simple and fast exploratory data analysis of the America subset.
# NOTE(review): in the visible notebook `pp` is only bound by
# `import pandas_profiling as pp`, which is commented out in the import cell,
# so this call raises NameError on a fresh kernel unless that import is
# restored - confirm before running.
pp.ProfileReport(df_america)

## Logistic Regression

In [190]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# Apply the seaborn theme once: whitegrid style with color codes enabled.
# A separate sns.set(style="white") call beforehand has no lasting effect,
# because this call sets the style again.
sns.set(style="whitegrid", color_codes=True)

In [191]:
# Modelling frame built from the complete data set.
# Dropped: x41 and x6 (per EDA suggestion), plus continent, day and x32
# (treated as categorical). 'month' is NOT dropped - it stays in the frame as
# the numeric strings produced by the earlier recoding.
lr_features = df.drop(['x41', 'x6', 'continent', 'day', 'x32'], axis=1)

# Alternative - drop all rows with NA. This is taken from the un-imputed
# frame; calling dropna() after the mean fill below would leave the numeric
# columns with nothing to drop.
lr_df_no = lr_features.dropna()

# Fill in NA with the column mean - LR needs a value in each cell.
# numeric_only=True skips the string-valued 'month' column, on which mean()
# raises a TypeError in pandas >= 2.0.
lr_df = lr_features.fillna(lr_features.mean(numeric_only=True))

In [192]:
# Row count of the modelling frame
lr_df.shape[0]

160000

In [193]:
# Separate the predictors from the target column 'y'
X = lr_df.drop(columns='y')
y = lr_df['y']

# Hold out 30% of the rows for testing, then fit a default logistic regression
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
logreg = LogisticRegression().fit(X_train, y_train)
logreg



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [195]:
# Predict on the held-out rows and report mean accuracy as a percentage
y_pred = logreg.predict(X_test)
test_accuracy_pct = logreg.score(X_test, y_test) * 100
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy_pct), '%')

Accuracy of logistic regression classifier on test set: 70.27 %


In [196]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Keep the result under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so the function could
# not be called again without re-importing it.
lr_confusion = confusion_matrix(y_test, y_pred)
print(lr_confusion)

[[23697  4924]
 [ 9345 10034]]


In [20]:
# Per-class precision, recall, F-measure and support for the test predictions
from sklearn.metrics import classification_report

lr_report = classification_report(y_test, y_pred)
print(lr_report)

              precision    recall  f1-score   support

           0       0.72      0.83      0.77     28621
           1       0.67      0.52      0.58     19379

   micro avg       0.70      0.70      0.70     48000
   macro avg       0.69      0.67      0.68     48000
weighted avg       0.70      0.70      0.69     48000



In [None]:
# ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the predicted probability of class 1 (second predict_proba column) for
# both the AUC and the curve. Scoring the AUC on hard 0/1 predictions instead
# gives the area of a single-threshold curve, which does not match the curve
# drawn below.
positive_proba = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, positive_proba)
fpr, tpr, thresholds = roc_curve(y_test, positive_proba)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Save before show() so the written file contains the drawn figure
plt.savefig('Log_ROC')
plt.show()

## Random Forest

In [33]:
# Data set for the Random Forest section.
# The earlier approach (drop x41/x6 from df, then mean-fill) is kept below for
# reference but disabled:
#rf_df = df.drop(['x41', 'x6'], axis=1)
#rf_df = rf_df.fillna(lr_df.mean())

# rf_df is bound to the same object as lr_df (plain assignment, not a copy)
rf_df = lr_df

In [None]:
# One-hot encode the data using pandas get_dummies
#features = pd.get_dummies(rf_df)

# Display the first 5 rows of the last 12 columns
#features.iloc[:,5:].head(5)

In [200]:
# Feature of Importance
# ref: https://towardsdatascience.com/running-random-forests-inspect-the-feature-importances-with-this-code-2b00dd72b92e

## Import the random forest model.
from sklearn.ensemble import RandomForestClassifier

# Predictors / target taken from the mean-imputed modelling frame
y = lr_df['y']
X = lr_df.drop('y', axis = 1)

# Splitting data set (70% train / 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

## Initiating Random Forest Classifier.
# A fixed random_state makes the fitted forest - and therefore the accuracy and
# the feature importances read from it - repeatable across runs. The same seed
# as the train/test split is used.
rf = RandomForestClassifier(random_state=123)

## Fitting model on training data.
rf.fit(X_train, y_train)

## Accuracy Score (mean accuracy on the held-out rows)
rf.score(X_test, y_test)




0.8800625

In [201]:
# Importance of each training column according to the fitted forest, largest first
importance_by_feature = pd.Series(
    rf.feature_importances_, index=X_train.columns, name='importance'
)
feature_importances = importance_by_feature.to_frame().sort_values(
    by='importance', ascending=False
)
print(feature_importances)

       importance
x23      0.080533
x20      0.065573
x48      0.059752
x49      0.059402
x38      0.054917
x42      0.052851
x12      0.051718
x37      0.049289
x27      0.048119
x28      0.045811
x7       0.043381
x40      0.042580
x46      0.041886
x2       0.038460
x18      0.009328
x47      0.009145
x31      0.009071
x3       0.009013
x5       0.008983
x25      0.008943
x11      0.008909
x16      0.008867
x45      0.008860
x43      0.008860
x26      0.008832
x14      0.008819
x39      0.008736
x22      0.008732
x13      0.008723
x21      0.008699
x8       0.008679
x0       0.008610
x36      0.008609
x44      0.008600
x33      0.008574
x4       0.008551
x19      0.008550
x15      0.008536
x10      0.008516
x34      0.008469
x35      0.008445
x17      0.008441
x1       0.008357
x9       0.008327
month    0.003945


## Decision Tree

In [202]:
#ref: https://towardsdatascience.com/decision-tree-in-python-b433ae57fb93
#https://dataaspirant.com/2017/02/01/decision-tree-algorithm-python-with-scikit-learn/
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO 
from IPython.display import Image 
from pydot import graph_from_dot_data
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [203]:
# Decision tree with the default (Gini) criterion, seeded for repeatability
dt = DecisionTreeClassifier(random_state=123).fit(X_train, y_train)
dt

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [204]:
# Test-set accuracy of the Gini tree, as a percentage
y_pred = dt.predict(X_test)
gini_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print ("Accuracy is ", gini_accuracy_pct, 'with Gini Index.')

Accuracy is  83.75416666666666 with Gini Index.


In [205]:
# Same tree setup, but splitting on entropy (information gain)
dt_ent = DecisionTreeClassifier(random_state=123, criterion='entropy').fit(X_train, y_train)
dt_ent


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=123,
            splitter='best')

In [32]:
# Test-set accuracy of the entropy tree, as a percentage
y_pred = dt_ent.predict(X_test)
entropy_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print ("Accuracy is ", entropy_accuracy_pct, 'with Information Gain.')

Accuracy is  84.425 with Information Gain.


## Decision Tree and Logistic Regressions using Features of Importance
Summary: The Decision Tree yields the best accuracy, 86.85%, using a max_depth of 15 with the Gini index (entropy reaches 86.39% at the same depth). Logistic Regression showed only a tiny improvement, from 70.27% to 70.33%. The recommendation is to go with the Decision Tree.

In [169]:
# Keep only the features whose random-forest importance exceeded .04, plus the target.
# (A tighter cut at importance > 0.05 - x23, x20, x48, x49, x38 - yielded a worse accuracy.)
important_cols = [
    'x23', 'x20', 'x48', 'x49', 'x38', 'x12', 'x42',
    'x27', 'x40', 'x37', 'x28', 'x7', 'x2', 'x46',
    'y',
]
rf_df_1 = lr_df[important_cols]

print(rf_df_1.head())
print(rf_df_1.shape)

         x23       x20       x48        x49        x38        x12       x42  \
0   3.553013 -1.909114  0.151589  -8.040166  -1.353729  25.665413  5.414063   
1  10.590601 -5.809984 -0.320283  16.719974  32.816804 -25.014934  4.490915   
2  -5.270615  1.700321 -2.090804  -7.869421  -0.333199  12.078602  9.088864   
3 -11.484431  1.923670  1.806070  -7.670847  14.188669  10.995330 -7.467775   
4 -15.998166 -9.026317 -0.894942  15.724742 -12.578926 -28.106348 -5.229937   

        x27        x40      x37        x28         x7         x2        x46  y  
0  1.005131 -10.612200  1313.96 -18.473784 -14.789997   4.621113  60.781427  0  
1  0.751086   2.147427  1962.78   3.749377  -6.725709  27.839856  15.805696  0  
2  4.171088  -0.863137   430.47  11.522448  11.060572  12.251561  30.856417  0  
3  9.215569  12.084421 -2366.29  30.595226 -18.913592 -24.149632 -72.424569  0  
4  1.811182  30.004727  -620.66  -4.094084  27.532281 -11.352593 -14.085435  1  
(160000, 15)


### Decision Tree w/ Feature of Importance

In [170]:
# Predictors / target from the reduced feature set
X_1 = rf_df_1.drop(columns='y')
y_1 = rf_df_1['y']

# 70/30 train/test split with a fixed seed
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X_1, y_1, test_size=0.3, random_state=123
)

In [171]:
# Gini tree on the reduced feature set; report test accuracy as a percentage
dt = DecisionTreeClassifier(random_state=123).fit(X_train1, y_train1)
y_pred1 = dt.predict(X_test1)
gini_accuracy_pct = accuracy_score(y_test1, y_pred1) * 100
print ("Accuracy is ", gini_accuracy_pct, 'with Gini Index.')

Accuracy is  85.87708333333333 with Gini Index.


In [172]:
# Entropy tree on the reduced feature set; report test accuracy as a percentage
dt_ent = DecisionTreeClassifier(random_state=123, criterion='entropy').fit(X_train1, y_train1)
y_pred1 = dt_ent.predict(X_test1)
entropy_accuracy_pct = accuracy_score(y_test1, y_pred1) * 100
print ("Accuracy is ", entropy_accuracy_pct, 'with Information Gain.')

Accuracy is  86.03541666666666 with Information Gain.


In [173]:
# Sweep the tree's max_depth (Gini criterion) and print the test accuracy for each value.
# Note: despite its name, n_est holds candidate depths.
n_est = [5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 25, 50, 100]
for depth in n_est:
    dt = DecisionTreeClassifier(max_depth=depth, random_state=123).fit(X_train1, y_train1)
    y_pred1 = dt.predict(X_test1)
    depth_accuracy_pct = accuracy_score(y_test1, y_pred1) * 100
    print ("Accuracy is ", depth_accuracy_pct, 'with Gini Index at', depth, 'depth')

Accuracy is  76.93541666666667 with Gini Index at 5 depth
Accuracy is  84.64791666666667 with Gini Index at 10 depth
Accuracy is  85.25416666666666 with Gini Index at 11 depth
Accuracy is  86.15416666666667 with Gini Index at 12 depth
Accuracy is  86.34791666666666 with Gini Index at 13 depth
Accuracy is  86.65625 with Gini Index at 14 depth
Accuracy is  86.85208333333333 with Gini Index at 15 depth
Accuracy is  86.55000000000001 with Gini Index at 16 depth
Accuracy is  86.69583333333334 with Gini Index at 17 depth
Accuracy is  86.63125000000001 with Gini Index at 18 depth
Accuracy is  86.47083333333333 with Gini Index at 19 depth
Accuracy is  86.41041666666666 with Gini Index at 20 depth
Accuracy is  86.21041666666667 with Gini Index at 22 depth
Accuracy is  85.95208333333333 with Gini Index at 25 depth
Accuracy is  85.87708333333333 with Gini Index at 50 depth
Accuracy is  85.87708333333333 with Gini Index at 100 depth


In [174]:
# Sweep the tree's max_depth (entropy criterion) and print the test accuracy for each value.
# Note: despite its name, n_est holds candidate depths.
n_est = [5, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 22, 25, 50, 100]
for depth in n_est:
    dt = DecisionTreeClassifier(
        criterion='entropy', max_depth=depth, random_state=123
    ).fit(X_train1, y_train1)
    y_pred1 = dt.predict(X_test1)
    depth_accuracy_pct = accuracy_score(y_test1, y_pred1) * 100
    print ("Accuracy is ", depth_accuracy_pct, 'with Information Gain at', depth, 'depth')

Accuracy is  76.02499999999999 with Information Gain at 5 depth
Accuracy is  83.61041666666667 with Information Gain at 10 depth
Accuracy is  84.32916666666667 with Information Gain at 11 depth
Accuracy is  85.00208333333333 with Information Gain at 12 depth
Accuracy is  85.55416666666666 with Information Gain at 13 depth
Accuracy is  85.77291666666666 with Information Gain at 14 depth
Accuracy is  86.38958333333333 with Information Gain at 15 depth
Accuracy is  86.33541666666666 with Information Gain at 16 depth
Accuracy is  86.3 with Information Gain at 17 depth
Accuracy is  86.22916666666667 with Information Gain at 18 depth
Accuracy is  86.25625 with Information Gain at 19 depth
Accuracy is  86.12291666666667 with Information Gain at 20 depth
Accuracy is  86.17291666666667 with Information Gain at 22 depth
Accuracy is  85.99166666666666 with Information Gain at 25 depth
Accuracy is  86.03541666666666 with Information Gain at 50 depth
Accuracy is  86.03541666666666 with Information 

### Logistic Regression w/ Feature of Importance

In [175]:
# Logistic regression refit on the reduced feature set
logreg = LogisticRegression(random_state=123).fit(X_train1, y_train1)
logreg



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [180]:
# Predict on the reduced-feature test rows and report mean accuracy as a percentage
y_pred1 = logreg.predict(X_test1)
test_accuracy_pct = logreg.score(X_test1, y_test1) * 100
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy_pct), '%')

Accuracy of logistic regression classifier on test set: 70.33 %


In [197]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Keep the result under its own name. Assigning it to `confusion_matrix`
# would replace the imported function with an array, so the function could
# not be called again without re-importing it.
lr_confusion_1 = confusion_matrix(y_test1, y_pred1)
print(lr_confusion_1)

[[23838  4998]
 [ 9243  9921]]


In [198]:
# Per-class precision, recall, F-measure and support for the reduced-feature predictions
from sklearn.metrics import classification_report

lr_report_1 = classification_report(y_test1, y_pred1)
print(lr_report_1)

              precision    recall  f1-score   support

           0       0.72      0.83      0.77     28836
           1       0.66      0.52      0.58     19164

   micro avg       0.70      0.70      0.70     48000
   macro avg       0.69      0.67      0.68     48000
weighted avg       0.70      0.70      0.70     48000

