In [1]:
import tkinter
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
#import tensorflow as tf
#from tensorflow.keras import layers
#from tensorflow.keras import metrics
from sklearn import datasets
import pandas as pd
import numpy as np
import pandas_profiling as pp

In [2]:
# Load the project data set from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="final_project.csv")

In [None]:
# Quick automated exploratory data analysis of the raw frame
raw_profile = pp.ProfileReport(df)
raw_profile

In [3]:
# Columns left out of modelling: x41 and x6 because of high correlation,
# the remaining ones because they are categorical.
columns_to_drop = ['x41', 'x6', 'x37', 'x30', 'x29', 'x24', 'x32']
lr_df = df.drop(columns=columns_to_drop)
# Fill the remaining missing values with the mean of their column
lr_df = lr_df.fillna(value=lr_df.mean())

In [None]:
# Profile the frame again after the column drops and mean imputation
cleaned_profile = pp.ProfileReport(lr_df)
cleaned_profile

## Logistic Regression

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
# Configure the seaborn theme with a single call. Each sns.set call resets all
# theme parameters, so setting style="white" first and "whitegrid" right after
# leaves only the "whitegrid" settings in effect.
sns.set(style="whitegrid", color_codes=True)

In [5]:
# Separate the predictors from the target column
X = lr_df.drop(columns=['y'])
y = lr_df['y']

# Hold out 30% of the rows for evaluation, then fit a logistic regression
# with default settings on the remaining rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
logreg = LogisticRegression()
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
# Predict the held-out rows and report the classifier's score on the test set
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.70


In [7]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix

# Keep the result under its own name. Binding it to `confusion_matrix` would
# replace the imported function with an array, so any later call to
# confusion_matrix(...) in another cell would fail.
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

[[23703  4918]
 [ 9351 10028]]


In [8]:
# Per-class precision, recall, F-measure and support on the test set
from sklearn.metrics import classification_report
report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.72      0.83      0.77     28621
           1       0.67      0.52      0.58     19379

   micro avg       0.70      0.70      0.70     48000
   macro avg       0.69      0.67      0.68     48000
weighted avg       0.70      0.70      0.69     48000



In [None]:
# ROC Curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Score each test row with the predicted probability of class 1 and use the
# same scores for both the curve and the AUC. Computing the AUC from hard 0/1
# predictions evaluates a single threshold only, so the number in the legend
# would not be the area under the curve that is plotted.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# Write the figure to disk before showing it
plt.savefig('Log_ROC')
plt.show()

## Random Forest

In [9]:
# Data set for the random forest.
# Disabled variant kept for reference: drop only the highly correlated
# features and replace NaN with the column means.
#rf_df = df.drop(['x41', 'x6'], axis=1)
#rf_df = rf_df.fillna(lr_df.mean())

# Reuse the frame prepared for the logistic regression. This binds a second
# name to the same DataFrame object; no copy is made.
rf_df = lr_df

In [None]:
# One-hot encode the data using pandas get_dummies
#features = pd.get_dummies(rf_df)

# Display the first 5 rows of all columns from position 5 onward
#features.iloc[:,5:].head(5)

In [10]:
# Target vector: the 'y' column as a NumPy array
labels = np.array(rf_df['y'])

# Predictor frame: every column except the target
feature_frame = rf_df.drop(columns=['y'])

# Keep the column names, since the array conversion below discards them
feature_list = feature_frame.columns.tolist()

# Model input as a plain NumPy array
features = np.array(feature_frame)

In [11]:
# Hold out 25% of the rows as a test set, using a fixed seed for the shuffle
train_features, test_features, train_labels, test_labels = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [12]:
# Report the dimensions of each piece of the split
for caption, split in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split.shape)

Training Features Shape: (120000, 43)
Training Labels Shape: (120000,)
Testing Features Shape: (40000, 43)
Testing Labels Shape: (40000,)


In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestClassifier

# The target is a binary class label (0/1), so fit a classifier rather than a
# regressor: predict() then returns class labels that can be compared directly
# with test_labels.
# 1000 decision trees; random_state fixes the seed used to build the forest and
# n_jobs=-1 builds the trees in parallel on all available cores.
rf = RandomForestClassifier(n_estimators = 1000, random_state = 42, n_jobs = -1)
# Train the model on training data
rf.fit(train_features, train_labels);

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Absolute difference between each prediction and its true label
errors = np.abs(predictions - test_labels)
# Print the mean absolute error (MAE). The labels are unitless class
# indicators, so no unit is appended to the value.
print('Mean Absolute Error:', round(np.mean(errors), 2))

In [None]:
# Accuracy as the percentage of test rows whose predicted class equals the
# true label. A mean absolute percentage error cannot be used for this target:
# it divides by the true label, and every row labelled 0 is a division by zero.
# Thresholding at 0.5 maps probability-like outputs to a class and leaves
# predictions that are already 0/1 unchanged.
predicted_classes = (np.asarray(predictions) >= 0.5).astype(int)
accuracy = 100 * np.mean(predicted_classes == test_labels)
# Calculate and display accuracy
print('Accuracy:', round(accuracy, 2), '%.')

In [None]:
# Importance score of every feature, in the same order as feature_list
importances = list(rf.feature_importances_)
# Pair each feature name with its importance rounded to two decimals
feature_importances = []
for name, score in zip(feature_list, importances):
    feature_importances.append((name, round(score, 2)))
# Order the pairs from most to least important
feature_importances.sort(key = lambda pair: pair[1], reverse = True)
# Print one line per feature
for name, score in feature_importances:
    print('Variable: {:20} Importance: {}'.format(name, score))

## Dimensionality Reduction - PCA

In [None]:
from sklearn.preprocessing import StandardScaler

features_unscaled = lr_df.drop('y', axis = 1)
y = lr_df['y'].values

# Standardize every feature to zero mean and unit variance before PCA.
# PCA centres the data but does not rescale it, so without this step the
# components are dominated by whichever columns have the largest variance.
# The scaled values are wrapped back into a DataFrame so that `x` keeps the
# original column names and row index.
x = pd.DataFrame(
    StandardScaler().fit_transform(features_unscaled),
    columns=features_unscaled.columns,
    index=features_unscaled.index,
)

In [None]:
from sklearn.decomposition import PCA

# Project the features onto their first two principal components.
# random_state pins the result for the case where scikit-learn selects its
# randomized SVD solver (possible with the default svd_solver='auto'), so
# repeated runs of the notebook produce the same components.
pca = PCA(n_components=2, random_state=42)
principalComponents = pca.fit_transform(x)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component 1', 'principal component 2'],
)

In [None]:
# Place the target column next to the two principal components
finalDf = pd.concat(objs=[principalDf, lr_df.loc[:, ['y']]], axis='columns')

In [None]:
# Display the feature frame that was passed to PCA
x