In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import tree


from data_cleaning_functions import read_data, encode_label, standardize, remove_outliers
from decision_forest_algorithms import forest_predict

# Scratch
from KNN import KNN as KNN_Scratch

# Notebook display settings: never truncate the column list, and allow 200 characters per line.
pd.options.display.max_columns = None
pd.options.display.width = 200

In [None]:
# Load the dataset through the project helper. read_data() is called without arguments,
# so where the data comes from is decided inside the helper (not visible in this notebook).
data = read_data()

# Preprocessing

In [None]:
# Encode the boolean Revenue label as a number (True becomes 1).
# NOTE(review): the return value is ignored, so this presumably modifies `data` in place — confirm.
encode_label(data)


In [None]:
# Standardize the dataset; the helper's return value replaces `data`.
data = standardize(data)
# NOTE(review): the result of remove_outliers() is discarded. This call only has an effect
# if the helper modifies `data` in place; if it returns a filtered frame instead, it must be
# assigned back (data = remove_outliers(data)) — confirm against data_cleaning_functions.
remove_outliers(data)

In [None]:
# Feature matrix: a new frame holding every column except the target.
X = data.drop(columns=['Revenue'])
# Target vector: an independent copy of the label column.
y = data['Revenue'].copy()

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=211,
)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# Shallow random forest (trees capped at depth 3) fitted on the training split.
# random_state fixes the forest's internal randomness so the reported accuracy and the
# plotted tree are the same on every Restart & Run All; the seed matches the one used
# for train_test_split.
# The trailing ';' keeps the notebook from echoing the fitted estimator.
model = RandomForestClassifier(max_depth=3, random_state=211)
model.fit(X_train, y_train);

In [None]:
# Score the current model on the held-out split and report it with two decimals.
rf_test_accuracy = model.score(X_test, y_test)
print("Test set accuracy random forest: {:.2f}".format(rf_test_accuracy))

In [None]:
# Draw the first tree of the fitted forest.
plt.figure(figsize=(20,20))
# class_names must be listed in ascending numerical order of the classes. Revenue was
# encoded with True -> 1, so class 0 is "No Revenue" and class 1 is "Revenue"; the
# previous order labelled every leaf with the opposite class.
# feature_names is passed as a plain list because some scikit-learn releases reject a
# pandas Index for this parameter.
_ = tree.plot_tree(
    model.estimators_[0],
    feature_names=list(X.columns),
    class_names=["No Revenue", "Revenue"],
    filled=True,
)

from sklearn.model_selection import GridSearchCV
# Disabled random-forest hyper-parameter search. It is wrapped in a string literal so it
# never executes, and the trailing ';' stops the notebook from echoing the string.
# NOTE(review): the snippet refers to `rfclf` and `x`, neither of which is defined in the
# visible cells; those names must be fixed before the search can be re-enabled.
'''
param_grid = {
    'n_estimators' : [60,75,90,115,130],
    'max_depth' : [7,14,21,28],
    'min_samples_leaf' : [1,2,3,4],
    'min_samples_split': [2,4,6,8]
}
gridsearch = GridSearchCV(estimator=rfclf,param_grid=param_grid,verbose = 1)
gridsearch.fit(x,y)
''';

# Random Forest (from scratch)

In [None]:
# Predict Revenue with the from-scratch forest implementation.
# A copy of the full dataset is passed because, per the original author's note, the
# forest algorithms modify the frame they receive; `data` itself stays untouched.
data_frame_scaled = data.copy()
forest_predict(data_frame_scaled)

# K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep k = 1..11 and record the test-set accuracy obtained for each value.
# NOTE(review): every k is scored on the test split, so the k chosen from this sweep is
# tuned on the same data that is later used to report accuracy.
num_neighbors = list(range(1, 12))
accuracy = []

for i in num_neighbors:
    model = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    acc = model.score(X_test, y_test)
    accuracy.append(acc)
    print("Neighbors: {}, test set accuracy: {}".format(i, acc))
    

In [None]:
# Plot test accuracy against k for the KNN sweep.
plt.scatter(num_neighbors, accuracy)
plt.xlabel("Number of neighbors (k)")
plt.ylabel("Test set accuracy")
plt.title("KNN accuracy by number of neighbors")

# Report the best k from the measured accuracies instead of a hard-coded value, so the
# message stays correct if the data, split or preprocessing changes. Ties go to the
# smallest k because list.index() returns the first maximum.
best_k = num_neighbors[accuracy.index(max(accuracy))]
print("{} Neighbors optimal".format(best_k))

# K-Nearest Neighbors (from scratch)

In [None]:
# Inspect the training feature dtypes; the scratch KNN below receives the raw .values array.
X_train.dtypes

In [None]:
# Inspect the test feature dtypes; the scratch KNN below receives the raw .values array.
X_test.dtypes

In [None]:
# Fit the from-scratch KNN with k=3 and predict labels for the test split.
# The model is given plain arrays (.values) rather than pandas objects.
model = KNN_Scratch(k=3)
model.fit(X_train.values, y_train.values)
predictions = model.predict(X_test.values)

# Elapsed time as recorded by the model's own start_time / end_time attributes.
# NOTE(review): which phase those attributes bracket (fit, predict or both) and their
# unit are set inside KNN_Scratch and are not visible here — confirm.
run_time = model.end_time - model.start_time


In [None]:
# Fraction of test rows where the scratch KNN prediction equals the true label.
matches = predictions == y_test
acc = sum(matches) / len(predictions)
print('Accuracy from KNN Scratch: ' + str(acc))

In [None]:
# Report the elapsed time taken from the scratch KNN model's timing attributes.
# NOTE(review): the "s" suffix assumes the timestamps are in seconds — confirm in KNN_Scratch.
print("Run time for knn from scratch: ", run_time, "s")

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with scikit-learn's default settings, fitted on the training split.
# fit() is the cell's last expression, so the notebook displays the fitted estimator.
model = LogisticRegression()
model.fit(X_train,y_train)

In [None]:
# Score the current `model` on the held-out test split. This is the logistic regression
# only when the cells are run in order, because `model` is reused for every classifier.
acc = model.score(X_test, y_test)
# Bare last expression so the notebook displays the value.
acc

# Support Vector Machine (SVM)

In [None]:
from sklearn.svm import SVC
# Candidate C and gamma values for an SVC hyper-parameter search.
# NOTE(review): nothing in the visible cells reads this dict; the SVC fitted below uses
# default parameters and the grid search further down is commented out.
param_grid = {'C':[0.1,0.5,1,50,150],
              'gamma' : [0.1,1,10,100,1000]}

In [None]:
# Support vector classifier with default parameters: fit on the training split, then
# score on the held-out test split.
model = SVC().fit(X_train, y_train)
acc = model.score(X_test, y_test)

In [None]:
# Display the most recently computed test accuracy (the SVC's when cells are run in order).
acc

#### Finding optimal parameters

In [None]:
#param_grid = {'C':[0.1,1,10,100,1000],
#              'kernel':['rbf'],
#              'gamma' : [0.1,1,10,100,1000]}
#gridsearch = GridSearchCV(SVC(),param_grid = param_grid,verbose = 1)
#gridsearch.fit(X_train,y_train)

# Gaussian Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with default parameters: fit on the training split, then score
# on the held-out test split.
model = GaussianNB().fit(X_train, y_train)
acc = model.score(X_test, y_test)

In [None]:
# Display the most recently computed test accuracy (GaussianNB's when cells are run in order).
acc