In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.svm import SVC
from xgboost import plot_importance


In [None]:
# Load the dataset from a CSV file relative to the working directory,
# then preview the first rows as the cell output.
data = pd.read_csv('./dataset_30.csv')
data.head()

In [None]:
# (rows, columns) of the loaded frame
data.shape

In [None]:
# Column labels of the loaded frame (features plus the target column)
data.columns

In [None]:
# Per-column dtype and non-null counts
data.info()

In [None]:
# Data distribution: DataFrame.hist draws one 50-bin histogram per numeric
# column, laid out on a single 15x15-inch figure.
data.hist(bins = 50,figsize = (15,15))
plt.show()

In [None]:
# Correlation heatmap
# Create the figure and its axes explicitly, then draw the pairwise column
# correlations of `data` onto those axes.
fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(data.corr(), ax=ax)
plt.show()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column
data.describe()

In [None]:
# Number of missing values in each column
data.isnull().sum()

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement) and
# renumber the index from 0. random_state is fixed so that a fresh
# "Restart & Run All" produces the same row order, and therefore the same
# downstream split and scores.
data2 = data.sample(frac=1, random_state=12).reset_index(drop=True)
data2.head()

In [None]:
# Separate the feature columns from the 'Prediction' target column and show
# both shapes as the cell output.
X = data2.drop(columns='Prediction')
y = data2['Prediction']
X.shape, y.shape

In [None]:
# Hold out 20% of the rows for evaluation and train on the remaining 80%.
# The previous test_size of 0.8 inverted that ratio, fitting every model on
# only a fifth of the data. random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y.values, test_size=0.2, random_state=12
)
X_train.shape, X_test.shape

In [None]:
# Parallel lists holding one entry per evaluated model; they are turned into
# a comparison table once all models have been trained.
ML_Model = []
acc_train = []
acc_test = []


def storeResults(model, a, b):
    """Record the train/test accuracy of a model for the final comparison.

    Parameters
    ----------
    model : str
        Display name of the model (used as its key in the results).
    a : float
        Accuracy on the training data.
    b : float
        Accuracy on the test data.

    Both accuracies are rounded to 3 decimals. If ``model`` has already been
    recorded, its scores are overwritten instead of appended, so re-running a
    model cell does not create duplicate rows in the results table.
    """
    train_score = round(a, 3)
    test_score = round(b, 3)
    if model in ML_Model:
        # Re-run of the same model cell: update the existing entry in place.
        position = ML_Model.index(model)
        acc_train[position] = train_score
        acc_test[position] = test_score
        return
    ML_Model.append(model)
    acc_train.append(train_score)
    acc_test.append(test_score)

In [None]:
# Decision Tree model

# random_state pins the random feature permutation scikit-learn uses when
# candidate splits tie, so the fitted tree and the accuracies below are the
# same on every run.
tree = DecisionTreeClassifier(max_depth=5, random_state=12)
tree.fit(X_train, y_train)

# Predictions on both splits, used to compare train vs. test accuracy.
y_test_tree = tree.predict(X_test)
y_train_tree = tree.predict(X_train)

acc_train_tree = accuracy_score(y_train, y_train_tree)
acc_test_tree = accuracy_score(y_test, y_test_tree)

# Register the scores for the final comparison table.
storeResults('Decision Tree', acc_train_tree, acc_test_tree)

print("Decision Tree: Accuracy on training Data: {:.3f}".format(acc_train_tree))
print("Decision Tree: Accuracy on test Data: {:.3f}".format(acc_test_tree))

In [None]:
# Feature importance in Decision Tree

plt.figure(figsize=(9,7))
n_features = X_train.shape[1]
plt.barh(range(n_features), tree.feature_importances_)
# Label each bar with its column name instead of a bare index. X_train was
# built from X.values, so the column order matches X.columns.
plt.yticks(np.arange(n_features), X.columns)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()

In [None]:
# Random Forest model

# random_state fixes the bootstrap samples and per-split feature subsets, so
# the forest and its reported accuracies are reproducible across runs.
forest = RandomForestClassifier(max_depth=5, random_state=12)
forest.fit(X_train, y_train)

# Predictions on both splits, used to compare train vs. test accuracy.
y_test_forest = forest.predict(X_test)
y_train_forest = forest.predict(X_train)

acc_train_forest = accuracy_score(y_train, y_train_forest)
acc_test_forest = accuracy_score(y_test, y_test_forest)

# Register the scores for the final comparison table.
storeResults('Random Forest', acc_train_forest, acc_test_forest)

print("Random forest: Accuracy on training Data: {:.3f}".format(acc_train_forest))
print("Random forest: Accuracy on test Data: {:.3f}".format(acc_test_forest))

In [None]:
# Feature importance in Random Forest

plt.figure(figsize=(9,7))
n_features = X_train.shape[1]
plt.barh(range(n_features), forest.feature_importances_)
# Label each bar with its column name instead of a bare index. X_train was
# built from X.values, so the column order matches X.columns.
plt.yticks(np.arange(n_features), X.columns)
plt.xlabel("Feature importance")
plt.ylabel("Feature")
plt.show()

In [None]:
#K Nearest Neighbour Classification

# NOTE(review): with n_neighbors=1 each training point is typically its own
# nearest neighbour, so the training accuracy is expected to be ~1.0 and is
# not an informative score.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, np.ravel(y_train, order='C'))

# Predict each split exactly once and reuse the result. The original cell
# ran the test-set prediction three times (y_predict, y_test_knn, knn.score),
# which is wasteful because KNN does its work at prediction time.
y_test_knn = knn.predict(X_test)
y_train_knn = knn.predict(X_train)
y_predict = y_test_knn  # alias kept so the existing name stays available

acc_train_knn = accuracy_score(y_train, y_train_knn)
acc_test_knn = accuracy_score(y_test, y_test_knn)
# knn.score(X_test, y_test) is the mean accuracy on the test set, i.e. the
# same value as acc_test_knn, so no extra prediction pass is needed.
model_score = acc_test_knn

print("KNeighborsClassifier: Accuracy on the Model: ",model_score)
storeResults('KNeighborsClassifier', acc_train_knn, acc_test_knn)

print("KNeighborsClassifier: Accuracy on training Data: {:.3f}".format(acc_train_knn))
print("KNeighborsClassifier: Accuracy on test Data: {:.3f}".format(acc_test_knn))
# Per-class precision/recall/F1 and the confusion matrix for the test split.
print(metrics.classification_report(y_test, y_predict))
print(metrics.confusion_matrix(y_test, y_predict))

In [None]:
# XGBoost Classification model
# NOTE(review): recent xgboost releases expect class labels encoded as
# 0..n_classes-1 -- confirm the 'Prediction' values satisfy that.

xgb = XGBClassifier(max_depth=7, learning_rate=0.4)
xgb.fit(X_train, y_train)

# Predict on the training split first, then on the held-out split.
y_train_xgb = xgb.predict(X_train)
y_test_xgb = xgb.predict(X_test)

acc_train_xgb, acc_test_xgb = (
    accuracy_score(y_train, y_train_xgb),
    accuracy_score(y_test, y_test_xgb),
)

# Register the scores for the final comparison table.
storeResults('XGBoost', acc_train_xgb, acc_test_xgb)

print("XGBoost: Accuracy on training Data: {:.3f}".format(acc_train_xgb))
print("XGBoost : Accuracy on test Data: {:.3f}".format(acc_test_xgb))

In [None]:
# Plot the fitted XGBoost model's feature importances with xgboost's
# plot_importance helper.
plot_importance(xgb)
plt.show()

In [None]:
# Support vector machine model
# Linear-kernel SVC with regularisation strength C=1.0.

svm = SVC(C=1.0, kernel='linear', random_state=12)
svm.fit(X_train, y_train)

# Predict on the training split first, then on the held-out split.
y_train_svm = svm.predict(X_train)
y_test_svm = svm.predict(X_test)

acc_train_svm, acc_test_svm = (
    accuracy_score(y_train, y_train_svm),
    accuracy_score(y_test, y_test_svm),
)

# Register the scores for the final comparison table.
storeResults('SVM', acc_train_svm, acc_test_svm)

print("SVM: Accuracy on training Data: {:.3f}".format(acc_train_svm))
print("SVM : Accuracy on test Data: {:.3f}".format(acc_test_svm))

In [None]:
# Assemble the accuracies recorded via storeResults into one comparison
# table, one row per model, and display it.
results = pd.DataFrame(
    {
        'ML Model': ML_Model,
        'Train Accuracy': acc_train,
        'Test Accuracy': acc_test,
    }
)
results

In [None]:
# Sort the dataframe on accuracy: highest test accuracy first, ties broken by
# highest train accuracy. The sorted copy is only displayed; `results` itself
# is left unchanged.
results.sort_values(['Test Accuracy', 'Train Accuracy'], ascending=[False, False])

In [None]:
# XGBoost model to file
import pickle

# Serialise the fitted XGBoost classifier to xgb.pkl in the working
# directory. The context manager guarantees the file handle is flushed and
# closed even if pickling fails; the original left the handle open.
with open("xgb.pkl", "wb") as model_file:
    pickle.dump(xgb, model_file)