In [None]:
import numpy as np                                       # for dealing with data
import pandas as pd                                      # for reading data
import matplotlib.pyplot as plt                          # for plotting 
import sklearn                                           # for machine learning models
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from google.colab import drive

In [None]:
# Mount Google Drive so files stored there are readable from this Colab runtime.
drive.mount('/content/drive')

# Read the CSV dataset from Drive into a DataFrame.
csv_path = "/content/drive/MyDrive/Colab Notebooks/traincombinefinal.csv"
data = pd.read_csv(csv_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Features: every row, the first 68 columns (positions 0-67).
X = data.iloc[:, :68]
# Target: every row, the single column at position 68.
y = data.iloc[:, 68]

In [None]:
# Hold out 20% of the rows as a test set. The fixed random_state seeds the
# shuffle, so the same train/test partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=55)

## ***Random Forest***

In [None]:
# Create the classifier.
# n_estimators is the number of trees in the forest.
# random_state pins the bootstrap sampling and feature selection so the fitted
# forest (and the accuracy reported below) is reproducible; the value matches
# the seed used for the train/test split.
# n_jobs=-1 builds the trees on all available cores; it does not change results.
classifier = RandomForestClassifier(n_estimators=500, random_state=55, n_jobs=-1)

# Train the model on the training set.
classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=500)

In [None]:
# Predict class labels for the held-out test set with the fitted random forest.
y_pred = classifier.predict(X_test)

In [None]:
# Fraction of test samples whose predicted label equals the true label.
RFC_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", RFC_acc)

Accuracy: 0.999951923076923


In [None]:
# Confusion matrix: rows are the true classes, columns the predicted classes,
# so for a binary problem the layout is [[TN, FP], [FN, TP]].
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[10343     0]
 [    1 10456]]


## **Support Vector Machine (SVM)**

In [None]:
# Support-vector classifiers are sensitive to feature scale, so the features are
# standardised (zero mean, unit variance) before reaching the SVC. Wrapping both
# steps in a pipeline means the scaler is fitted on the training data only and is
# applied automatically whenever `svclassifier.predict` is called.
# A linear kernel is used because the dataset has a large number of features and
# is assumed to be (close to) linearly separable -- NOTE(review): confirm.
svclassifier = make_pipeline(StandardScaler(), SVC(kernel='linear'))
svclassifier.fit(X_train, y_train)

In [None]:
# Predict the test set with the fitted SVM. Note this overwrites the
# random-forest predictions that were previously held in y_pred.
y_pred = svclassifier.predict(X_test)

# Fraction of test samples classified correctly by the SVM.
SVM_acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", SVM_acc)

In [None]:
# Confusion matrix (rows: true classes, columns: predicted classes).
print(confusion_matrix(y_true=y_test, y_pred=y_pred))
# Per-class precision, recall, f1-score and support.
print(classification_report(y_true=y_test, y_pred=y_pred))
# precision -> TP / (TP + FP): how many predicted positives are correct.
# recall    -> TP / (TP + FN): how many actual positives are found.
# f1-score  -> harmonic mean of precision and recall.

## **Gradient Boosting Classifier**

In [None]:
# Find a good learning rate: fit a 100-estimator, depth-1 gradient-boosting
# model for each candidate value and report its accuracy and confusion matrix.
# NOTE(review): the scores below are computed on the test split, so choosing a
# learning rate from them tunes on test data. A separate validation split or
# cross-validation on the training data would give an unbiased choice.
learning_rate = [0.01, 0.05, 0.1, 0.5, 1]
for n in learning_rate:
    # random_state pins the estimator's internal randomness so reruns match.
    gbc = GradientBoostingClassifier(
        n_estimators=100, learning_rate=n, max_depth=1, random_state=55
    )
    # np.ravel guarantees a 1-D label array for fit().
    gbc.fit(X_train, np.ravel(y_train, order='C'))
    ypred = gbc.predict(X_test)
    # Score the predictions already made; gbc.score() would predict a second time.
    acc = accuracy_score(y_test, ypred)
    print("Learning rate: ",n, "  Accuracy: ", acc)
    print("Confusion matrix:")
    print(confusion_matrix(y_test, ypred))

In [None]:
# Find a good number of estimators: fit a depth-1 gradient-boosting model with
# learning_rate=1 for each candidate size and report its accuracy and
# confusion matrix.
# NOTE(review): the scores below are computed on the test split, so choosing
# n_estimators from them tunes on test data. A separate validation split or
# cross-validation on the training data would give an unbiased choice.
estimators = [10, 50, 100, 200, 500]
for e in estimators:
    # random_state pins the estimator's internal randomness so reruns match.
    gbc = GradientBoostingClassifier(
        n_estimators=e, learning_rate=1, max_depth=1, random_state=55
    )
    # np.ravel guarantees a 1-D label array for fit().
    gbc.fit(X_train, np.ravel(y_train, order='C'))
    ypred = gbc.predict(X_test)
    # Score the predictions already made; gbc.score() would predict a second time.
    acc = accuracy_score(y_test, ypred)
    print("Number of estimators: ",e, "  Accuracy: ", acc)
    print("Confusion matrix:")
    print(confusion_matrix(y_test, ypred))

In [None]:
# Final gradient-boosting model: 500 depth-1 estimators with learning_rate=1.
# random_state pins the estimator's internal randomness so the fitted model and
# its reported accuracy are reproducible across runs.
gbc = GradientBoostingClassifier(
    n_estimators=500, learning_rate=1, max_depth=1, random_state=55
)
# np.ravel guarantees a 1-D label array for fit().
gbc.fit(X_train, np.ravel(y_train, order='C'))
ypred = gbc.predict(X_test)

In [None]:
# Mean accuracy of the fitted gradient-boosting model `gbc` on the test set.
# (The stray `GradientBoostingClassifier()` call that used to sit here built an
# unused, unfitted estimator and has been removed.)
GBC_acc = gbc.score(X_test, y_test)
print(GBC_acc)

In [None]:
# Horizontal bar chart comparing the test-set accuracy (in percent) of the
# three models. matplotlib is already imported as `plt` in the imports cell,
# so it is not re-imported here.
X_bar = ['Random Forest', 'SVM', 'Gradient Boosting Classifier']
Y_bar = [RFC_acc * 100, SVM_acc * 100, GBC_acc * 100]
plt.barh(X_bar, Y_bar, align='center', color=('#C4EE73', '#EEA773', '#73EED9'))
plt.xlabel("Performance Accuracy in Percentage")
plt.title("Test-set accuracy by model")
plt.show()