# Random Forest Classification

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle
from sklearn.metrics import f1_score

## Importing the dataset

In [None]:
# Load the full table from disk.
dataset = pd.read_csv('final.csv')

# Column indices 1..82 (82 columns) form the feature matrix; column index 84
# supplies the target vector. Column indices 0 and 83 are not used here.
feature_columns = slice(1, 83)
label_column = 84
X = dataset.iloc[:, feature_columns].values
y = dataset.iloc[:, label_column].values

In [None]:
from sklearn.impute import SimpleImputer

# Replace every NaN in X with the mean of its column.
# NOTE(review): the imputer is fitted on all rows, before the train/test split
# performed later in this file, so held-out rows contribute to the imputed
# means -- confirm this is intended.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)

# Persist the fitted imputer. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
filename = 'imputer.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(imputer, model_file)

In [None]:
# Show the (rows, columns) shape of the feature matrix after imputation.
print(X.shape)



(9997, 82)


In [None]:
# Sanity-check the imputed matrix element-wise.
# The test must be applied to every element *before* reducing: reducing first
# (X.any() / X.all()) collapses X to a single boolean, which is never NaN and
# always finite, so the check could never fail.
print(np.isnan(X).any())      # True if at least one element is NaN
print(np.isfinite(X).all())   # True only if every element is finite

False
True


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [None]:
# Show the (rows, columns) shape of the training split.
print(X_train.shape)

(7497, 82)


## Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Persist the fitted scaler. The context manager guarantees the file handle
# is flushed and closed even if pickling raises.
filename = 'scalerStatic.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(sc, model_file)

In [None]:
# Show the (rows, columns) shape of the training split after scaling.
print(X_train.shape)

(7497, 82)


In [None]:
# Inspect the test features after they were transformed by the fitted scaler.
print(X_test)

[[ 0.00000000e+00 -8.30042102e-01 -8.21160541e-02 ... -7.83378052e-02
  -6.68866889e-02 -3.02654758e-01]
 [ 0.00000000e+00 -4.34866790e-01 -8.21160541e-02 ... -6.40773967e-02
  -5.80531684e-02 -1.77777863e-01]
 [ 0.00000000e+00 -8.30042102e-01 -8.21160541e-02 ... -7.83378052e-02
  -6.68866889e-02 -3.02654758e-01]
 ...
 [ 0.00000000e+00 -4.34866790e-01 -8.21160541e-02 ...  3.21796263e-02
   2.45219819e-04  6.41720506e-01]
 [ 0.00000000e+00 -3.96914779e-02 -8.21160541e-02 ... -1.68401539e-02
  -6.24697130e-02  2.17825219e-01]
 [ 0.00000000e+00  1.14583446e+00 -8.21160541e-02 ... -6.76424716e-02
  -6.68862576e-02 -2.16313799e-01]]


## Applying PCA and training the Random Forest Classification model on the Training set

In [None]:
from sklearn.decomposition import PCA
# Project the scaled features onto principal components. 82 equals the number
# of feature columns printed earlier, so no dimensions are dropped here.
pca = PCA(n_components = 82)
# Fit the projection on the training split only ...
X_train = pca.fit_transform(X_train)
# ... and reuse that fitted projection for the test split.
X_test = pca.transform(X_test)

In [None]:
# Show the (rows, columns) shape of the training split after the PCA projection.
print(X_train.shape)

# Persist the fitted PCA object. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
filename = 'pcaStatic.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(pca, model_file)

(7497, 82)


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Ten trees split on the entropy criterion; the fixed seed keeps training
# reproducible between runs.
classifier = RandomForestClassifier(
    n_estimators=10,
    criterion='entropy',
    random_state=0,
)
# Kept as a bare trailing expression so the notebook still echoes the
# fitted estimator.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [None]:
# Persist the trained classifier. The context manager guarantees the file
# handle is flushed and closed even if pickling raises.
filename = 'staticAnalysisRandomForest.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(classifier, model_file)

## Predicting the Test set results

In [None]:
# Predict labels for the held-out rows.
y_pred = classifier.predict(X_test)

# Print predictions next to the true labels: column 0 is the prediction,
# column 1 is the ground truth.
print(np.column_stack((y_pred, y_test)))

[[1 1]
 [0 0]
 [1 1]
 ...
 [0 0]
 [1 1]
 [0 0]]


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Compute all evaluation metrics on the held-out split first, then report them.
cm = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
weighted_f1 = f1_score(y_test, y_pred, average='weighted')

print(cm)
print(accuracy)
print('f1_score =' + str(weighted_f1))

[[1219   16]
 [  52 1213]]
0.9728
f1_score =0.9727990599625853
