In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

np.set_printoptions(suppress=True)
df1 = pd.read_csv("C:/Users/arjun/jup/Project/training.txt",sep="\s+",header=None)
df1.columns=['ID','Features','Values']
mat1 = pd.pivot_table(df1, index='ID', columns='Features', values='Values')
mat1 = mat1.fillna(0)
df2 = pd.read_csv("C:/Users/arjun/jup/Project/label_training.txt",sep="\t",header=None)
df2.columns = ['Classes']

svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)
mat1 = svd.fit_transform(mat1)

xtrain, xtest = train_test_split(mat1, test_size=0.2, shuffle=False)
ytrain, ytest = train_test_split(df2, test_size=0.2, shuffle=False)

In [2]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

clf = RandomForestClassifier()
clf = RandomizedSearchCV(estimator = clf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [3]:
ytrain = np.ravel(ytrain)
ytest = np.ravel(ytest)
clf = clf.fit(xtrain, ytrain)
predY = clf.predict(xtest)
accuracy_score(ytest,predY)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   53.0s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  8.6min finished


0.9105691056910569

In [4]:
ytrain = np.ravel(ytrain)
ytest = np.ravel(ytest)

In [5]:
from yellowbrick.classifier import ClassPredictionError, ClassificationReport, ROCAUC, PrecisionRecallCurve, ConfusionMatrix
model = RandomForestClassifier()
classes = [1,1,-1]
viz = ClassificationReport(model, classes=classes, support=True)
viz.fit(xtrain,ytrain)
viz.score(xtest,ytest)
viz.show()

<Figure size 800x550 with 2 Axes>

<matplotlib.axes._subplots.AxesSubplot at 0x11c825a12b0>

In [6]:
ytest

array([ 1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1,  1, -1, -1, -1,  1,
       -1, -1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1,
       -1,  1, -1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1, -1,
       -1, -1, -1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1,
        1,  1, -1, -1, -1,  1, -1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,
        1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1,  1, -1, -1,  1,  1,  1,
        1, -1, -1,  1,  1, -1,  1,  1,  1, -1,  1, -1,  1,  1,  1,  1,  1,
        1, -1,  1, -1, -1,  1, -1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,
       -1, -1,  1, -1, -1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1,
        1, -1, -1,  1,  1, -1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,
       -1, -1, -1, -1,  1,  1, -1,  1,  1, -1, -1,  1,  1, -1, -1, -1,  1,
        1,  1,  1, -1,  1, -1,  1, -1,  1,  1,  1, -1,  1,  1, -1,  1, -1,
        1,  1, -1, -1, -1