In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 150
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore')

In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, accuracy_score

In [68]:
data = pd.read_csv('drive/MyDrive/DATA_310/Project4_Data.csv')

In [69]:
data

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6,Col7,ClassificationLabel
0,Word1 Word2 Word3 Word4 Word5,44912133,3575.00,Word771 Word772,8/31/2018,Word560 Word561 Word366 Word562,Doc1,Category_1
1,Word6 Word7,10719,1092.50,Word711 Word773 Word774,8/31/2018,Word563 Word366 Word562,NoDoc,Category_2
2,Word8 Word9 Word10 Word11 Word5,4.80Z+11,75,Word775 Word776,10/31/2018,Word563 Word564 Word9 Word26,NoDoc,Category_1
3,Word12 Word13 Word7,10731,30980.00,Word777 Word45 Word160 Word45 Word778,1/1/2018,Word565 Word566,NoDoc,Category_1
4,Word8 Word9 Word10 Word11 Word5,4.80F+11,3766.80,Word779 Word258 Word780,1/1/2018,Word567 Word568 Word569 Word570,Doc1,Category_1
...,...,...,...,...,...,...,...,...
5894,Word177 Word178 Word147 Word7,180LSY,16880.00,Word2127 Word705 Word825,3/1/2018,Word576 Word366 Word562,Doc3,Category_1
5895,Word16 Word17 Word18 Word7,BL398,4200.00,Word785 Word330 Word99 Word26,6/30/2018,Word563 Word564 Word9 Word26,NoDoc,Category_1
5896,Word212 Word213 Word214 Word125 Word119 Word39,74211,170,Word1018 Word190 Word1403,1/1/2018,Word563 Word608 Word609,NoDoc,Category_2
5897,Word148 Word75 Word5,11695,14376.00,Word846 Word991,4/1/2018,Word560 Word561 Word315 Word638,Doc1,Category_1


Decide which columns have useful information for classification

In [6]:
len(data.Col2.unique())

4818

In [8]:
# check how many unique dates there are
len(data.Col5.unique())

17

Lets try with just the text (Columns 1, 4, and 6)

In [9]:
data.ClassificationLabel.unique()

array(['Category_1', 'Category_2', 'Category_3', 'Categry_6', 'Category4',
       'Category_6', 'Category 5', 'category_1', 'Category 3',
       'Category2', 'Category _3'], dtype=object)

In [10]:
# so we should combine same categories that have different spelling/syntax
Cat_1 = data[(data.ClassificationLabel == 'Category_1') | (data.ClassificationLabel == 'category_1')]

In [11]:
Cat_2 = data[(data.ClassificationLabel == 'Category_2') | (data.ClassificationLabel == 'Category2')]

In [12]:
Cat_3 = data[(data.ClassificationLabel == 'Category_3') | (data.ClassificationLabel == 'Category 3') | (data.ClassificationLabel == 'Category _3')]

In [13]:
Cat_4 = data[(data.ClassificationLabel == 'Category4')]

In [14]:
Cat_5 = data[(data.ClassificationLabel == 'Category 5')]

In [15]:
Cat_6 = data[(data.ClassificationLabel == 'Categry_6') | (data.ClassificationLabel == 'Category_6')]

Drop NaNs

In [16]:
data.dropna()

Unnamed: 0,Col1,Col3,Col4,Col5,Col6,Col7,ClassificationLabel
0,Word1 Word2 Word3 Word4 Word5,3575.00,Word771 Word772,8/31/2018,Word560 Word561 Word366 Word562,Doc1,Category_1
1,Word6 Word7,1092.50,Word711 Word773 Word774,8/31/2018,Word563 Word366 Word562,NoDoc,Category_2
2,Word8 Word9 Word10 Word11 Word5,75,Word775 Word776,10/31/2018,Word563 Word564 Word9 Word26,NoDoc,Category_1
3,Word12 Word13 Word7,30980.00,Word777 Word45 Word160 Word45 Word778,1/1/2018,Word565 Word566,NoDoc,Category_1
4,Word8 Word9 Word10 Word11 Word5,3766.80,Word779 Word258 Word780,1/1/2018,Word567 Word568 Word569 Word570,Doc1,Category_1
...,...,...,...,...,...,...,...
5894,Word177 Word178 Word147 Word7,16880.00,Word2127 Word705 Word825,3/1/2018,Word576 Word366 Word562,Doc3,Category_1
5895,Word16 Word17 Word18 Word7,4200.00,Word785 Word330 Word99 Word26,6/30/2018,Word563 Word564 Word9 Word26,NoDoc,Category_1
5896,Word212 Word213 Word214 Word125 Word119 Word39,170,Word1018 Word190 Word1403,1/1/2018,Word563 Word608 Word609,NoDoc,Category_2
5897,Word148 Word75 Word5,14376.00,Word846 Word991,4/1/2018,Word560 Word561 Word315 Word638,Doc1,Category_1


In [46]:
documents = []
for i in range(0,len(data)):
  text = data["Col1"][i]
  text = text +' '+ str(data['Col4'][i])
  text = text +' '+ str(data['Col6'][i])
  documents.append(text)

In [47]:
documents[4]

'Word8 Word9 Word10 Word11 Word5 Word779 Word258 Word780 Word567 Word568 Word569 Word570'

In [48]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(documents)

In [20]:
X

<5899x2128 sparse matrix of type '<class 'numpy.float64'>'
	with 59906 stored elements in Compressed Sparse Row format>

In [21]:
y = data['ClassificationLabel']

Check for class imbalance

In [22]:
len(Cat_1)

5215

In [23]:
len(Cat_2)

631

In [24]:
len(Cat_3)

23

In [25]:
len(Cat_4)

16

In [26]:
len(Cat_5)

2

In [27]:
len(Cat_6)

12

As shown above, many more instances of classes 1 and 2 than the rest

Rename Labels with spelling errors

In [28]:
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace=['category_1'], value = 'Category_1')
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace=['Category2'], value = 'Category_2')
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace = ['Category _3'], value = 'Category_3')
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace = ['Category4'], value = 'Category_4')
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace = ['Category 5'], value = 'Category_5')
data['ClassificationLabel'] = data['ClassificationLabel'].replace(to_replace = ['Categry_6'], value = 'Category_6')

In [29]:
data.ClassificationLabel.unique()

array(['Category_1', 'Category_2', 'Category_3', 'Category_6',
       'Category_4', 'Category_5', 'Category 3'], dtype=object)

Try Logistic Regression

In [30]:
def modelvalidation(model,x,y,nfolds=5,rs=123):
  skf = StratifiedKFold(n_splits=nfolds,shuffle=True,random_state=rs)
  accuracies = []
  confusions = []

  for idxtrain, idxtest in tqdm(skf.split(x,y)):
    xtrain = x[idxtrain]
    ytrain = y[idxtrain]
    xtest = x[idxtest]
    ytest = y[idxtest]
    model.fit(xtrain,ytrain)
    accuracies.append(accuracy_score(ytest,model.predict(xtest)))
    print(accuracies)
    confusions.append(confusion_matrix(ytest,model.predict(xtest)))
  return accuracies, confusions

In [31]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)

In [32]:
modelvalidation(model,X,y,nfolds=5)

1it [00:00,  1.19it/s]

[0.9542372881355933]


2it [00:02,  1.10s/it]

[0.9542372881355933, 0.9533898305084746]


3it [00:03,  1.32s/it]

[0.9542372881355933, 0.9533898305084746, 0.9372881355932203]


4it [00:05,  1.48s/it]

[0.9542372881355933, 0.9533898305084746, 0.9372881355932203, 0.9483050847457627]


5it [00:07,  1.41s/it]

[0.9542372881355933, 0.9533898305084746, 0.9372881355932203, 0.9483050847457627, 0.9414758269720102]





([0.9542372881355933,
  0.9533898305084746,
  0.9372881355932203,
  0.9483050847457627,
  0.9414758269720102],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1034,    9,    0,    0,    0],
         [   0,   37,   90,    0,    0,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1033,   10,    0,    0,    0],
         [   0,   37,   89,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1031,   12,    0,    0,    0,    0],
         [   0,   53,   73,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Now lets try Decision Tree Classifier

In [33]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=20,min_samples_leaf=400)

In [34]:
modelvalidation(model,X,y)

2it [00:00, 10.45it/s]

[0.8838983050847458]
[0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


5it [00:00, 13.14it/s]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]





([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Now lets try Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=1000,max_depth=1000,min_samples_leaf=100,random_state=310)

In [36]:
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.8838983050847458]


1it [00:05,  5.71s/it]

[0.8838983050847458, 0.8838983050847458]


2it [00:13,  6.74s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


3it [00:19,  6.57s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


4it [00:22,  5.10s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:25,  5.05s/it]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Now lets try SVM

In [49]:
from sklearn.svm import SVC
model = SVC(kernel='poly',degree=3,C=2.0)

In [50]:
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.9610169491525423]


1it [00:06,  6.28s/it]

[0.9610169491525423, 0.9652542372881356]


2it [00:10,  5.19s/it]

[0.9610169491525423, 0.9652542372881356, 0.961864406779661]


3it [00:15,  4.79s/it]

[0.9610169491525423, 0.9652542372881356, 0.961864406779661, 0.9525423728813559]


4it [00:19,  4.58s/it]

[0.9610169491525423, 0.9652542372881356, 0.961864406779661, 0.9525423728813559, 0.9584393553859203]


5it [00:23,  4.73s/it]


([0.9610169491525423,
  0.9652542372881356,
  0.961864406779661,
  0.9525423728813559,
  0.9584393553859203],
 [array([[   1,    0,    0,    1,    0,    0],
         [   0, 1037,    5,    0,    1,    0],
         [   0,   34,   92,    0,    1,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    1,    0,    0,    2,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1038,    4,    0,    0,    1],
         [   0,   30,   96,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    3,    0,    0,    1,    0],
         [   0,    1,    0,    0,    0,    1]]),
  array([[   1,    0,    0,    1,    0,    0,    0],
         [   0, 1036,    6,    0,    1,    0,    0],
         [   0,   33,   93,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    0,    0,    1,    2,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
    

# Now lets try with concatenating Col7 to the descriptions

In [39]:
documents = []
for i in range(0,len(data)):
  text = data["Col1"][i]
  text = text +' '+ str(data['Col4'][i])
  text = text +' '+ str(data['Col6'][i])
  text = text + ' ' + str(data['Col7'][i])
  documents.append(text)

In [42]:
documents[1]

'Word6 Word7 Word711 Word773 Word774 Word563 Word366 Word562 NoDoc'

In [40]:
X = tfidf.fit_transform(documents)

Logistic Regression

In [41]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)
modelvalidation(model,X,y)

1it [00:01,  1.90s/it]

[0.9550847457627119]


2it [00:03,  1.72s/it]

[0.9550847457627119, 0.9533898305084746]


3it [00:05,  1.73s/it]

[0.9550847457627119, 0.9533898305084746, 0.9372881355932203]


4it [00:07,  1.88s/it]

[0.9550847457627119, 0.9533898305084746, 0.9372881355932203, 0.9491525423728814]


5it [00:09,  1.85s/it]

[0.9550847457627119, 0.9533898305084746, 0.9372881355932203, 0.9491525423728814, 0.9414758269720102]





([0.9550847457627119,
  0.9533898305084746,
  0.9372881355932203,
  0.9491525423728814,
  0.9414758269720102],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1034,    9,    0,    0,    0],
         [   0,   36,   91,    0,    0,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1033,   10,    0,    0,    0],
         [   0,   37,   89,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1031,   12,    0,    0,    0,    0],
         [   0,   53,   73,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Decision Tree

In [43]:
model = DecisionTreeClassifier(max_depth=20,min_samples_leaf=400)
modelvalidation(model,X,y)

1it [00:00,  5.45it/s]

[0.8838983050847458]
[0.8838983050847458, 0.8838983050847458]


4it [00:00,  7.83it/s]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


5it [00:00,  7.27it/s]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]





([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Random Forest

In [44]:
model = RandomForestClassifier(n_estimators=1000,max_depth=1000,min_samples_leaf=100,random_state=310)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.8838983050847458]


1it [00:05,  5.95s/it]

[0.8838983050847458, 0.8838983050847458]


2it [00:10,  5.15s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


3it [00:13,  4.15s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


4it [00:16,  3.68s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:19,  3.90s/it]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

SVM

In [45]:
model = SVC(kernel='poly',degree=3,C=2.0)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.9610169491525423]


1it [00:05,  5.55s/it]

[0.9610169491525423, 0.9652542372881356]


2it [00:10,  5.23s/it]

[0.9610169491525423, 0.9652542372881356, 0.9610169491525423]


3it [00:15,  4.87s/it]

[0.9610169491525423, 0.9652542372881356, 0.9610169491525423, 0.9542372881355933]


4it [00:19,  4.72s/it]

[0.9610169491525423, 0.9652542372881356, 0.9610169491525423, 0.9542372881355933, 0.9592875318066157]


5it [00:24,  4.80s/it]


([0.9610169491525423,
  0.9652542372881356,
  0.9610169491525423,
  0.9542372881355933,
  0.9592875318066157],
 [array([[   1,    0,    0,    1,    0,    0],
         [   0, 1037,    5,    0,    1,    0],
         [   0,   34,   92,    0,    1,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    1,    0,    0,    2,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1038,    4,    0,    0,    1],
         [   0,   30,   96,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    3,    0,    0,    1,    0],
         [   0,    1,    0,    0,    0,    1]]),
  array([[   1,    0,    0,    1,    0,    0,    0],
         [   0, 1036,    6,    0,    1,    0,    0],
         [   0,   34,   92,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    0,    0,    1,    2,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

## Concatenating info from Col7 makes no real difference as shown above, lets try Col 5

In [53]:
documents = []
for i in range(0,len(data)):
  text = data["Col1"][i]
  text = text +' '+ str(data['Col4'][i])
  text = text +' '+ str(data['Col6'][i])
  text = text + ' ' + str(data['Col5'][i])
  documents.append(text)

In [54]:
documents[1]

'Word6 Word7 Word711 Word773 Word774 Word563 Word366 Word562 8/31/2018'

In [55]:
X = tfidf.fit_transform(documents)

Logistic Regression

In [56]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)
modelvalidation(model,X,y)

1it [00:01,  1.23s/it]

[0.9550847457627119]


2it [00:02,  1.27s/it]

[0.9550847457627119, 0.9542372881355933]


3it [00:03,  1.29s/it]

[0.9550847457627119, 0.9542372881355933, 0.9364406779661016]


4it [00:05,  1.40s/it]

[0.9550847457627119, 0.9542372881355933, 0.9364406779661016, 0.95]


5it [00:06,  1.31s/it]

[0.9550847457627119, 0.9542372881355933, 0.9364406779661016, 0.95, 0.9457167090754877]





([0.9550847457627119,
  0.9542372881355933,
  0.9364406779661016,
  0.95,
  0.9457167090754877],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1033,   10,    0,    0,    0],
         [   0,   35,   92,    0,    0,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1034,    9,    0,    0,    0],
         [   0,   37,   89,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1031,   12,    0,    0,    0,    0],
         [   0,   54,   72,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
         [   0,  

Decision Tree

In [57]:
model = DecisionTreeClassifier(max_depth=20,min_samples_leaf=400)
modelvalidation(model,X,y)

3it [00:00, 23.21it/s]

[0.8838983050847458]
[0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:00, 19.96it/s]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Random Forest

In [58]:
model = RandomForestClassifier(n_estimators=1000,max_depth=1000,min_samples_leaf=100,random_state=310)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.8838983050847458]


1it [00:05,  5.01s/it]

[0.8838983050847458, 0.8838983050847458]


2it [00:08,  4.37s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


3it [00:11,  3.76s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


4it [00:14,  3.47s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:17,  3.60s/it]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

SVM

In [59]:
model = SVC(kernel='poly',degree=3,C=2.0)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.9601694915254237]


1it [00:06,  6.16s/it]

[0.9601694915254237, 0.9635593220338983]


2it [00:11,  5.55s/it]

[0.9601694915254237, 0.9635593220338983, 0.9635593220338983]


3it [00:16,  5.58s/it]

[0.9601694915254237, 0.9635593220338983, 0.9635593220338983, 0.9550847457627119]


4it [00:22,  5.42s/it]

[0.9601694915254237, 0.9635593220338983, 0.9635593220338983, 0.9550847457627119, 0.9584393553859203]


5it [00:27,  5.43s/it]


([0.9601694915254237,
  0.9635593220338983,
  0.9635593220338983,
  0.9550847457627119,
  0.9584393553859203],
 [array([[   1,    0,    0,    1,    0,    0],
         [   0, 1036,    6,    0,    1,    0],
         [   0,   34,   92,    0,    1,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    1,    0,    0,    2,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1035,    7,    0,    0,    1],
         [   0,   28,   98,    0,    0,    0],
         [   1,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    1,    0],
         [   0,    1,    0,    0,    0,    1]]),
  array([[   1,    0,    0,    1,    0,    0,    0],
         [   0, 1036,    6,    0,    1,    0,    0],
         [   0,   31,   95,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    0,    0,    1,    2,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

# As shown above, concatenating Col5 makes no real difference as well, so lets try Col3

In [61]:
documents = []
for i in range(0,len(data)):
  text = data["Col1"][i]
  text = text +' '+ str(data['Col4'][i])
  text = text +' '+ str(data['Col6'][i])
  text = text + ' ' + str(data['Col3'][i])
  documents.append(text)

In [62]:
documents[1]

'Word6 Word7 Word711 Word773 Word774 Word563 Word366 Word562 1,092.50'

In [63]:
X = tfidf.fit_transform(documents)

Logistic Regression

In [64]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)
modelvalidation(model,X,y)

1it [00:01,  1.40s/it]

[0.9533898305084746]


2it [00:02,  1.33s/it]

[0.9533898305084746, 0.9525423728813559]


3it [00:04,  1.46s/it]

[0.9533898305084746, 0.9525423728813559, 0.9398305084745763]


4it [00:05,  1.38s/it]

[0.9533898305084746, 0.9525423728813559, 0.9398305084745763, 0.9516949152542373]


5it [00:06,  1.30s/it]

[0.9533898305084746, 0.9525423728813559, 0.9398305084745763, 0.9516949152542373, 0.9423240033927057]





([0.9533898305084746,
  0.9525423728813559,
  0.9398305084745763,
  0.9516949152542373,
  0.9423240033927057],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1035,    8,    0,    0,    0],
         [   0,   39,   88,    0,    0,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1034,    9,    0,    0,    0],
         [   0,   39,   87,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1033,   10,    0,    0,    0,    0],
         [   0,   52,   74,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Decision Tree

In [65]:
model = DecisionTreeClassifier(max_depth=20,min_samples_leaf=400)
modelvalidation(model,X,y)

2it [00:00, 14.39it/s]

[0.8838983050847458]
[0.8838983050847458, 0.8966101694915254]
[0.8838983050847458, 0.8966101694915254, 0.8838983050847458]


5it [00:00, 13.31it/s]

[0.8838983050847458, 0.8966101694915254, 0.8838983050847458, 0.8966101694915254]
[0.8838983050847458, 0.8966101694915254, 0.8838983050847458, 0.8966101694915254, 0.8846480067854113]





([0.8838983050847458,
  0.8966101694915254,
  0.8838983050847458,
  0.8966101694915254,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[  0,   2,   0,   0,   0,   0],
         [  0, 989,  54,   0,   0,   0],
         [  0,  57,  69,   0,   0,   0],
         [  0,   3,   0,   0,   0,   0],
         [  0,   4,   0,   0,   0,   0],
         [  0,   2,   0,   0,   0,   0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,

Random Forest

In [66]:
model = RandomForestClassifier(n_estimators=1000,max_depth=1000,min_samples_leaf=100,random_state=310)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.8838983050847458]


1it [00:04,  4.33s/it]

[0.8838983050847458, 0.8838983050847458]


2it [00:07,  3.43s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


3it [00:10,  3.18s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


4it [00:12,  3.03s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:16,  3.25s/it]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

SVM

In [67]:
model = SVC(kernel='poly',degree=3,C=2.0)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.9584745762711865]


1it [00:11, 11.10s/it]

[0.9584745762711865, 0.961864406779661]


2it [00:20,  9.87s/it]

[0.9584745762711865, 0.961864406779661, 0.9593220338983051]


3it [00:29,  9.46s/it]

[0.9584745762711865, 0.961864406779661, 0.9593220338983051, 0.9559322033898305]


4it [00:38,  9.45s/it]

[0.9584745762711865, 0.961864406779661, 0.9593220338983051, 0.9559322033898305, 0.9516539440203562]


5it [00:47,  9.53s/it]


([0.9584745762711865,
  0.961864406779661,
  0.9593220338983051,
  0.9559322033898305,
  0.9516539440203562],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1040,    3,    0,    0,    0],
         [   0,   38,   89,    0,    0,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1039,    4,    0,    0,    0],
         [   0,   34,   92,    0,    0,    0],
         [   0,    0,    0,    3,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    1]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1040,    3,    0,    0,    0,    0],
         [   0,   38,   88,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    0,    0,    1,    2,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
    

# Again, Col3 makes no real difference in classification accuracy. Finally, we will try Col2

In [70]:
documents = []
for i in range(0,len(data)):
  text = data["Col1"][i]
  text = text +' '+ str(data['Col4'][i])
  text = text +' '+ str(data['Col6'][i])
  text = text + ' ' + str(data['Col2'][i])
  documents.append(text)

In [71]:
documents[1]

'Word6 Word7 Word711 Word773 Word774 Word563 Word366 Word562 10719'

In [72]:
X = tfidf.fit_transform(documents)

Logistic Regression

In [73]:
model = LogisticRegression(solver='lbfgs', max_iter=10000)
modelvalidation(model,X,y)

1it [00:02,  2.75s/it]

[0.9508474576271186]


2it [00:05,  2.67s/it]

[0.9508474576271186, 0.9483050847457627]


3it [00:07,  2.53s/it]

[0.9508474576271186, 0.9483050847457627, 0.9364406779661016]


4it [00:09,  2.40s/it]

[0.9508474576271186, 0.9483050847457627, 0.9364406779661016, 0.9533898305084746]


5it [00:12,  2.50s/it]

[0.9508474576271186, 0.9483050847457627, 0.9364406779661016, 0.9533898305084746, 0.9397794741306191]





([0.9508474576271186,
  0.9483050847457627,
  0.9364406779661016,
  0.9533898305084746,
  0.9397794741306191],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1037,    6,    0,    0,    0],
         [   0,   42,   85,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1034,    9,    0,    0,    0],
         [   0,   41,   85,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1032,   11,    0,    0,    0,    0],
         [   0,   53,   73,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Decision Tree

In [74]:
model = DecisionTreeClassifier(max_depth=20,min_samples_leaf=400)
modelvalidation(model,X,y)

2it [00:00,  8.86it/s]

[0.8838983050847458]
[0.8838983050847458, 0.8838983050847458]


3it [00:00,  9.31it/s]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]
[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:00,  9.42it/s]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

Random Forest

In [75]:
model = RandomForestClassifier(n_estimators=1000,max_depth=1000,min_samples_leaf=100,random_state=310)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.8838983050847458]


1it [00:04,  4.84s/it]

[0.8838983050847458, 0.8838983050847458]


2it [00:07,  3.68s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


3it [00:10,  3.29s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458]


4it [00:13,  3.13s/it]

[0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8838983050847458, 0.8846480067854113]


5it [00:16,  3.26s/it]


([0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8838983050847458,
  0.8846480067854113],
 [array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  127,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    4,    0,    0,    0,    0],
         [   0,    2,    0,    0,    0,    0]]),
  array([[   0,    2,    0,    0,    0,    0,    0],
         [   0, 1043,    0,    0,    0,    0,    0],
         [   0,  126,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

SVM

In [76]:
model = SVC(kernel='poly',degree=3,C=2.0)
modelvalidation(model,X,y)

0it [00:00, ?it/s]

[0.9576271186440678]


1it [00:12, 12.53s/it]

[0.9576271186440678, 0.9593220338983051]


2it [00:22, 11.09s/it]

[0.9576271186440678, 0.9593220338983051, 0.9584745762711865]


3it [00:32, 10.35s/it]

[0.9576271186440678, 0.9593220338983051, 0.9584745762711865, 0.9542372881355933]


4it [00:43, 10.62s/it]

[0.9576271186440678, 0.9593220338983051, 0.9584745762711865, 0.9542372881355933, 0.9516539440203562]


5it [00:52, 10.58s/it]


([0.9576271186440678,
  0.9593220338983051,
  0.9584745762711865,
  0.9542372881355933,
  0.9516539440203562],
 [array([[   0,    1,    0,    1,    0,    0],
         [   0, 1039,    4,    0,    0,    0],
         [   0,   37,   89,    0,    1,    0],
         [   0,    0,    0,    2,    0,    0],
         [   0,    3,    0,    0,    0,    0],
         [   0,    3,    0,    0,    0,    0]]),
  array([[   0,    1,    0,    1,    0,    0],
         [   0, 1040,    3,    0,    0,    0],
         [   0,   37,   89,    0,    0,    0],
         [   1,    0,    0,    2,    0,    0],
         [   0,    3,    1,    0,    0,    0],
         [   0,    1,    0,    0,    0,    1]]),
  array([[   0,    1,    0,    1,    0,    0,    0],
         [   0, 1039,    3,    0,    1,    0,    0],
         [   0,   39,   87,    0,    0,    0,    0],
         [   0,    1,    0,    2,    0,    0,    0],
         [   0,    1,    0,    0,    2,    0,    0],
         [   0,    1,    0,    0,    0,    0,    0],
   

# Again, Col2 makes no real difference in classification accuracy

# The best model is SVM with the text from Col1, Col4, and Col6 as it is the most accurate (mean accuracy $\geq$ 0.96) without having unneccesary information in the classification