In [1]:
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import normalize
from scipy.sparse import hstack

from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics.classification import accuracy_score, log_loss
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import os
import gensim
nltk.download('punkt')

from collections import defaultdict

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from sklearn import utils
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics import classification_report


## Getting the Data ( preprocessed )

In [0]:
df = pd.read_csv('/content/gdrive/My Drive/DataSet_d2v.csv')

In [7]:
df.head()

Unnamed: 0,ID,Gene,Variation,Class,TEXT,Com
0,0,fam58a,truncating mutations,1,cyclin-dependent kinases cdks regulate variety...,fam58a truncating mutations cyclin-dependent k...
1,1,cbl,w802*,2,abstract background non-small cell lung cancer...,cbl w802* abstract background non-small cell l...
2,2,cbl,q249e,2,abstract background non-small cell lung cancer...,cbl q249e abstract background non-small cell l...
3,3,cbl,n454d,3,recent evidence demonstrated acquired uniparen...,cbl n454d recent evidence demonstrated acquire...
4,4,cbl,l399v,4,oncogenic mutations monomeric casitas b-lineag...,cbl l399v oncogenic mutations monomeric casita...


## Forming Tagged Document suitable for DOC2VEC training 

In [0]:
def label_sentences(corpus, label_type):
    """
    Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
    We do this by using the TaggedDocument method. The format will be "TRAIN_i" or "TEST_i" where "i" is
    a dummy index of the post.
    """
    labeled = []
    for i, v in enumerate(corpus):
        label = label_type + '_' + str(i)
        labeled.append( TaggedDocument(v.split(), [label]))
    return labeled
X_train, X_test, y_train, y_test = train_test_split(df.Com, df.Class, random_state=0, test_size=0.2)
X_train = label_sentences(X_train, 'Train')
X_test = label_sentences(X_test, 'Test')
all_data = X_train + X_test

### Tagged Documents

In [0]:
all_data[:2]

[TaggedDocument(words=['nf1', 'r1276p', 'rasgaps', 'supply', 'catalytic', 'residue', 'termed', 'arginine', 'finger', 'active', 'site', 'ras', 'thereby', 'stabilizing', 'transition', 'state', 'gtpase', 'reaction', 'increasing', 'reaction', 'rate', 'one', 'thousand-fold', 'good', 'agreement', 'structure', 'ras', 'rasgap', 'complex', 'three-dimensional', 'structure', 'complex', 'human', 'h-ras', 'bound', 'guanosine', 'diphosphate', 'guanosine', 'triphosphatase', 'gtpase', '-activating', 'domain', 'human', 'gtpase-activating', 'protein', 'p1', '20gap', 'gap-334', 'presence', 'aluminum', 'fluoride', 'solved', 'resolution', '2', '5', 'angstroms', 'structure', 'shows', 'partly', 'hydrophilic', 'partly', 'hydrophobic', 'nature', 'communication', 'two', 'mol-', 'ecules', 'explains', 'sensitivity', 'interaction', 'toward', 'salts', 'lipids', 'arginine', 'side', 'chain', 'arginine-789', 'gap-334', 'supplied', 'active', 'site', 'ras', 'neutralize', 'developing', 'charges', 'transition', 'state', '

## DOC2VEC Model
      Defining the Model
      Building the Vocabolary
      Training the Model

In [0]:
model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)
model_dbow.build_vocab([x for x in tqdm(all_data)])

for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)
    model_dbow.alpha -= 0.002
model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 3321/3321 [00:00<00:00, 937998.89it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1704722.01it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1684722.25it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1388623.63it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1841280.05it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1768797.92it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1863948.02it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1749250.73it/s]
100%|██████████| 3321/3321 [00:00<00:00, 2016690.83it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1819869.82it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1800579.57it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1923668.50it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1921015.53it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1574108.21it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1773753.16it/s]
100%|██████████| 3321/3321 [00:00<00:00, 846764.96it/s]
100%|██████████| 3321/3321 [00:00<00:00, 1765882.81it/s]
100%|██████████| 3321/3321 [00:00

### Getting the Transformed Vectors for Trainde Model

In [0]:
def get_vectors(model, corpus_size, vectors_size, vectors_type):
    """
    Get vectors from trained doc2vec model
    :param doc2vec_model: Trained Doc2Vec model
    :param corpus_size: Size of the data
    :param vectors_size: Size of the embedding vectors
    :param vectors_type: Training or Testing vectors
    :return: list of vectors
    """
    vectors = np.zeros((corpus_size, vectors_size))
    for i in range(0, corpus_size):
        prefix = vectors_type + '_' + str(i)
        vectors[i] = model.docvecs[prefix]
    return vectors
    
train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')
test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')

In [0]:
my_tags = ['1','2','3','4','5','6','7','8','9']

In [0]:
def predict_and_plot_confusion_matrix(train_x, train_y, test_x, test_y, clf):
    
    clf.fit(train_x, train_y)
    
    print('Training Done')
    sig_clf = CalibratedClassifierCV(clf)
    sig_clf.fit(train_x, train_y)
    
    
    print('Calibration Done')
    pred_y = sig_clf.predict(test_x)
    
    
    print(pred_y.shape)
    print(type(pred_y))
    # for calculating log_loss we will provide the array of probabilities belongs to each class
    print("Log loss :",log_loss(test_y, sig_clf.predict_proba(test_x)))
    print("Actual :", test_y[0])
    print("Probability :",np.round(sig_clf.predict_proba(test_x[0].reshape(1, -1) ),4 ))
    print("Predicted :", sig_clf.predict(test_x[0].reshape(1, -1) ))
    # calculating the number of data points that are misclassified
    # print("Number of mis-classified points :", np.count_nonzero((pred_y- test_y))/test_y.shape[0], 1-np.count_nonzero((pred_y- test_y))/test_y.shape[0])
    print(confusion_matrix(test_y, pred_y))
    print("Score :",sig_clf.score(test_x, test_y))

In [0]:
type(train_vectors_dbow)

numpy.ndarray

In [0]:
train_vectors_dbow.shape

(2656, 300)

In [0]:
test_vectors_dbow.shape

(665, 300)

In [0]:
min( [ min(vec) for vec in train_vectors_dbow] )

## Normalization of Vectors

In [0]:
all_data = np.concatenate((train_vectors_dbow , test_vectors_dbow), axis = 0 )
all_data.shape

(3321, 300)

In [0]:
all_data  = normalize( all_data, axis = 0)

In [0]:
min_val = min( [ min(vec) for vec in all_data ])

In [0]:
for i in range(3321):
  for j in range(300):
    all_data[i][j] += abs(min_val)

In [0]:
min( [ min(vec) for vec in all_data ])

0.0

In [0]:
train_x = all_data[:2656]
test_x = all_data[2656:]

print(train_x.shape)
print(test_x.shape)

(2656, 300)
(665, 300)


In [0]:
print( type( y_train))
print( y_train.shape)

<class 'pandas.core.series.Series'>
(2656,)


In [0]:
test_y = np.array( y_test)

In [0]:
test_y[2]

2

# Classification Models

### Logistic Regression

In [0]:
clf = LogisticRegression(class_weight='balanced', solver = 'newton-cg')
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.396964289001349
Actual : 7
Probability : [[0.0111 0.3848 0.009  0.0171 0.0293 0.028  0.5156 0.0018 0.0033]]
Predicted : [7]
[[ 44   3   0  29   3   2  22   0   0]
 [  7  10   0   7   0   1  67   0   0]
 [  5   0   5   7   1   0   6   0   0]
 [ 36   0   1  75   2   3  27   0   0]
 [ 10   1   1  10  20   1  16   0   1]
 [  6   1   3   6   1  23  11   0   0]
 [ 12   6   0  19   0   1 145   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  1   0   0   0   0   0   2   0   3]]
Score : 0.48872180451127817


In [0]:
clf = LogisticRegression(class_weight='balanced')
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4198384921480467
Actual : 7
Probability : [[0.0158 0.3631 0.0117 0.0232 0.0291 0.0266 0.5249 0.0017 0.004 ]]
Predicted : [7]
[[ 44   3   0  30   3   2  21   0   0]
 [  8  10   0   7   0   0  67   0   0]
 [  5   0   5   7   1   0   6   0   0]
 [ 36   0   1  75   0   4  28   0   0]
 [ 11   1   1  10  19   1  16   0   1]
 [  8   1   2   4   0  22  14   0   0]
 [ 13   4   0  19   0   1 146   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  2   0   0   0   0   0   2   0   2]]
Score : 0.4857142857142857


In [0]:
clf = LogisticRegression(class_weight='balanced', solver = 'saga', C=1e5)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.430625587733641
Actual : 7
Probability : [[0.0236 0.4089 0.0078 0.0138 0.0436 0.1208 0.3735 0.0026 0.0054]]
Predicted : [2]
[[ 48   0   0  24   3   2  26   0   0]
 [  9  11   0   9   0   0  63   0   0]
 [  4   0   5   7   1   0   7   0   0]
 [ 38   3   1  76   0   3  23   0   0]
 [ 11   3   1   9  18   1  17   0   0]
 [  6   0   3   8   0  26   8   0   0]
 [ 11   4   3  21   0   4 140   0   0]
 [  0   0   0   1   0   0   1   0   0]
 [  0   0   0   1   0   0   2   0   3]]
Score : 0.4917293233082707


### Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=1000, oob_score=True)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done




Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.3362977848425768
Actual : 7
Probability : [[0.099  0.2634 0.0134 0.0933 0.063  0.0449 0.4025 0.0058 0.0147]]
Predicted : [7]
[[ 54   3   0  21   0   1  24   0   0]
 [  3  22   0   6   0   0  61   0   0]
 [  4   0   5   6   1   0   8   0   0]
 [ 24   1   1  95   1   1  20   0   1]
 [ 16   0   1  10  12   0  19   0   2]
 [  8   1   3   5   2  22  10   0   0]
 [  7  11   1  12   2   0 150   0   0]
 [  0   1   0   1   0   0   0   0   0]
 [  1   0   0   0   0   0   3   0   2]]
Score : 0.544360902255639


In [0]:
import warnings
warnings.filterwarnings("ignore")

### Supoort Vector Classifier

In [0]:
clf = SGDClassifier(alpha=0.001, penalty='l2', loss='hinge', random_state=123 ,max_iter=10000, tol=0.00001)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4374537753331595
Actual : 7
Probability : [[0.0216 0.3957 0.0067 0.026  0.0378 0.0322 0.4747 0.001  0.0043]]
Predicted : [7]
[[ 57   4   0  23   0   1  18   0   0]
 [  8  15   0   7   0   1  61   0   0]
 [  6   0   5   5   1   0   7   0   0]
 [ 43   2   1  71   1   2  24   0   0]
 [ 18   1   1  11  12   1  16   0   0]
 [ 10   0   3   7   0  21  10   0   0]
 [ 14   6   0  15   0   1 147   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  2   0   0   0   0   0   1   0   3]]
Score : 0.49774436090225566


In [0]:
from sklearn.svm import SVC
clf =  SVC(kernel="linear", C=1)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.54350262438776
Actual : 7
Probability : [[0.0211 0.32   0.0017 0.0921 0.0196 0.0364 0.509  0.     0.    ]]
Predicted : [7]
[[ 19   0   0  23   1   0  60   0   0]
 [  1   0   0   2   0   0  89   0   0]
 [  2   0   4   5   1   0  12   0   0]
 [ 14   0   0  62   1   1  66   0   0]
 [  5   0   0  10  19   0  26   0   0]
 [  4   0   3  11   1  12  20   0   0]
 [  2   0   0   6   0   0 175   0   0]
 [  0   0   0   1   0   0   1   0   0]
 [  0   0   0   0   0   0   5   0   1]]
Score : 0.43909774436090226


### Ensemble Method Extra Tree Classifier

In [0]:
from sklearn.ensemble.forest import ExtraTreesClassifier
clf = ExtraTreesClassifier( n_estimators= 1000,  max_features= None,bootstrap=True, oob_score= True, class_weight='balanced', n_jobs= -1) 
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.3415641029951957
Actual : 7
Probability : [[0.0898 0.2862 0.0142 0.0971 0.0513 0.0498 0.3955 0.0069 0.0093]]
Predicted : [7]
[[ 57   3   0  13   0   1  29   0   0]
 [  2  24   0   3   0   2  61   0   0]
 [  2   0   6   7   2   0   7   0   0]
 [ 26   0   1  90   1   1  25   0   0]
 [ 18   0   1  11  10   0  19   0   1]
 [  6   2   3   7   2  22   9   0   0]
 [  6  11   1   7   2   1 155   0   0]
 [  0   1   0   1   0   0   0   0   0]
 [  1   0   0   0   0   0   3   0   2]]
Score : 0.5503759398496241


### Multinomial Naive Bayes

In [0]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4412359144579792
Actual : 7
Probability : [[0.0162 0.3501 0.014  0.0241 0.0282 0.0265 0.5345 0.0019 0.0045]]
Predicted : [7]
[[ 43   2   0  31   1   1  24   0   1]
 [  8  10   0   8   0   0  66   0   0]
 [  7   0   3   6   1   0   7   0   0]
 [ 38   0   0  74   0   4  28   0   0]
 [ 13   0   0  11  18   2  16   0   0]
 [  8   1   1   3   0  22  16   0   0]
 [ 13   3   0  18   0   1 148   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  2   0   0   0   0   0   2   0   2]]
Score : 0.48120300751879697


# Other Classification Models

     
    ExtraTreeClassifier( 55.03)
    LinearDiscriminantAnalysis( 49.32)
    QuadraticDiscriminantAnalysis( 44.66)
    XGBoost( 49.92)
    KNeighborsClassifier( 55.48)
    Perceptron( 50.97)
    MultiLayerPerceptronClassifier( 49.47)


In [0]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
clf = QuadraticDiscriminantAnalysis()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.6952471727629512
Actual : 7
Probability : [[0.149  0.1925 0.036  0.1577 0.0981 0.1204 0.2176 0.0106 0.0181]]
Predicted : [7]
[[ 37   6   0  29   0   0  31   0   0]
 [ 11   8   0   2   0   0  71   0   0]
 [  3   1   0   8   0   0  12   0   0]
 [ 13   7   0  97   0   0  27   0   0]
 [ 22   0   0  17   0   0  21   0   0]
 [ 12   6   0  13   0   0  20   0   0]
 [ 15   8   0   5   0   0 155   0   0]
 [  0   0   0   0   0   0   2   0   0]
 [  1   0   0   1   0   0   4   0   0]]
Score : 0.44661654135338347


In [0]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.8521920655202888
Actual : 7
Probability : [[0.175  0.1355 0.0246 0.2036 0.0685 0.0843 0.2898 0.0067 0.0118]]
Predicted : [7]
[[  0   0   0   0   0   0 103   0   0]
 [  0   0   0   0   0   0  92   0   0]
 [  0   0   0   0   0   0  24   0   0]
 [  0   0   0   0   0   0 144   0   0]
 [  0   0   0   0   0   0  60   0   0]
 [  0   0   0   0   0   0  51   0   0]
 [  0   0   0   0   0   0 183   0   0]
 [  0   0   0   0   0   0   2   0   0]
 [  0   0   0   0   0   0   6   0   0]]
Score : 0.27518796992481204


In [0]:
from sklearn.naive_bayes import ComplementNB
clf = ComplementNB()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.434463944747063
Actual : 7
Probability : [[0.0163 0.3507 0.0112 0.024  0.0273 0.0254 0.5415 0.001  0.0026]]
Predicted : [7]
[[ 41   3   0  30   1   1  27   0   0]
 [  7  11   0   8   0   0  66   0   0]
 [  5   0   5   6   1   0   7   0   0]
 [ 37   1   1  74   0   4  27   0   0]
 [ 13   0   0  10  18   2  16   0   1]
 [  8   1   2   2   0  22  16   0   0]
 [ 12   3   0  19   0   1 148   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  1   0   0   0   0   0   2   0   3]]
Score : 0.4842105263157895


In [0]:
from xgboost import XGBClassifier
clf = XGBClassifier(max_depth=4,objective='multi:softprob',learning_rate=0.03333)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4702960243821772
Actual : 7
Probability : [[0.1117 0.3416 0.0215 0.1631 0.0877 0.0467 0.2094 0.0067 0.0116]]
Predicted : [2]
[[ 49   2   0  19   0   2  31   0   0]
 [  4  12   0  13   0   2  61   0   0]
 [  4   0   4   7   1   0   8   0   0]
 [ 31   2   1  81   0   1  28   0   0]
 [ 10   1   2  13  17   0  17   0   0]
 [ 10   0   3   5   3  19  11   0   0]
 [ 15   4   0  11   2   2 149   0   0]
 [  0   0   0   0   0   0   2   0   0]
 [  0   0   0   1   0   0   4   0   1]]
Score : 0.4992481203007519


In [0]:
from sklearn.ensemble.bagging import BaggingClassifier
clf = BaggingClassifier()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.6369059227411444
Actual : 7
Probability : [[0.1194 0.1335 0.0261 0.2497 0.0512 0.06   0.3427 0.0069 0.0105]]
Predicted : [7]
[[ 33   1   0  16   0   0  53   0   0]
 [ 10   1   0   2   0   3  76   0   0]
 [  3   0   1   9   1   0  10   0   0]
 [ 20   0   0  72   0   1  51   0   0]
 [ 15   0   1   9   7   1  27   0   0]
 [  9   0   0   9   0  18  15   0   0]
 [ 14   1   0  15   1   0 152   0   0]
 [  0   1   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   6   0   0]]
Score : 0.4270676691729323


In [0]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.442959858850288
Actual : 7
Probability : [[0.0389 0.42   0.0048 0.0099 0.0205 0.0884 0.4088 0.0021 0.0066]]
Predicted : [2]
[[ 49   1   0  26   1   3  23   0   0]
 [  8   9   0  10   0   0  65   0   0]
 [  5   0   4   6   1   0   8   0   0]
 [ 38   3   1  79   1   2  20   0   0]
 [ 17   2   1   8  14   0  18   0   0]
 [  9   0   3   8   0  23   8   0   0]
 [ 10   2   0  20   0   3 148   0   0]
 [  0   0   0   1   0   0   1   0   0]
 [  1   0   0   0   0   0   3   0   2]]
Score : 0.4932330827067669


In [0]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.3485624973835162
Actual : 7
Probability : [[0.1432 0.3195 0.0134 0.1052 0.0651 0.0573 0.2805 0.0065 0.0093]]
Predicted : [2]
[[ 56   2   0  18   2   2  22   0   1]
 [  5  17   0   1   0   1  68   0   0]
 [  1   0   7   4   2   1   9   0   0]
 [ 23   1   2  95   2   1  20   0   0]
 [ 18   1   1  11  12   1  16   0   0]
 [  9   0   3   4   2  24   9   0   0]
 [ 11  12   1   3   2   1 153   0   0]
 [  0   1   0   0   0   0   1   0   0]
 [  0   0   0   0   0   0   1   0   5]]
Score : 0.5548872180451128


In [0]:
from sklearn.linear_model import Perceptron
clf = Perceptron(max_iter=10000, tol=0.00001)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4090203807677415
Actual : 7
Probability : [[0.0195 0.4316 0.0084 0.0112 0.0419 0.0582 0.4236 0.0015 0.0042]]
Predicted : [2]
[[ 49   2   0  31   2   1  18   0   0]
 [  9  12   0   6   0   1  64   0   0]
 [  5   0   5   7   1   0   6   0   0]
 [ 37   2   1  79   1   1  23   0   0]
 [ 11   2   1  11  17   1  17   0   0]
 [  5   1   3   7   1  26   8   0   0]
 [ 11   5   0  15   0   4 148   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  1   0   0   1   0   0   1   0   3]]
Score : 0.5097744360902255


In [0]:
from sklearn.neural_network.multilayer_perceptron import MLPClassifier
clf = MLPClassifier(hidden_layer_sizes=(500,300,100), max_iter=1000, alpha=0.0001,solver='adam', verbose=10,  random_state=21, tol=0.000000001)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Iteration 1, loss = 1.95291406
Iteration 2, loss = 1.83411986
Iteration 3, loss = 1.82984411
Iteration 4, loss = 1.82769588
Iteration 5, loss = 1.82321827
Iteration 6, loss = 1.82147999
Iteration 7, loss = 1.81902003
Iteration 8, loss = 1.81286373
Iteration 9, loss = 1.81016312
Iteration 10, loss = 1.79760679
Iteration 11, loss = 1.78163335
Iteration 12, loss = 1.76353973
Iteration 13, loss = 1.73346741
Iteration 14, loss = 1.69118335
Iteration 15, loss = 1.63565266
Iteration 16, loss = 1.60745887
Iteration 17, loss = 1.54582215
Iteration 18, loss = 1.53515562
Iteration 19, loss = 1.52350955
Iteration 20, loss = 1.49219230
Iteration 21, loss = 1.46865165
Iteration 22, loss = 1.44629163
Iteration 23, loss = 1.44294235
Iteration 24, loss = 1.42759388
Iteration 25, loss = 1.43300045
Iteration 26, loss = 1.43435465
Iteration 27, loss = 1.40706683
Iteration 28, loss = 1.44554948
Iteration 29, loss = 1.43023179
Iteration 30, loss = 1.39611950
Iteration 31, loss = 1.37193822
Iteration 32, los

In [0]:
clf = SGDClassifier(alpha=0.001, penalty='l2', loss='log', random_state=123 ,max_iter=10000, tol=0.00001)
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4232328070294782
Actual : 7
Probability : [[0.014  0.3576 0.0141 0.0218 0.0287 0.0253 0.5315 0.0023 0.0048]]
Predicted : [7]
[[ 43   3   0  31   3   2  21   0   0]
 [  8  10   0   7   0   0  67   0   0]
 [  6   0   4   7   1   0   6   0   0]
 [ 38   0   1  75   0   3  27   0   0]
 [ 10   1   0  13  18   2  16   0   0]
 [  8   1   2   5   0  22  13   0   0]
 [ 12   4   0  19   0   1 147   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  2   0   0   0   0   0   2   0   2]]
Score : 0.48270676691729325


In [0]:
from sklearn.linear_model import LogisticRegressionCV
clf =  LogisticRegressionCV()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.4047771684377421
Actual : 7
Probability : [[0.0157 0.4427 0.0092 0.0124 0.0314 0.0795 0.3988 0.0051 0.0052]]
Predicted : [2]
[[ 52   2   0  27   2   1  19   0   0]
 [  9  15   0   7   0   1  60   0   0]
 [  4   0   5   8   1   0   6   0   0]
 [ 38   3   1  74   2   2  24   0   0]
 [ 10   2   1  11  18   1  17   0   0]
 [  6   0   3   7   1  26   8   0   0]
 [ 12   5   1  16   0   2 147   0   0]
 [  0   0   0   1   0   0   1   0   0]
 [  1   0   0   0   0   0   2   0   3]]
Score : 0.5112781954887218


In [0]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
predict_and_plot_confusion_matrix(train_x, y_train, test_x, test_y, clf)

Training Done
Calibration Done
(665,)
<class 'numpy.ndarray'>
Log loss : 1.3986233675567277
Actual : 7
Probability : [[0.017  0.4358 0.0075 0.0141 0.0325 0.0424 0.4443 0.0017 0.0048]]
Predicted : [7]
[[ 52   3   0  27   1   2  18   0   0]
 [  8  18   0   5   0   1  60   0   0]
 [  6   0   5   6   1   0   6   0   0]
 [ 37   3   1  78   1   2  22   0   0]
 [ 13   1   1   9  19   1  16   0   0]
 [  6   1   3   7   1  25   8   0   0]
 [ 14   5   0  14   0   1 149   0   0]
 [  0   0   0   2   0   0   0   0   0]
 [  1   0   0   0   0   0   2   0   3]]
Score : 0.524812030075188
