## Classification with sklearn


In [163]:
import numpy as np
import pandas as pd
import re 

## loading a dataset

In [1]:
import urllib.request
import os

def download_file(url,local_file, force=False):
    """
    Helper function to download a file and store it locally
    """
    if not os.path.exists(local_file) or force:
        print('Downloading',url,'to',local_file)
        with urllib.request.urlopen(url) as opener, \
             open(local_file, mode='w', encoding='utf-8') as outfile:
                    outfile.write(opener.read().decode('utf-8'))
    else:
        print(local_file,'already downloaded')

In [2]:
train_file = 'news_en_train.txt'
train_url='http://www.esuli.it/demo/data/news_en_train.csv'
test_file = 'news_en_test.txt'
test_url = 'http://www.esuli.it/demo/data/news_en_test.csv'
delimiter = ','

download_file(train_url, train_file)
download_file(test_url, test_file)

Downloading http://www.esuli.it/demo/data/news_en_train.csv to news_en_train.txt
Downloading http://www.esuli.it/demo/data/news_en_test.csv to news_en_test.txt


In [14]:
import csv
x_train = []
y_train = []
with open(train_file, encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile, delimiter=delimiter)
    for row in reader:
        print(row)
        x_train.append(row[1])
        y_train.append(row[1])

x_test = list()
y_test = list()
with open(test_file, encoding='utf-8', newline='') as infile:
    reader = csv.reader(infile, delimiter=delimiter)
    for row in reader:
        x_test.append(row[0])
        y_test.append(row[1])


['Nigerian women\'s bobsled team make Winter Olympic history\r\r\nUpdated 1752 GMT (0152 HKT) November 17, 2017\r\r\nOlympic flame arrives in Seoul\r\r\nOlympic flame arrives in Seoul\r\r\nNigeria women\'s bobsled team qualify for Winter Olympics\r\r\nThey will be Africa\'s first ever Olympic representatives in the sport\r\r\nNo Nigerian has competed in a Winter Olympic event before\r\r\ncaptured hearts around the world, three women from Nigeria stand on the brink of making history of their own in the same event.\r\r\nDriver Seun Adigun and brakewomen Akuoma Omeoga, Ngozi Onwumere have qualified for\r\r\n, ensuring an African nation will be represented in the sport at the Winter Olympics for the first time ever.\r\r\nDespite numerous success stories in the summer Games -- notably gold in the men\'s football at\r\r\n-- no Nigerian has ever competed in a Winter Olympic event before.\r\r\nNow, after completing all five qualifying races, the Nigerian women\'s bobsled team are on their way 

IndexError: list index out of range

In [164]:

ids = pd.read_csv(r"../patents/gp-query2-plus.csv",index_col='code',skipinitialspace=True)
print(ids.isnull().sum())
ids['class'].value_counts()

id                                0
title                             0
assignee                         15
inventor/author                 383
priority date                   324
filing/creation date              6
publication date                  1
grant date                    13623
result link                       0
representative figure link    10002
code.1                            0
citations                         0
abstract                        796
class                          3384
dtype: int64


H04L63    2608
G06F21    2434
G06F16    1192
G06Q30     882
G06K9/     806
          ... 
F21S         1
C14B5/       1
A62C27       1
B62H3/       1
G07F5/       1
Name: class, Length: 906, dtype: int64

In [165]:
ids = ids[ids.abstract.notna()]
ids["class"].fillna(value="NA",inplace = True)
ids['class'] = ids['class'].astype('str').apply(lambda x: x[:3] if len(x) >= 3 else 'NA')
ids["class"] = ids["class"].apply(lambda x: 'NA' if x != 'G06' and x != 'H04' else x)
ids['abstract'].str.replace(r'[^\x00-\x7F]+', '')

print(ids.isnull().sum())
ids['class'].value_counts()

id                                0
title                             0
assignee                          5
inventor/author                 369
priority date                   315
filing/creation date              1
publication date                  0
grant date                    13565
result link                       0
representative figure link     9736
code.1                            0
citations                         0
abstract                          0
class                             0
dtype: int64


G06    9582
NA     6895
H04    5300
Name: class, dtype: int64

In [167]:
#use this if you want to try a binary classification
ids = ids[ids['class']!='NA']

In [168]:
from sklearn.model_selection import train_test_split

X = ids['abstract'].values
y = ids['class'].values

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)


In [169]:
len(x_train),len(y_train),len(x_test),len(y_test)

(9970, 9970, 4912, 4912)

In [170]:
set(y_train)

{'G06', 'H04'}

In [171]:
sample_idx = 10
x_train[sample_idx]

'\r\nA priority queue including an order of local data relocation operations to be performed by a plurality of solid-state storage devices is maintained. An indication of a new local data relocation operation is received from a solid-state storage device of the plurality of solid-state storage devices for data stored at the solid-state storage device, the indication including information associated with the data. The new local data relocation operation is inserted into a position in the order of the priority queue based on the information associated with the data.\r\n'

In [172]:
y_train[sample_idx]

'G06'

# Binary classification

This is a multi-class single-label dataset.
We start with a simpler binary classification problem, e.g., economy vs not economy.

Just to make a choice, we use as the reference label the one of the example in the cell above.

In [173]:
import numpy as np

# numpy implements many useful and powerful vector manipulation tools
# here I'm using it to quickly create a True,False vector corresponding
# to the original values being equal to our label of interest or not
# i.e., binary labels

y_train_bin = np.asarray(y_train)==y_train[sample_idx]
y_test_bin = np.asarray(y_test)==y_train[sample_idx]
y_train_bin,y_test_bin

(array([False,  True,  True, ...,  True, False, False]),
 array([False,  True,  True, ...,  True,  True,  True]))

## Building the pipeline by hand

In [174]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

## Tokenization

Try the following two cells removing the min_df parameter

In [175]:
vect = CountVectorizer(min_df=5)  # tokenization and frequency count

print('fit')
vect.fit(x_train)
print('transform')
X_train_tok = vect.transform(x_train)
print('done')

# the two steps above can be condensed in a single step that processes train
# data only once.

# print('fit_transform')
# X_train_tok = vect.fit_transform(x_train)
# print('done')

X_test_tok =vect.transform(x_test)

fit
transform
done


In [176]:
len(vect.vocabulary_)

7314

In [177]:
vect.vocabulary_

{'method': 3671,
 'of': 3951,
 'authenticating': 606,
 'client': 1065,
 'to': 5952,
 'server': 5316,
 'the': 5879,
 'having': 2766,
 'registered': 4851,
 'on': 3967,
 'by': 862,
 'storing': 5609,
 'therein': 5896,
 'valid': 6232,
 'identifier': 2881,
 'id': 2873,
 'and': 388,
 'hashed': 2761,
 'word': 6466,
 'generated': 2626,
 'applying': 450,
 'hash': 2760,
 'function': 2581,
 'disposable': 1872,
 'random': 4702,
 'variable': 6245,
 'possessed': 4377,
 'known': 3317,
 'both': 788,
 'concatenated': 1226,
 'with': 6460,
 'sequence': 5306,
 'resulting': 5033,
 'from': 2574,
 'hashing': 2763,
 'password': 4168,
 'said': 5152,
 'an': 370,
 'initialization': 3043,
 'disclosure': 1836,
 'provides': 4613,
 'scheme': 5196,
 'screening': 5211,
 'device': 1759,
 'medium': 3634,
 'electronic': 2042,
 'equipment': 2179,
 'based': 678,
 'artificial': 505,
 'intelligence': 3123,
 'comprises': 1210,
 'following': 2506,
 'steps': 5593,
 'acquiring': 185,
 'structural': 5631,
 'features': 2410,
 'cand

In [178]:
vect.get_feature_names()

['10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '108',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '118',
 '12',
 '120',
 '121',
 '122',
 '13',
 '130',
 '14',
 '140',
 '141',
 '15',
 '150',
 '155',
 '16',
 '160',
 '17',
 '170',
 '18',
 '180',
 '19',
 '190',
 '1a',
 '20',
 '200',
 '201',
 '202',
 '203',
 '204',
 '205',
 '206',
 '208',
 '21',
 '210',
 '22',
 '220',
 '23',
 '230',
 '24',
 '240',
 '25',
 '26',
 '260',
 '27',
 '28',
 '2a',
 '2d',
 '2nd',
 '30',
 '300',
 '301',
 '302',
 '304',
 '306',
 '31',
 '310',
 '32',
 '320',
 '33',
 '330',
 '34',
 '35',
 '36',
 '360',
 '3d',
 '3rd',
 '40',
 '400',
 '401',
 '402',
 '404',
 '410',
 '412',
 '42',
 '420',
 '4g',
 '4th',
 '50',
 '500',
 '510',
 '52',
 '520',
 '54',
 '56',
 '58',
 '5g',
 '5th',
 '60',
 '600',
 '602',
 '604',
 '606',
 '62',
 '64',
 '70',
 '802',
 '90',
 '95',
 'aa',
 'ability',
 'able',
 'abnormal',
 'abnormalities',
 'abnormality',
 'abnormity',
 'about',
 'above',
 'absence',
 'absent',
 

In [179]:
X_train_tok[0,:]

<1x7314 sparse matrix of type '<class 'numpy.int64'>'
	with 39 stored elements in Compressed Sparse Row format>

In [180]:
print(X_train_tok[0,:])

  (0, 370)	1
  (0, 388)	4
  (0, 450)	1
  (0, 606)	1
  (0, 788)	1
  (0, 862)	4
  (0, 1065)	5
  (0, 1226)	1
  (0, 1872)	2
  (0, 2574)	2
  (0, 2581)	1
  (0, 2626)	1
  (0, 2760)	1
  (0, 2761)	1
  (0, 2763)	1
  (0, 2766)	1
  (0, 2873)	1
  (0, 2881)	1
  (0, 3043)	1
  (0, 3317)	2
  (0, 3671)	1
  (0, 3951)	2
  (0, 3967)	1
  (0, 4168)	1
  (0, 4377)	2
  (0, 4702)	2
  (0, 4851)	1
  (0, 5033)	1
  (0, 5152)	1
  (0, 5306)	2
  (0, 5316)	3
  (0, 5609)	1
  (0, 5879)	7
  (0, 5896)	1
  (0, 5952)	2
  (0, 6232)	1
  (0, 6245)	2
  (0, 6460)	1
  (0, 6466)	1


Some scikit-learn modules implement an inverse_transform method to reconstruct input from their output.
Let's print out the feature names and their frequency for a document. Note that frequency info is lost.

In [181]:
vect.inverse_transform(X_train_tok[0,:])

[array(['an', 'and', 'applying', 'authenticating', 'both', 'by', 'client',
        'concatenated', 'disposable', 'from', 'function', 'generated',
        'hash', 'hashed', 'hashing', 'having', 'id', 'identifier',
        'initialization', 'known', 'method', 'of', 'on', 'password',
        'possessed', 'random', 'registered', 'resulting', 'said',
        'sequence', 'server', 'storing', 'the', 'therein', 'to', 'valid',
        'variable', 'with', 'word'], dtype='<U18')]

Let's attach frequency data to features

In [182]:
for feat,freq in zip(vect.inverse_transform(X_train_tok[0,:])[0],X_train_tok[0,:].data):
  print(feat,freq)

an 1
and 4
applying 1
authenticating 1
both 1
by 4
client 5
concatenated 1
disposable 2
from 2
function 1
generated 1
hash 1
hashed 1
hashing 1
having 1
id 1
identifier 1
initialization 1
known 2
method 1
of 2
on 1
password 1
possessed 2
random 2
registered 1
resulting 1
said 1
sequence 2
server 3
storing 1
the 7
therein 1
to 2
valid 1
variable 2
with 1
word 1


## Feature selection

This is the first element where we use the labels, because it is a supervised method.

In [183]:
bin_sel = SelectKBest(chi2, k=5000)  # feature selection
bin_sel.fit(X_train_tok,y_train_bin)
X_train_sel_bin = bin_sel.transform(X_train_tok)
X_test_sel_bin = bin_sel.transform(X_test_tok)

In [184]:
bin_sel.get_support()

array([False,  True, False, ..., False,  True,  True])

In [185]:
X_train_sel_bin

<9970x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 472077 stored elements in Compressed Sparse Row format>

In [186]:
X_train_sel_bin[0,:]

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [187]:
print(X_train_sel_bin[0,:])

  (0, 233)	1
  (0, 242)	4
  (0, 386)	1
  (0, 521)	1
  (0, 727)	5
  (0, 827)	1
  (0, 1279)	2
  (0, 1734)	2
  (0, 1739)	1
  (0, 1770)	1
  (0, 1853)	1
  (0, 1856)	1
  (0, 1931)	1
  (0, 1936)	1
  (0, 2045)	1
  (0, 2231)	2
  (0, 2478)	1
  (0, 2664)	2
  (0, 2676)	1
  (0, 2820)	1
  (0, 2969)	2
  (0, 3201)	2
  (0, 3308)	1
  (0, 3515)	1
  (0, 3623)	2
  (0, 3630)	3
  (0, 3844)	1
  (0, 4037)	7
  (0, 4275)	1
  (0, 4284)	2
  (0, 4432)	1
  (0, 4437)	1


The feature selection module has an inverse transform method so that we can map selected feature back to the original large feature space

In [188]:
bin_sel.inverse_transform(X_train_sel_bin[0,:])

<1x7314 sparse matrix of type '<class 'numpy.int64'>'
	with 32 stored elements in Compressed Sparse Column format>

In [189]:
print(vect.inverse_transform(bin_sel.inverse_transform(X_train_sel_bin[0,:])))

[array(['an', 'and', 'authenticating', 'both', 'client', 'concatenated',
       'disposable', 'from', 'function', 'generated', 'hashed', 'having',
       'id', 'identifier', 'initialization', 'known', 'method', 'of',
       'on', 'password', 'possessed', 'random', 'registered', 'said',
       'sequence', 'server', 'storing', 'the', 'valid', 'variable',
       'with', 'word'], dtype='<U18')]


## Weighting

In [190]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel_bin)
X_train_vec_bin = tfidf.transform(X_train_sel_bin)
X_test_vec_bin =tfidf.transform(X_test_sel_bin)

In [191]:
print(X_train_vec_bin[0,:])

  (0, 4437)	0.11870564192152047
  (0, 4432)	0.03833626718717423
  (0, 4284)	0.2625343359275649
  (0, 4275)	0.13508789011625466
  (0, 4037)	0.1615430428113228
  (0, 3844)	0.0893228356278042
  (0, 3630)	0.21245865130565886
  (0, 3623)	0.20997711320361134
  (0, 3515)	0.10750602335397783
  (0, 3308)	0.12316213163426466
  (0, 3201)	0.2361828915751481
  (0, 2969)	0.35492603937440637
  (0, 2820)	0.1294482875948872
  (0, 2676)	0.03462464102940085
  (0, 2664)	0.048282834564967736
  (0, 2478)	0.037807885336414
  (0, 2231)	0.21577911550133275
  (0, 2045)	0.1597347952655581
  (0, 1936)	0.10594457754762421
  (0, 1931)	0.12685519123993202
  (0, 1856)	0.08260477115883841
  (0, 1853)	0.17746301968720318
  (0, 1770)	0.07621898659605944
  (0, 1739)	0.0917550344616764
  (0, 1734)	0.083011263882389
  (0, 1279)	0.36404607039777825
  (0, 827)	0.18202303519888913
  (0, 727)	0.42758432389411427
  (0, 521)	0.10788955775066637
  (0, 386)	0.12116912111396187
  (0, 242)	0.09442730423021907
  (0, 233)	0.0347519675

In [192]:
for feat,weight,freq in zip(vect.inverse_transform(bin_sel.inverse_transform(X_train_vec_bin[0,:]))[0],X_train_vec_bin[0,:].data,X_train_sel_bin[0,:].data):
  print(feat,weight,freq)

an 0.11870564192152047 1
and 0.03833626718717423 4
authenticating 0.2625343359275649 1
both 0.13508789011625466 1
client 0.1615430428113228 5
concatenated 0.0893228356278042 1
disposable 0.21245865130565886 2
from 0.20997711320361134 2
function 0.10750602335397783 1
generated 0.12316213163426466 1
hashed 0.2361828915751481 1
having 0.35492603937440637 1
id 0.1294482875948872 1
identifier 0.03462464102940085 1
initialization 0.048282834564967736 1
known 0.037807885336414 2
method 0.21577911550133275 1
of 0.1597347952655581 2
on 0.10594457754762421 1
password 0.12685519123993202 1
possessed 0.08260477115883841 2
random 0.17746301968720318 2
registered 0.07621898659605944 1
said 0.0917550344616764 1
sequence 0.083011263882389 2
server 0.36404607039777825 3
storing 0.18202303519888913 1
the 0.42758432389411427 7
valid 0.10788955775066637 1
variable 0.12116912111396187 2
with 0.09442730423021907 1
word 0.03475196759185327 1


## Learning algorithm

In [193]:
svm_bin = LinearSVC()  # linear svm with default parameters
svm_bin_clf = svm_bin.fit(X_train_vec_bin,y_train_bin)
bin_predictions = svm_bin_clf.predict(X_test_vec_bin)

In [194]:
len(bin_predictions)

4912

In [195]:
bin_predictions

array([False,  True,  True, ...,  True, False, False])

## Evaluation of accuracy

In [196]:
correct = 0
for prediction,true_label in zip(bin_predictions, y_test_bin):
    if prediction==true_label:
        correct += 1
print(correct/len(bin_predictions))

0.7748371335504886


## Using sklearn pipeline object

In [197]:
bin_pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

bin_pipeline.fit(x_train,y_train_bin)
bin_predictions = bin_pipeline.predict(x_test)
correct = 0
for prediction,true_label in zip(bin_predictions, y_test_bin):
    if prediction==true_label:
        correct += 1
print(correct/len(bin_predictions))

0.7734120521172638


In [198]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test_bin, bin_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test_bin, bin_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

       False       0.70      0.63      0.66      1718
        True       0.81      0.85      0.83      3194

    accuracy                           0.77      4912
   macro avg       0.75      0.74      0.74      4912
weighted avg       0.77      0.77      0.77      4912

Confusion matrix:
[[1076  642]
 [ 471 2723]]


## Inspecting the pipeline

We can have a look at the parameters of the supervised method of the pipeline to understand how it determines its classification decisions.



In [149]:
tokenizer = bin_pipeline.named_steps['vect']
selector = bin_pipeline.named_steps['sel']
classifier = bin_pipeline.named_steps['learner']

First we look at the feature selection function.
We get the chi^2 score assigned to every feature.

In [150]:
feature_names = tokenizer.get_feature_names()
feats_w_score = list()
for index,(selected,score) in enumerate(zip(selector.get_support(),selector.scores_)):
    feats_w_score.append((score,selected,feature_names[index]))
feats_w_score = sorted(feats_w_score)
len(feats_w_score)

28921

This are the 100 less and most informative features

In [151]:
feats_w_score[:100],feats_w_score[-100:]

([(1.0499603702783994e-05, False, 'prior'),
  (1.5603029105104712e-05, False, 'electromagnetic'),
  (1.5603029105104712e-05, False, 'operative'),
  (1.5603029105104712e-05, False, 'therefrom'),
  (5.6638861181300644e-05, False, 'concurrently'),
  (5.6638861181300644e-05, False, 'hiding'),
  (5.6638861181300644e-05, False, 'lightweight'),
  (5.6638861181300644e-05, False, 'limitation'),
  (5.6638861181300644e-05, False, 'localized'),
  (5.6638861181300644e-05, False, 'rack'),
  (5.6638861181300644e-05, False, '단말로'),
  (0.00011327772236260129, False, 'lens'),
  (0.00019441232471464838, False, 'also'),
  (0.00027721326214418044, False, '220'),
  (0.00027721326214418044, False, 'allocate'),
  (0.00027721326214418044, False, 'ausgewählten'),
  (0.00027721326214418044, False, 'digit'),
  (0.00027721326214418044, False, 'elapsed'),
  (0.00027721326214418044, False, 'games'),
  (0.00027721326214418044, False, 'hazard'),
  (0.00027721326214418044, False, 'infer'),
  (0.00027721326214418044, Fa

Then we look at the parameters of the linear classification model.
Values with highest absolute values are those which contribute the most to the classification decision. Values close to zero are less important.

In [152]:
feats_w_classifier_weight = list()
for index,weight in enumerate(selector.inverse_transform(classifier.coef_)[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

4957

These are the feature that most contribute to a positive decision

In [153]:
feats_w_classifier_weight[-100:]

[(1.046849775115165, 'piece'),
 (1.0495550540567105, 'qualifying'),
 (1.050692904816471, 'discriminator'),
 (1.0525053168805685, 'applied'),
 (1.0531476053263282, '100'),
 (1.062007752758507, 'avatar'),
 (1.0628936765716415, 'cues'),
 (1.0629870098807936, '악성'),
 (1.067159085016332, 'conducting'),
 (1.068401708255899, 'coordinate'),
 (1.070556278778731, 'forecast'),
 (1.0706442360854953, 'discussions'),
 (1.0720620007353354, 'lineage'),
 (1.0727234118520628, 'resolves'),
 (1.0752503407854401, 'financial'),
 (1.082391633074612, 'weather'),
 (1.089101314671491, 'capabilities'),
 (1.0920794592038665, 'error'),
 (1.0938383851479576, 'intensive'),
 (1.1007260741550586, 'bids'),
 (1.1039886330620845, 'nfa'),
 (1.1056089621470522, '발명의'),
 (1.1124838129169936, 'sentence'),
 (1.1137933891997394, 'sampled'),
 (1.1148374997696056, 'wind'),
 (1.1193506245270355, 'task'),
 (1.1245757792747262, 'bid'),
 (1.1254209580004009, 'alignment'),
 (1.1284555830484946, 'autonomic'),
 (1.1332751134915608, 'wo

These are the features that most contribute to a negative decision.

In [154]:
feats_w_classifier_weight[:100]

[(-1.9064797012468377, 'television'),
 (-1.8817840845028113, 'shooting'),
 (-1.8339057503610863, 'fingerprinting'),
 (-1.7727677154037347, 'tv'),
 (-1.7208736293688627, 'decoy'),
 (-1.7092549260513634, 'injected'),
 (-1.6353883879856157, 'authorisation'),
 (-1.628043983343766, 'cyberthreat'),
 (-1.6099334212393934, 'motions'),
 (-1.595772506556672, 'home'),
 (-1.5859559835569765, 'honeypot'),
 (-1.5792496060106833, 'phishing'),
 (-1.575766369249608, 'characterizes'),
 (-1.5724686697142638, 'publicly'),
 (-1.5703413626095286, 'encrypting'),
 (-1.5471414699498403, 'correlated'),
 (-1.5311146561026545, 'network'),
 (-1.5303432736052838, 'endpoints'),
 (-1.5027572319277338, 'telecommunications'),
 (-1.5025349012455622, 'multifunctional'),
 (-1.5012197241055076, 'username'),
 (-1.4909574423159204, 'intrusion'),
 (-1.4352400462281316, 'viewing'),
 (-1.4153820691952113, 'tendency'),
 (-1.4044368750644387, 'characterizations'),
 (-1.3944801035782146, '360'),
 (-1.3810665822911483, 'remedial'),

## Testing other classifiers

### Decision tree

In [155]:
dt_bin_pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', DecisionTreeClassifier())  # learning algorithm
])

dt_bin_pipeline.fit(x_train,y_train_bin)
bin_predictions = dt_bin_pipeline.predict(x_test)

print('Classification report:')
print(classification_report(y_test_bin, bin_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test_bin, bin_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

       False       0.54      0.56      0.55      1718
        True       0.76      0.75      0.75      3194

    accuracy                           0.68      4912
   macro avg       0.65      0.65      0.65      4912
weighted avg       0.68      0.68      0.68      4912

Confusion matrix:
[[ 957  761]
 [ 810 2384]]


We can try to visualize the tree, but there are too many dimension to have a structure that is really inspectable (I'm referring to the font size, but to the number of nodes of the tree!).

DT visualization works on low dimensional data (see https://scikit-learn.org/stable/modules/tree.html#classification)

In [88]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(24, 24))
plot_tree(dt_bin_pipeline.named_steps['learner'])
plt.show()

In C:\Users\carlo\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The text.latex.preview rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\carlo\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The mathtext.fallback_to_cm rcparam was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\carlo\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: Support for setting the 'mathtext.fallback_to_cm' rcParam is deprecated since 3.3 and will be removed two minor releases later; use 'mathtext.fallback : 'cm' instead.
In C:\Users\carlo\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mplstyle: 
The validate_bool_maybe_none function was deprecated in Matplotlib 3.3 and will be removed two minor releases later.
In C:\Users\carlo\Anaconda3\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test.mpl

<Figure size 2400x2400 with 1 Axes>

### Naive Bayes

NB uses a multinomial model based on term frequencies, we can skip the tfidf module.

In [156]:
nb_bin_pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('learner', MultinomialNB())  # learning algorithm
])

nb_bin_pipeline.fit(x_train,y_train_bin)
bin_predictions = nb_bin_pipeline.predict(x_test)

print('Classification report:')
print(classification_report(y_test_bin, bin_predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test_bin, bin_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

       False       0.65      0.76      0.70      1718
        True       0.86      0.78      0.81      3194

    accuracy                           0.77      4912
   macro avg       0.75      0.77      0.76      4912
weighted avg       0.78      0.77      0.77      4912

Confusion matrix:
[[1299  419]
 [ 710 2484]]


In [157]:
tokenizer = nb_bin_pipeline.named_steps['vect']
selector = nb_bin_pipeline.named_steps['sel']
classifier = nb_bin_pipeline.named_steps['learner']


NB model stores log values of priors and likelihoods

In [158]:
classifier.class_log_prior_,classifier.feature_log_prob_, len(classifier.feature_log_prob_[0])

(array([-1.02365928, -0.44515935]),
 array([[ -9.42161213, -10.84872849,  -8.83382547, ..., -11.2541936 ,
         -12.64048796, -11.03105004],
        [ -8.67357467, -12.14967336, -10.02940982, ..., -13.24828564,
         -10.94570055, -13.24828564]]),
 5000)

In NB a key factor for decision is the ratio between the likelihood for positive and negative decision.

The next cell exploits numpy to perform element-by-element division between log probabilities of p(w|class=1) and p(w|class=0), producing a vector of such ratios.

In [159]:
ratio = classifier.feature_log_prob_[0]/classifier.feature_log_prob_[1]

In [160]:
feats_w_classifier_weight = list()
feature_names = tokenizer.get_feature_names()
for index,weight in enumerate(selector.inverse_transform([ratio])[0]):
    if weight!=0:
        feats_w_classifier_weight.append((weight,feature_names[index]))
feats_w_classifier_weight = sorted(feats_w_classifier_weight)
len(feats_w_classifier_weight)

5000

This are the most relevant features for a positive decision

In [161]:
feats_w_classifier_weight[-100::-1]

[(1.25523306617932, 'ideas'),
 (1.25523306617932, 'datei'),
 (1.25523306617932, 'bids'),
 (1.2544216308021805, 'study'),
 (1.2538383267897504, 'pedestrian'),
 (1.2538383267897504, 'money'),
 (1.2538383267897504, 'gpu'),
 (1.2516525692595708, 'commodity'),
 (1.2514321067899263, 'subjects'),
 (1.251143407987406, 'topic'),
 (1.2499504262986565, '카드'),
 (1.2499504262986565, '마케팅'),
 (1.2499504262986565, '뉴스'),
 (1.2499504262986565, 'tf'),
 (1.2499504262986565, 'stroke'),
 (1.2499504262986565, 'idf'),
 (1.2499504262986565, 'generative'),
 (1.2497994685765097, 'column'),
 (1.249430744282014, 'neural'),
 (1.2477907700971618, 'natural'),
 (1.2464981546344915, 'particle'),
 (1.2464981546344915, 'incremental'),
 (1.2464981546344915, 'fund'),
 (1.2444801920497817, 'unclassified'),
 (1.2444801920497817, 'singleton'),
 (1.2444801920497817, 'paragraph'),
 (1.2444801920497817, 'modality'),
 (1.2444801920497817, 'hobby'),
 (1.2444801920497817, 'gan'),
 (1.2444801920497817, 'fitness'),
 (1.244480192049

These are the most relevat features for a negative decision.

In [162]:
feats_w_classifier_weight[:100]

[(0.6925237196931254, 'el'),
 (0.6949201579318467, 'dispositivo'),
 (0.6973951817729916, '트래픽'),
 (0.6999541205358534, 'decoy'),
 (0.7026028647397787, 'cuenta'),
 (0.7053479477753017, 'sidelink'),
 (0.7111570798550793, 'grip'),
 (0.7145524203019042, 'caller'),
 (0.7208061299243305, 'cyberthreat'),
 (0.7208061299243305, 'unsymmetrical'),
 (0.7243175288241765, '가전제품'),
 (0.7277997905604594, '기관'),
 (0.7280002818596109, 'lpwan'),
 (0.7280002818596109, 'wlan'),
 (0.7314601726774573, 'interworking'),
 (0.7318719748292674, 'mc'),
 (0.7358347129889187, 'institutional'),
 (0.7359530478619208, 'forwarder'),
 (0.7359530478619208, 'los'),
 (0.7359530478619208, 'sip'),
 (0.7359530478619208, 'tether'),
 (0.7359530478619208, 'una'),
 (0.7359530478619208, '엣지'),
 (0.7367777092131462, '고유'),
 (0.7402042807499921, 'firewall'),
 (0.7402674486361976, 'modem'),
 (0.7402674486361976, 'uso'),
 (0.7402674486361976, 'waps'),
 (0.7402674486361976, '분할된'),
 (0.7448434838641423, 'callers'),
 (0.7448434838641423,

# Multi-class single-label classification

Tokenization does not change from the binary problem, as the dataset is the same.

## Feature selection

Here we use single-label labels

In [96]:
sel = SelectKBest(chi2, k=5000)  # feature selection
sel.fit(X_train_tok,y_train)
X_train_sel = sel.transform(X_train_tok)
X_test_sel = sel.transform(X_test_tok)

In [97]:
sel.get_support()

array([ True,  True, False, ..., False, False,  True])

In [98]:
X_train_sel

<14590x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 736531 stored elements in Compressed Sparse Row format>

In [99]:
X_train_sel[0,:]

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [100]:
print(X_train_sel[0,:])

  (0, 132)	2
  (0, 147)	1
  (0, 230)	2
  (0, 250)	1
  (0, 264)	3
  (0, 313)	1
  (0, 358)	1
  (0, 422)	3
  (0, 424)	1
  (0, 438)	1
  (0, 480)	2
  (0, 488)	1
  (0, 905)	1
  (0, 907)	1
  (0, 934)	2
  (0, 990)	1
  (0, 1135)	3
  (0, 1240)	1
  (0, 1289)	1
  (0, 1497)	1
  (0, 1559)	1
  (0, 1621)	1
  (0, 1814)	1
  (0, 1956)	1
  (0, 2003)	1
  :	:
  (0, 2662)	1
  (0, 2685)	1
  (0, 2812)	2
  (0, 2841)	3
  (0, 2864)	1
  (0, 3166)	2
  (0, 3168)	1
  (0, 3246)	1
  (0, 3415)	1
  (0, 3473)	1
  (0, 3497)	1
  (0, 3703)	2
  (0, 3734)	1
  (0, 3759)	1
  (0, 3763)	1
  (0, 4133)	6
  (0, 4134)	1
  (0, 4213)	10
  (0, 4250)	1
  (0, 4454)	7
  (0, 4456)	1
  (0, 4527)	1
  (0, 4621)	1
  (0, 4636)	1
  (0, 4650)	1


Selected feature differ from the binary case, as now they have to be informative with respect to a different set of labels.

In [101]:
print(vect.inverse_transform(sel.inverse_transform(X_train_sel[0,:])))

[array(['activities', 'additional', 'also', 'an', 'and', 'appearance',
       'as', 'authentication', 'authenticity', 'average', 'based', 'be',
       'compute', 'computer', 'confidence', 'context', 'data',
       'determine', 'disclosed', 'engine', 'example', 'face', 'gait',
       'heuristic', 'how', 'in', 'include', 'is', 'made', 'may', 'method',
       'monitoring', 'more', 'movements', 'on', 'or', 'other',
       'predictive', 'preemptively', 'programmed', 'receives',
       'regarding', 'relevant', 'score', 'security', 'sensitive',
       'sensors', 'system', 'systems', 'the', 'thus', 'user', 'users',
       'vision', 'when', 'will', 'with'], dtype='<U23')]


## Weighting

In [102]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [103]:
print(X_train_vec[0,:])

  (0, 4650)	0.034272298568714096
  (0, 4636)	0.09441563442789391
  (0, 4621)	0.06222415493203917
  (0, 4527)	0.12091226230577407
  (0, 4456)	0.08109826660544252
  (0, 4454)	0.34343175641297063
  (0, 4250)	0.10444083330306983
  (0, 4213)	0.2098293130559417
  (0, 4134)	0.0583023914838006
  (0, 4133)	0.22569164369898215
  (0, 3763)	0.10112786007181591
  (0, 3759)	0.1133982148558292
  (0, 3734)	0.06417650906055922
  (0, 3703)	0.1845377564700082
  (0, 3497)	0.09815259091710517
  (0, 3473)	0.10838838490452433
  (0, 3415)	0.08428770084936733
  (0, 3246)	0.12556817813384408
  (0, 3168)	0.17856473617256763
  (0, 3166)	0.25444401455767124
  (0, 2864)	0.06025679643819927
  (0, 2841)	0.11318451484390893
  (0, 2812)	0.06373517689641844
  (0, 2685)	0.1471623486233739
  (0, 2662)	0.05262046952507982
  :	:
  (0, 2003)	0.11968098028110531
  (0, 1956)	0.15067277433141268
  (0, 1814)	0.16922589454070208
  (0, 1621)	0.09870197780775031
  (0, 1559)	0.08125720457436793
  (0, 1497)	0.09539358743868143
  (0, 

In [104]:
for feat,weight in zip(vect.inverse_transform(sel.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data):
  print(feat,weight)

activities 0.034272298568714096
additional 0.09441563442789391
also 0.06222415493203917
an 0.12091226230577407
and 0.08109826660544252
appearance 0.34343175641297063
as 0.10444083330306983
authentication 0.2098293130559417
authenticity 0.0583023914838006
average 0.22569164369898215
based 0.10112786007181591
be 0.1133982148558292
compute 0.06417650906055922
computer 0.1845377564700082
confidence 0.09815259091710517
context 0.10838838490452433
data 0.08428770084936733
determine 0.12556817813384408
disclosed 0.17856473617256763
engine 0.25444401455767124
example 0.06025679643819927
face 0.11318451484390893
gait 0.06373517689641844
heuristic 0.1471623486233739
how 0.05262046952507982
in 0.07234580545317308
include 0.03557971313596843
is 0.22413141851956067
made 0.09217845430916606
may 0.08278330533398488
method 0.061691253819134
monitoring 0.026788405829879864
more 0.11968098028110531
movements 0.15067277433141268
on 0.16922589454070208
or 0.09870197780775031
other 0.08125720457436793
pred

## Learning algorithm

Linear SVM implement multi-class single-label using a one-vs-rest approach

In [105]:
learner = LinearSVC()  # linear svm with default parameters
classifier = learner.fit(X_train_vec,y_train)
predictions = classifier.predict(X_test_vec)

In [106]:
len(predictions)

7187

In [107]:
predictions

array(['H04', 'NA', 'G06', ..., 'NA', 'NA', 'NA'], dtype=object)

## Evaluation of accuracy

In [108]:
correct = 0
for prediction,true_label in zip(predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(predictions))

0.6847085014609712


## Using sklearn pipeline object

In [109]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

classifier = pipeline.fit(x_train,y_train)
predictions = classifier.predict(x_test)
correct = 0
for prediction,true_label in zip(predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(predictions))

0.6851259218032559


In [110]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

         G06       0.67      0.75      0.71      3149
         H04       0.64      0.57      0.61      1767
          NA       0.73      0.68      0.71      2271

    accuracy                           0.69      7187
   macro avg       0.68      0.67      0.67      7187
weighted avg       0.69      0.69      0.68      7187

Confusion matrix:
[[2355  406  388]
 [ 576 1014  177]
 [ 560  156 1555]]


The classification score for the binary classifier we learned earlier is different, though it is trained on exactly the same data. Why?

We try a linear svm with one-vs-one model.

LinearSVC does not implement OvO.

We can wrap it into a OneVsOneClassifier that can be applied to any classifier.

(Note that other classifiers natively implement OvO, e.g., sklearn.svm.SVC)

In [111]:
from sklearn.multiclass import OneVsOneClassifier

pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=5000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', OneVsOneClassifier(LinearSVC()))  # learning algorithm
])

classifier = pipeline.fit(x_train,y_train)
predictions = classifier.predict(x_test)

In [112]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

         G06       0.67      0.75      0.71      3149
         H04       0.65      0.58      0.61      1767
          NA       0.73      0.68      0.71      2271

    accuracy                           0.68      7187
   macro avg       0.68      0.67      0.67      7187
weighted avg       0.69      0.68      0.68      7187

Confusion matrix:
[[2348  401  400]
 [ 580 1022  165]
 [ 569  152 1550]]


# Saving classifiers

Fitted classifiers (both single object and pipelines), as any scikit object, can be saved and the load for successive reuse.

NOTE: saving a file on Colab saves it on the temporary virtual machine on the cloud, to get a persistent copy additional code is require see https://colab.research.google.com/notebooks/io.ipynb

In [None]:
import pickle

In [None]:
with open('news_en_classifier.pkl',mode='bw') as outputfile:
  pickle.dump(pipeline,outputfile)

In [None]:
with open('news_en_classifier.pkl',mode='br') as inputfile:
  pipeline = pickle.load(inputfile)

In [None]:
pipeline

In [None]:
from google.colab import files

files.download('news_en_classifier.pkl')

In [None]:
files.upload()

In [None]:
with open('news_en_classifier (1).pkl',mode='br') as inputfile:
  pipeline2 = pickle.load(inputfile)
pipeline2