## Classification with sklearn


In [1]:
import numpy as np
import pandas as pd
import re 

## loading a dataset

In [3]:

ids = pd.read_csv(r"../patents/gp-query2-plus.csv",index_col='code',skipinitialspace=True)
print(ids.isnull().sum())
ids['class'].value_counts()

id                                0
title                             0
assignee                         15
inventor/author                 383
priority date                   324
filing/creation date              6
publication date                  1
grant date                    13623
result link                       0
representative figure link    10002
code.1                            0
citations                         0
abstract                        796
class                          3384
dtype: int64


H04L63    2608
G06F21    2434
G06F16    1192
G06Q30     882
G06K9/     806
          ... 
B41M3/       1
B64D43       1
H01J9/       1
B41L21       1
H02B13       1
Name: class, Length: 906, dtype: int64

In [4]:
ids = ids[ids.abstract.notna()]
ids["class"].fillna(value="NA",inplace = True)
ids['class'] = ids['class'].astype('str').apply(lambda x: x[:3] if len(x) >= 3 else 'NA')
ids["class"] = ids["class"].apply(lambda x: 'NA' if x != 'G06' and x != 'H04' else x)
ids['abstract'].str.replace(r'[^\x00-\x7F]+', '') #dorp chinese character

print(ids.isnull().sum())
ids['class'].value_counts()

id                                0
title                             0
assignee                          5
inventor/author                 369
priority date                   315
filing/creation date              1
publication date                  0
grant date                    13565
result link                       0
representative figure link     9736
code.1                            0
citations                         0
abstract                          0
class                             0
dtype: int64


G06    9582
NA     6895
H04    5300
Name: class, dtype: int64

In [5]:
#use this if you want to try a binary classification
ids = ids[ids['class']!='NA']

In [5]:
from sklearn.model_selection import train_test_split

X = ids['abstract'].values
y = ids['class'].values

x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)


In [6]:
len(x_train),len(y_train),len(x_test),len(y_test)

(15243, 15243, 6534, 6534)

In [7]:
set(y_train)

{'G06', 'H04', 'NA'}

In [8]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

## Tokenization

Try the following two cells removing the min_df parameter

In [9]:
vect = CountVectorizer(min_df=5)  # tokenization and frequency count

print('fit')
vect.fit(x_train)
print('transform')
X_train_tok = vect.transform(x_train)
print('done')

# the two steps above can be condensed in a single step that processes train
# data only once.

# print('fit_transform')
# X_train_tok = vect.fit_transform(x_train)
# print('done')

X_test_tok =vect.transform(x_test)

fit
transform
done


In [10]:
len(vect.vocabulary_)

10022

In [11]:
vect.vocabulary_

{'the': 8139,
 'disclosed': 2564,
 'computer': 1748,
 'implemented': 4115,
 'method': 5132,
 'for': 3534,
 'detecting': 2429,
 'unauthorized': 8476,
 'data': 2199,
 'shares': 7346,
 'may': 5053,
 'include': 4157,
 'providing': 6398,
 'user': 8604,
 'of': 5536,
 'an': 523,
 'anonymized': 586,
 'inbox': 4146,
 'with': 8963,
 'email': 2905,
 'to': 8236,
 'use': 8597,
 'particular': 5782,
 'online': 5571,
 'entity': 3043,
 'identifying': 4058,
 'one': 5567,
 'or': 5628,
 'more': 5263,
 'emails': 2906,
 'sent': 7291,
 'from': 3611,
 'different': 2487,
 'entities': 3041,
 'that': 8138,
 'are': 680,
 'determining': 2441,
 'based': 956,
 'on': 5564,
 'having': 3888,
 'been': 990,
 'by': 1254,
 'has': 3881,
 'shared': 7345,
 'other': 5663,
 'and': 551,
 'creating': 2090,
 'privacy': 6253,
 'score': 7175,
 'at': 774,
 'least': 4745,
 'in': 4139,
 'part': 5769,
 'determination': 2435,
 'various': 8658,
 'methods': 5135,
 'systems': 7996,
 'readable': 6567,
 'media': 5073,
 'also': 492,
 'present'

In [12]:
vect.get_feature_names()

['10',
 '100',
 '1000',
 '101',
 '102',
 '103',
 '104',
 '105',
 '106',
 '107',
 '108',
 '11',
 '110',
 '111',
 '112',
 '113',
 '114',
 '115',
 '116',
 '118',
 '119',
 '12',
 '120',
 '121',
 '122',
 '123',
 '124',
 '125',
 '13',
 '130',
 '131',
 '132',
 '133',
 '135',
 '14',
 '140',
 '142',
 '144',
 '145',
 '15',
 '150',
 '151',
 '153',
 '155',
 '16',
 '160',
 '17',
 '170',
 '18',
 '180',
 '19',
 '190',
 '1a',
 '1b',
 '1차',
 '20',
 '200',
 '2004',
 '2005',
 '2007',
 '201',
 '2011',
 '202',
 '2021',
 '203',
 '204',
 '205',
 '206',
 '208',
 '21',
 '210',
 '212',
 '214',
 '215',
 '216',
 '22',
 '220',
 '22a',
 '23',
 '230',
 '24',
 '240',
 '25',
 '250',
 '26',
 '260',
 '27',
 '28',
 '29',
 '2a',
 '2d',
 '2nd',
 '2차',
 '30',
 '300',
 '301',
 '302',
 '303',
 '304',
 '305',
 '306',
 '31',
 '310',
 '311',
 '312',
 '314',
 '32',
 '320',
 '33',
 '330',
 '34',
 '35',
 '350',
 '36',
 '360',
 '38',
 '3a',
 '3b',
 '3d',
 '3g',
 '3rd',
 '3차원',
 '40',
 '400',
 '401',
 '402',
 '404',
 '41',
 '410',
 '

In [13]:
X_train_tok[0,:]

<1x10022 sparse matrix of type '<class 'numpy.int64'>'
	with 60 stored elements in Compressed Sparse Row format>

In [14]:
print(X_train_tok[0,:])

  (0, 492)	1
  (0, 523)	2
  (0, 551)	2
  (0, 586)	1
  (0, 680)	2
  (0, 774)	1
  (0, 956)	2
  (0, 990)	1
  (0, 1254)	1
  (0, 1748)	2
  (0, 2090)	1
  (0, 2199)	1
  (0, 2429)	1
  (0, 2435)	1
  (0, 2441)	1
  (0, 2487)	3
  (0, 2564)	2
  (0, 2905)	4
  (0, 2906)	2
  (0, 3041)	4
  (0, 3043)	5
  (0, 3534)	3
  (0, 3611)	2
  (0, 3881)	2
  (0, 3888)	1
  :	:
  (0, 5263)	3
  (0, 5536)	1
  (0, 5564)	2
  (0, 5567)	3
  (0, 5571)	5
  (0, 5628)	3
  (0, 5663)	3
  (0, 5769)	1
  (0, 5782)	5
  (0, 6253)	1
  (0, 6398)	1
  (0, 6567)	1
  (0, 7175)	1
  (0, 7291)	2
  (0, 7345)	2
  (0, 7346)	1
  (0, 7996)	1
  (0, 8138)	3
  (0, 8139)	11
  (0, 8236)	2
  (0, 8476)	1
  (0, 8597)	1
  (0, 8604)	3
  (0, 8658)	1
  (0, 8963)	3


Some scikit-learn modules implement an inverse_transform method to reconstruct input from their output.
Let's print out the feature names and their frequency for a document. Note that frequency info is lost.

In [15]:
vect.inverse_transform(X_train_tok[0,:])

[array(['also', 'an', 'and', 'anonymized', 'are', 'at', 'based', 'been',
        'by', 'computer', 'creating', 'data', 'detecting', 'determination',
        'determining', 'different', 'disclosed', 'email', 'emails',
        'entities', 'entity', 'for', 'from', 'has', 'having',
        'identifying', 'implemented', 'in', 'inbox', 'include', 'least',
        'may', 'media', 'method', 'methods', 'more', 'of', 'on', 'one',
        'online', 'or', 'other', 'part', 'particular', 'privacy',
        'providing', 'readable', 'score', 'sent', 'shared', 'shares',
        'systems', 'that', 'the', 'to', 'unauthorized', 'use', 'user',
        'various', 'with'], dtype='<U23')]

Let's attach frequency data to features

In [16]:
for feat,freq in zip(vect.inverse_transform(X_train_tok[0,:])[0],X_train_tok[0,:].data):
  print(feat,freq)

also 1
an 2
and 2
anonymized 1
are 2
at 1
based 2
been 1
by 1
computer 2
creating 1
data 1
detecting 1
determination 1
determining 1
different 3
disclosed 2
email 4
emails 2
entities 4
entity 5
for 3
from 2
has 2
having 1
identifying 1
implemented 1
in 1
inbox 1
include 1
least 1
may 1
media 1
method 1
methods 1
more 3
of 1
on 2
one 3
online 5
or 3
other 3
part 1
particular 5
privacy 1
providing 1
readable 1
score 1
sent 2
shared 2
shares 1
systems 1
that 3
the 11
to 2
unauthorized 1
use 1
user 3
various 1
with 3


## Feature selection

This is the first element where we use the labels, because it is a supervised method.

# Multi-class single-label classification

Tokenization does not change from the binary problem, as the dataset is the same.

## Feature selection

Here we use single-label labels

In [15]:
sel = SelectKBest(chi2, k=5000)  # feature selection
sel.fit(X_train_tok,y_train)
X_train_sel = sel.transform(X_train_tok)
X_test_sel = sel.transform(X_test_tok)

In [16]:
sel.get_support()

array([ True,  True, False, ..., False, False,  True])

In [17]:
X_train_sel

<14590x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 736531 stored elements in Compressed Sparse Row format>

In [18]:
X_train_sel[0,:]

<1x5000 sparse matrix of type '<class 'numpy.int64'>'
	with 57 stored elements in Compressed Sparse Row format>

In [19]:
print(X_train_sel[0,:])

  (0, 132)	2
  (0, 147)	1
  (0, 230)	2
  (0, 250)	1
  (0, 264)	3
  (0, 313)	1
  (0, 358)	1
  (0, 422)	3
  (0, 424)	1
  (0, 438)	1
  (0, 480)	2
  (0, 488)	1
  (0, 905)	1
  (0, 907)	1
  (0, 934)	2
  (0, 990)	1
  (0, 1135)	3
  (0, 1240)	1
  (0, 1289)	1
  (0, 1497)	1
  (0, 1559)	1
  (0, 1621)	1
  (0, 1814)	1
  (0, 1956)	1
  (0, 2003)	1
  :	:
  (0, 2662)	1
  (0, 2685)	1
  (0, 2812)	2
  (0, 2841)	3
  (0, 2864)	1
  (0, 3166)	2
  (0, 3168)	1
  (0, 3246)	1
  (0, 3415)	1
  (0, 3473)	1
  (0, 3497)	1
  (0, 3703)	2
  (0, 3734)	1
  (0, 3759)	1
  (0, 3763)	1
  (0, 4133)	6
  (0, 4134)	1
  (0, 4213)	10
  (0, 4250)	1
  (0, 4454)	7
  (0, 4456)	1
  (0, 4527)	1
  (0, 4621)	1
  (0, 4636)	1
  (0, 4650)	1


Selected feature differ from the binary case, as now they have to be informative with respect to a different set of labels.

In [20]:
print(vect.inverse_transform(sel.inverse_transform(X_train_sel[0,:])))

[array(['activities', 'additional', 'also', 'an', 'and', 'appearance',
       'as', 'authentication', 'authenticity', 'average', 'based', 'be',
       'compute', 'computer', 'confidence', 'context', 'data',
       'determine', 'disclosed', 'engine', 'example', 'face', 'gait',
       'heuristic', 'how', 'in', 'include', 'is', 'made', 'may', 'method',
       'monitoring', 'more', 'movements', 'on', 'or', 'other',
       'predictive', 'preemptively', 'programmed', 'receives',
       'regarding', 'relevant', 'score', 'security', 'sensitive',
       'sensors', 'system', 'systems', 'the', 'thus', 'user', 'users',
       'vision', 'when', 'will', 'with'], dtype='<U23')]


## Weighting

In [21]:
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec = tfidf.transform(X_train_sel)
X_test_vec =tfidf.transform(X_test_sel)

In [22]:
print(X_train_vec[0,:])

  (0, 4650)	0.034272298568714096
  (0, 4636)	0.09441563442789391
  (0, 4621)	0.06222415493203917
  (0, 4527)	0.12091226230577407
  (0, 4456)	0.08109826660544252
  (0, 4454)	0.34343175641297063
  (0, 4250)	0.10444083330306983
  (0, 4213)	0.2098293130559417
  (0, 4134)	0.0583023914838006
  (0, 4133)	0.22569164369898215
  (0, 3763)	0.10112786007181591
  (0, 3759)	0.1133982148558292
  (0, 3734)	0.06417650906055922
  (0, 3703)	0.1845377564700082
  (0, 3497)	0.09815259091710517
  (0, 3473)	0.10838838490452433
  (0, 3415)	0.08428770084936733
  (0, 3246)	0.12556817813384408
  (0, 3168)	0.17856473617256763
  (0, 3166)	0.25444401455767124
  (0, 2864)	0.06025679643819927
  (0, 2841)	0.11318451484390893
  (0, 2812)	0.06373517689641844
  (0, 2685)	0.1471623486233739
  (0, 2662)	0.05262046952507982
  :	:
  (0, 2003)	0.11968098028110531
  (0, 1956)	0.15067277433141268
  (0, 1814)	0.16922589454070208
  (0, 1621)	0.09870197780775031
  (0, 1559)	0.08125720457436793
  (0, 1497)	0.09539358743868143
  (0, 

In [23]:
for feat,weight in zip(vect.inverse_transform(sel.inverse_transform(X_train_vec[0,:]))[0],X_train_vec[0,:].data):
  print(feat,weight)

activities 0.034272298568714096
additional 0.09441563442789391
also 0.06222415493203917
an 0.12091226230577407
and 0.08109826660544252
appearance 0.34343175641297063
as 0.10444083330306983
authentication 0.2098293130559417
authenticity 0.0583023914838006
average 0.22569164369898215
based 0.10112786007181591
be 0.1133982148558292
compute 0.06417650906055922
computer 0.1845377564700082
confidence 0.09815259091710517
context 0.10838838490452433
data 0.08428770084936733
determine 0.12556817813384408
disclosed 0.17856473617256763
engine 0.25444401455767124
example 0.06025679643819927
face 0.11318451484390893
gait 0.06373517689641844
heuristic 0.1471623486233739
how 0.05262046952507982
in 0.07234580545317308
include 0.03557971313596843
is 0.22413141851956067
made 0.09217845430916606
may 0.08278330533398488
method 0.061691253819134
monitoring 0.026788405829879864
more 0.11968098028110531
movements 0.15067277433141268
on 0.16922589454070208
or 0.09870197780775031
other 0.08125720457436793
pred

## Learning algorithm

Linear SVM implement multi-class single-label using a one-vs-rest approach

In [24]:
learner = LinearSVC()  # linear svm with default parameters
classifier = learner.fit(X_train_vec,y_train)
predictions = classifier.predict(X_test_vec)

In [25]:
len(predictions)

7187

In [26]:
predictions

array(['H04', 'NA', 'G06', ..., 'NA', 'NA', 'NA'], dtype=object)

## Evaluation of accuracy

In [27]:
correct = 0
for prediction,true_label in zip(predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(predictions))

0.6847085014609712


## Using sklearn pipeline object

In [32]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

In [None]:
pipeline = Pipeline([
    ('vect', CountVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=3000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', MLPClassifier())  # learning algorithm
])

classifier = pipeline.fit(x_train,y_train)
predictions = classifier.predict(x_test)
correct = 0
for prediction,true_label in zip(predictions, y_test):
    if prediction==true_label:
        correct += 1
print(correct/len(predictions))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

The classification score for the binary classifier we learned earlier is different, though it is trained on exactly the same data. Why?

We try a linear svm with one-vs-one model.

LinearSVC does not implement OvO.

We can wrap it into a OneVsOneClassifier that can be applied to any classifier.

(Note that other classifiers natively implement OvO, e.g., sklearn.svm.SVC)

In [29]:


pipeline = Pipeline([
    ('vect', TfidfVectorizer()),  # feature extraction
    ('sel', SelectKBest(chi2, k=3000)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', OneVsOneClassifier( MLPClassifier(random_state=1, max_iter=300)))  # learning algorithm
])

classifier = pipeline.fit(x_train,y_train)
predictions = classifier.predict(x_test)



In [30]:
from sklearn.metrics import confusion_matrix, classification_report
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

         G06       0.66      0.66      0.66      2845
         H04       0.62      0.54      0.58      1622
          NA       0.66      0.71      0.68      2067

    accuracy                           0.65      6534
   macro avg       0.65      0.64      0.64      6534
weighted avg       0.65      0.65      0.65      6534

Confusion matrix:
[[1891  443  511]
 [ 487  883  252]
 [ 494  101 1472]]


In [18]:
parameters = {'n_neighbors': np.arange(1, 10 + 1), 'weights':['uniform', 'distance']}

search = RandomizedSearchCV(KNeighborsClassifier(), parameters,cv = 10, scoring = 'f1_weighted')
search.fit(x_train, y_train)
report(search.cv_results_, n_top=3)
clf = search.best_estimator_
clf

NameError: name 'RandomizedSearchCV' is not defined

# BERT MODEL

In [28]:
import tensorflow

from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel, TFRobertaForSequenceClassification, RobertaTokenizer
from transformers import TFDistilBertModel, DistilBertTokenizer, TFDistilBertForSequenceClassification, ElectraTokenizer, TFElectraForSequenceClassification


ImportError: cannot import name 'TFBertForSequenceClassification' from 'transformers' (unknown location)

In [27]:
tokenizer = DistilBertTokenizer.from_pretrained(model_name) # tokenizer init

def build_model():
    with strategy.scope():
        
        bert_encoder = TFDistilBertForSequenceClassification.from_pretrained(model_name)
  
        
        # define tensors for inputs
        input_word_ids = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_word_ids")
        input_mask = tf.keras.Input(shape=(None,), dtype=tf.int32, name="input_mask")
        
        # Define model for fine-tuning Bert
        
        ### Embedding layer extracted from pretrained BERT
        embedding = bert_encoder([input_word_ids, input_mask])[0]
        
        ### Layers for Classification task
        output_layer = tf.keras.layers.Dropout(0.2)(embedding)
        output_dense_layer = tf.keras.layers.Dense(64, activation='relu')(output_layer)
        output_dense_layer = tf.keras.layers.Dropout(0.1)(output_dense_layer)
        output_dense_layer = tf.keras.layers.Dense(32, activation='relu')(output_dense_layer)
        output = tf.keras.layers.Dense(3, activation='softmax')(output_dense_layer)

        # Define Training parameters
        ## Optimizer is ADAM
        ## Function Loss is CrossEntropy
        ## Metric for evaluation is a standard accuracy
        model = tf.keras.Model(inputs=[input_word_ids, input_mask], outputs=output)
        model.compile(tf.keras.optimizers.Adam(lr=1e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model

# Init DeepLearning Model 
with strategy.scope():
    model = build_model()
    model.summary() # this describe model architecture and layers

NameError: name 'DistilBertTokenizer' is not defined

In [None]:
auto = tf.data.experimental.AUTOTUNE

def make_dataset(train_input, train_label):
    dataset = tf.data.Dataset.from_tensor_slices(
        (
            train_input,
            train_label
        )
    ).repeat().shuffle(batch_size).batch(batch_size).prefetch(auto)
    return dataset


def bert_encode(hypotheses, premises, augmentation=False):
    num_examples = len(hypotheses)

    # sentence_1 = [tokenizer.encode(s) for s in premises]
    # sentence_2 = [tokenizer.encode(s) for s in hypotheses]
    input_word_ids = [tokenizer.encode(s1,s2) for s1,s2 in zip(premises,hypotheses)  ]
    # input_word_ids = list(map(lambda x: x[0]+x[1], list(zip(sentence_1,sentence_2))))
    input_mask = [np.ones_like(x) for x in input_word_ids]
    inputs = {
        'input_word_ids': tf.keras.preprocessing.sequence.pad_sequences(input_word_ids, padding='post', maxlen=MAX_LEN, truncating='post'),
        'input_mask': tf.keras.preprocessing.sequence.pad_sequences(input_mask, padding='post', maxlen=MAX_LEN, truncating='post')
    }
    return inputs

In [None]:
df_train['prediction'] = 0
num_augmentation = 1

# encoding training data
train_input = bert_encode(train_df.hypothesis.values,train_df.premise.values, augmentation=False)
train_label = train_df.label.values

# create data Iterator for training 
train_sequence = make_dataset(train_input, train_label)

# encoding validation data
validation_input = bert_encode(val_df.hypothesis.values, val_df.premise.values, augmentation=False)
validation_label = val_df.label.values
tf.keras.backend.clear_session()

## Model training

In [None]:
n_steps = (len(train_label)) // batch_size

with strategy.scope():
    history = model.fit(
        train_sequence, shuffle=True, steps_per_epoch=n_steps, 
        validation_data = (validation_input, validation_label), epochs=50, verbose=1,
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10),
            tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=5),
            tf.keras.callbacks.ModelCheckpoint(
                'model.h5', monitor='val_accuracy', save_best_only=True,save_weights_only=True)
        ]
    ) 

# save trained model
model.load_weights('model.h5')

In [None]:
# calcul of validation Accuracy
validation_predictions = model.predict(validation_input)
validation_predictions = np.argmax(validation_predictions, axis=-1)
val_df['predictions'] = validation_predictions
acc = accuracy_score(validation_label, validation_predictions)
print('Accuracy: {}'.format(acc))