# Evaluation based on SVM with BOW and TF-IDF

### Loading the train, validation and test sets

In [62]:
from pathlib import Path
import pandas as pd

corpus = pd.read_csv(Path('../dataset/binary_undersampling_filtered_ds_remove_discrepancies.csv'))
corpus = corpus.sample(frac=1, random_state=42)
df_train = corpus[corpus['split'] == 'train']
df_dev = corpus[corpus['split'] == 'dev']
df_test = corpus[corpus['split'] == 'test']
print(len(df_train))
print(len(df_test))
print(len(df_dev))

508
78
77


### Loading the model. Tokenizer implementation.

In [63]:
import spacy 
import string
from spacy.lang.en import English
from spacy.lang.it import Italian
from langdetect import detect

nlp_en = English()
nlp_it = Italian()

en_stopwords = nlp_en.Defaults.stop_words
it_stopwords = nlp_it.Defaults.stop_words

nlp_models = {
        'it' : spacy.load("it_core_news_sm", disable = ['parser', 'ner']),
        'en' : spacy.load('en_core_web_sm', disable=['parser','ner'])
}

punctuations = string.punctuation + '...¡¿'
stop_words = en_stopwords.union(it_stopwords)

def spacy_tokenizer(sentence):
    lang = detect(sentence)
    nlp = nlp_models.get(lang, nlp_models["it"])
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return mytokens

### Fitting Bag of Words and TF-IDF on the training set

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer=spacy_tokenizer)
vectorizer = vectorizer.fit(df_train['Sentence'].to_list())



### Transform on the training and test sets

In [65]:

def dataset_preprocessing(dataset):    
    dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))

In [66]:
dataset_preprocessing(df_train)
dataset_preprocessing(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))


In [67]:
df_test

Unnamed: 0,Sentence,labels,split,lemm_sentence
587,re than six months. It also applies if they w...,yes,test,"(0, 4646)\t0.17588323941488757\n (0, 4575)\..."
608,As we work to suppress the virus with these lo...,yes,test,"(0, 4654)\t0.24372740012953562\n (0, 4574)\..."
621,Frank Sargent of the Bureau of Immigration ret...,no,test,"(0, 3615)\t0.41824149865129373\n (0, 2842)\..."
650,"In the post-Hurricane Sandy period, New York's...",yes,test,"(0, 4674)\t0.39144538573884147\n (0, 4581)\..."
593,"punti, identica a quella presentata dalla ma...",no,test,"(0, 4604)\t0.19829455554468145\n (0, 4088)\..."
...,...,...,...,...
619,",000 through our own Conflict, Stability and S...",no,test,"(0, 4685)\t0.1105366552309237\n (0, 4489)\t..."
600,di un grosso mercato illegale in Turchia di pa...,no,test,"(0, 4595)\t0.16649818975984912\n (0, 4485)\..."
657,shortfalls in the current provision. ParlaMint...,no,test,"(0, 4651)\t0.13233818649370174\n (0, 4641)\..."
647,Continuing their efforts to curb the movement ...,yes,test,"(0, 4669)\t0.17856834593776047\n (0, 3514)\..."


In [68]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Integer Encoding
encoder = LabelEncoder()
df_train['labels'] = encoder.fit_transform(df_train['labels'])
df_test['labels'] = encoder.transform(df_test['labels'])
df_dev['labels'] = encoder.transform(df_dev['labels'])
print(df_train['labels'])

327    1
362    1
265    1
436    1
450    1
      ..
71     0
106    0
270    1
435    1
102    0
Name: labels, Length: 508, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['labels'] = encoder.fit_transform(df_train['labels'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['labels'] = encoder.transform(df_test['labels'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['labels'] = encoder.transform(df_dev['labels'])


### SVM algorithm training

In [69]:
from sklearn.svm import SVC
from scipy.sparse import vstack

X_train = vstack(df_train['lemm_sentence'])
Y_train = df_train['labels']

svr = SVC(kernel='rbf', C=100)
svr = svr.fit(X_train, Y_train)

### Predictions on the test set

In [70]:
X_test =  vstack(df_test.lemm_sentence)

In [71]:
predictions = svr.predict(X_test)

In [72]:
predictions

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0])

In [73]:
Y_test = df_test['labels']

In [74]:
Y_test

587    1
608    1
621    0
650    1
593    0
      ..
619    0
600    0
657    0
647    1
614    0
Name: labels, Length: 78, dtype: int64

### Evaluation metrics

In [75]:
from sklearn.metrics import f1_score,accuracy_score

print(f'F1 Score: {f1_score(Y_test, predictions, average="macro")}')
print(f'Accuracy {accuracy_score(Y_test, predictions)}')

F1 Score: 0.7531234382808596
Accuracy 0.7564102564102564
