# Evaluation based on SVM with BOW and TF-IDF

### Loading the train, validation and test sets

In [1]:
from pathlib import Path
import pandas as pd

corpus = pd.read_csv(Path('../dataset/oversampling_filtered_ds_remove_discrepancies.csv'))
corpus = corpus.sample(frac=1, random_state=42)
df_train = corpus[corpus['split'] == 'train']
df_dev = corpus[corpus['split'] == 'dev']
df_test = corpus[corpus['split'] == 'test']
len(df_train)

652

### Loading the model. Tokenizer implementation.

In [2]:
import spacy 
import string
from spacy.lang.en import English
from spacy.lang.it import Italian
from langdetect import detect

nlp_en = English()
nlp_it = Italian()

en_stopwords = nlp_en.Defaults.stop_words
it_stopwords = nlp_it.Defaults.stop_words

nlp_models = {
        'it' : spacy.load("it_core_news_sm", disable = ['parser', 'ner']),
        'en' : spacy.load('en_core_web_sm', disable=['parser','ner'])
}

punctuations = string.punctuation + '...¡¿'
stop_words = en_stopwords.union(it_stopwords)

def spacy_tokenizer(sentence):
    lang = detect(sentence)
    nlp = nlp_models.get(lang, nlp_models["it"])
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    return mytokens

  from .autonotebook import tqdm as notebook_tqdm


### Fitting Bag of Words and TF-IDF on the training set

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer = 'word', tokenizer=spacy_tokenizer)
vectorizer = vectorizer.fit(df_train['Sentence'].to_list())



### Transform on the training and test sets

In [4]:

def dataset_preprocessing(dataset):    
    dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))

In [5]:
dataset_preprocessing(df_train)
dataset_preprocessing(df_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['lemm_sentence'] = dataset['Sentence'].apply(lambda x: vectorizer.transform([x]))


In [6]:
df_test

Unnamed: 0,Sentence,labels,split,lemm_sentence
751,llimento del tentativo di assicurare una magg...,none,test,"(0, 3849)\t0.15299748747447564\n (0, 3706)\..."
761,il tempo della politica. Lo condividiamo del t...,migrant,test,"(0, 3761)\t0.2220478192763538\n (0, 3594)\t..."
721,"on 251, length 8) L'obiettivo prioritario è p...",migrant,test,"(0, 3806)\t0.14592164264707763\n (0, 3743)\..."
717,lia le cose non sono poi così positive. Molti...,migrant,test,"(0, 3940)\t0.1309402345446797\n (0, 3817)\t..."
731,"di giovedì e venerdì prossimi, non può conti...",migrant,test,"(0, 3937)\t0.20411893886522464\n (0, 3911)\..."
757,",000 through our own Conflict, Stability and S...",none,test,"(0, 3900)\t0.11762448828508729\n (0, 3841)\..."
718,calation in Ethiopia and humanitarian access? ...,none,test,"(0, 4065)\t0.15654037697500697\n (0, 4064)\..."
722,"lla bozza di conclusioni si parla di rigore, d...",none,test,"(0, 3988)\t0.2538176150454172\n (0, 3980)\t..."
713,ella fame o delle disuguaglianze sociali: non...,none,test,"(0, 3840)\t0.16558980929507539\n (0, 3719)\..."
715,ion health charge up front and face the finan...,none,test,"(0, 4033)\t0.11643544750872217\n (0, 3741)\..."


In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Integer Encoding
encoder = LabelEncoder()
df_train['labels'] = encoder.fit_transform(df_train['labels'])
df_test['labels'] = encoder.transform(df_test['labels'])
df_dev['labels'] = encoder.transform(df_dev['labels'])
print(df_train['labels'])

357    0
259    0
193    1
333    0
586    0
      ..
71     0
106    1
270    1
435    1
102    1
Name: labels, Length: 652, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['labels'] = encoder.fit_transform(df_train['labels'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['labels'] = encoder.transform(df_test['labels'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_dev['labels'] = encoder.transform(df_dev['labels'])


### SVM algorithm training

In [8]:
from sklearn.svm import SVC
from scipy.sparse import vstack

X_train = vstack(df_train['lemm_sentence'])
Y_train = df_train['labels']

svr = SVC(kernel='rbf', C=100)
svr = svr.fit(X_train, Y_train)

### Predictions on the test set

In [9]:
X_test =  vstack(df_test.lemm_sentence)

In [10]:
predictions = svr.predict(X_test)

In [11]:
predictions

array([1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
Y_test = df_test['labels']

In [13]:
Y_test

751    1
761    0
721    0
717    0
731    0
757    1
718    1
722    1
713    1
715    1
760    1
711    1
743    1
724    0
745    1
716    1
742    1
748    1
759    1
728    0
733    1
710    1
763    1
749    1
756    1
732    1
755    1
764    1
725    0
746    0
736    1
754    1
712    0
752    0
734    1
723    0
750    1
740    1
735    0
737    1
739    1
738    1
720    0
729    1
708    1
709    1
730    1
727    0
741    1
726    1
753    1
719    0
744    0
762    1
714    1
758    1
747    1
Name: labels, dtype: int64

### Evaluation metrics

In [14]:
from sklearn.metrics import f1_score

f1_score(Y_test, predictions)

0.8695652173913043