# train a multinominal classifier

## load and process the data with gensim

In [1]:
import numpy as np
import gensim
from sklearn.model_selection import train_test_split
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.punkt import PunktLanguageVars
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/chen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/chen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/chen/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
def preprocessing(text:str) -> str:
    pattern = re.compile(r"\w+\s*'\s*\w+") # Pattern to match ' with optional spaces before and after
    text = re.sub(pattern, lambda m: m.group().replace(" ", ""),text)
    stop_words = set(stopwords.words('french'))

    text = text.lower()

    french_tokenizer = PunktLanguageVars()
    tokens = word_tokenize(text, language='french')

    tokens = [token for token in tokens if token not in stop_words and token.isalnum()]

    return " ".join(tokens)

df1 = pd.read_csv("../disgust.csv")
df2 = pd.read_csv("../love.csv")
data = pd.concat([df1,df2], ignore_index=True)
data.head()

Unnamed: 0,excerpt,emotions
0,— Le recueil ' e ' mentdecevéhiculesefaitexcep...,disgust
1,Mycélium d ’ un champignon ( Claviceps purpure...,disgust
2,"Un chien , auquel on en fit avaler , succomba ...",disgust
3,""" La trompe d ' Eustache nous amène tout natur...",disgust
4,galop en me lançant des regards terribles .Le ...,disgust


In [6]:
data["excerpt_clean"] = data['excerpt'].apply(preprocessing)
data['emotion'] = data['emotions'].map({"love":0, "sadness":1})
data["excerpt_clean"].head()

0    recueil e mentdecevéhiculesefaitexceptionnelle...
1    mycélium champignon claviceps purpurea pousse ...
2    chien auquel fit avaler succomba bout treize j...
3    trompe amène tout naturellement affection cais...
4    galop lançant regards terribles défilé pensée ...
Name: excerpt_clean, dtype: object

In [None]:
X_train,X_test,y_train,y_test = train_test_split(data["excerpt_clean"],data["emotion"],test_size=0.2)
X_train.head()

KeyError: 'experts_clean'

## Word embedding with word2vec

In [23]:
w2v_model = gensim.models.Word2Vec(
    X_train,
    vector_size=100,
    window=5,
    min_count=2
)

vocabulary = set(w2v_model.wv.index_to_key)

In [25]:
def article2vec(expert, vocab, model):
    vectors = [model.wv[word] for word in expert if word in vocab]
    if len(vectors) > 0:
        return np.mean(vectors,axis=0) # simply taking the mean of vectors to build the vector representation of expert
    else:
        return np.zeros(model.vector_size) # if non, return a vector size length vector with 0
    
X_train_vect = np.array([article2vec(expert,vocabulary,w2v_model ) for expert in X_train])
X_test_vect = np.array([article2vec(expert,vocabulary,w2v_model ) for expert in X_test])

In [26]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf_model = rf.fit(X_train_vect, y_train.values.ravel())
y_pred = rf_model.predict(X_test_vect)

In [27]:
# evaluate
from sklearn.metrics import precision_score, recall_score

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
print('Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))


Precision: 0.444 / Recall: 0.062 / Accuracy: 0.693


In [28]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

svm = SVC(kernel="linear", class_weight="balanced") # without balanced, svm will prior majority class
svm.fit(X_train_vect,y_train)
y_pred = svm.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
# perfomrs better on bigger dataset

Accuracy: 0.35362997658079626
              precision    recall  f1-score   support

           0       1.00      0.07      0.14       298
           1       0.32      1.00      0.48       129

    accuracy                           0.35       427
   macro avg       0.66      0.54      0.31       427
weighted avg       0.79      0.35      0.24       427

