# feature extraction and embeddings

In [33]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import time

## A. preparation de donnees

In [34]:
df = pd.read_csv('./spooky.csv')

In [35]:
df.head()

Unnamed: 0,id,text,author,tokens_space,tokens_rule,tokens_subword,entities,pos_tags,lemmatized,stemmed,great_count,impossible_count,sentiment
0,id26305,process however afforded means ascertaining di...,EAP,"['process', 'however', 'afforded', 'means', 'a...","['process', 'however', 'afforded', 'means', 'a...","['process', 'however', 'afforded', 'means', 'a...",[],"[('process', 'NOUN'), ('however', 'ADV'), ('af...",process however afforded mean ascertaining dim...,process howev afford mean ascertain dimens dun...,0,0,-0.5216
1,id17569,never occurred fumbling might mere mistake,HPL,"['never', 'occurred', 'fumbling', 'might', 'me...","['never', 'occurred', 'fumbling', 'might', 'me...","['never', 'occurred', 'fu', '##mbling', 'might...",[],"[('never', 'ADV'), ('occurred', 'VERB'), ('fum...",never occurred fumbling might mere mistake,never occur fumbl might mere mistak,0,0,-0.34
2,id11008,left hand gold snuff box capered hill cutting ...,EAP,"['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['left', 'hand', 'gold', 'snuff', 'box', 'cape...","['left', 'hand', 'gold', 's', '##nu', '##ff', ...","[('gold snuff', 'ORG'), ('hill cutting manner ...","[('left', 'VERB'), ('hand', 'NOUN'), ('gold', ...",left hand gold snuff box capered hill cutting ...,left hand gold snuff box caper hill cut manner...,1,0,0.9246
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS,"['lovely', 'spring', 'looked', 'windsor', 'ter...","['lovely', 'spring', 'looked', 'windsor', 'ter...","['lovely', 'spring', 'looked', 'windsor', 'ter...","[('sixteen', 'CARDINAL'), ('years', 'DATE')]","[('lovely', 'ADJ'), ('spring', 'NOUN'), ('look...",lovely spring looked windsor terrace sixteen f...,love spring look windsor terrac sixteen fertil...,0,0,0.946
4,id12958,finding nothing else even gold superintendent ...,HPL,"['finding', 'nothing', 'else', 'even', 'gold',...","['finding', 'nothing', 'else', 'even', 'gold',...","['finding', 'nothing', 'else', 'even', 'gold',...",[],"[('finding', 'VERB'), ('nothing', 'PRON'), ('e...",finding nothing else even gold superintendent ...,find noth els even gold superintend abandon at...,0,0,-0.8078


In [36]:
df = df[['id', 'text', 'author', 'tokens_subword']]

In [37]:
df.head()

Unnamed: 0,id,text,author,tokens_subword
0,id26305,process however afforded means ascertaining di...,EAP,"['process', 'however', 'afforded', 'means', 'a..."
1,id17569,never occurred fumbling might mere mistake,HPL,"['never', 'occurred', 'fu', '##mbling', 'might..."
2,id11008,left hand gold snuff box capered hill cutting ...,EAP,"['left', 'hand', 'gold', 's', '##nu', '##ff', ..."
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS,"['lovely', 'spring', 'looked', 'windsor', 'ter..."
4,id12958,finding nothing else even gold superintendent ...,HPL,"['finding', 'nothing', 'else', 'even', 'gold',..."


In [38]:
df['text'].isnull().sum()

3

In [39]:
df = df.dropna(subset=['text'])

In [40]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [41]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [42]:
df.head()

Unnamed: 0,id,text,author,tokens_subword,author_encoded
0,id26305,process however afforded means ascertaining di...,EAP,"['process', 'however', 'afforded', 'means', 'a...",0
1,id17569,never occurred fumbling might mere mistake,HPL,"['never', 'occurred', 'fu', '##mbling', 'might...",1
2,id11008,left hand gold snuff box capered hill cutting ...,EAP,"['left', 'hand', 'gold', 's', '##nu', '##ff', ...",0
3,id27763,lovely spring looked windsor terrace sixteen f...,MWS,"['lovely', 'spring', 'looked', 'windsor', 'ter...",2
4,id12958,finding nothing else even gold superintendent ...,HPL,"['finding', 'nothing', 'else', 'even', 'gold',...",1


## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [43]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'].values)

obtenir une repartition similaire dans chaque classe du dataset

In [44]:
y_train.value_counts()

author_encoded
0    5529
2    4230
1    3944
Name: count, dtype: int64

In [45]:
y_test.value_counts()

author_encoded
0    2369
2    1813
1    1691
Name: count, dtype: int64

In [46]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333896
2    2.333149
1    2.332348
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset (frequence lexicale / one-hot encoding) (bag of words)

In [47]:
# binary false for frequence lexicale, binary true for one-hot encoding
count_vectorizer = CountVectorizer(binary=False,analyzer= 'word', stop_words='english')

In [48]:
x_train_cv = count_vectorizer.fit_transform(x_train)
x_test_cv = count_vectorizer.transform(x_test)

TF-IDF

In [49]:
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

## E. entrainement

cree 3 models du type MLP classifier

In [50]:
mlp_count = MLPClassifier(random_state=0)
mlp_tfidf = MLPClassifier(random_state=0)
mlp_onehot = MLPClassifier(random_state=0)

entrainement

In [51]:
mlp_count.fit(x_train_cv, y_train)
mlp_tfidf.fit(x_train_tfidf, y_train)
mlp_onehot.fit(x_train_cv, y_train) 

prediction (training)

In [52]:
y_train_pred_count = mlp_count.predict(x_train_cv)
y_train_pred_tfidf = mlp_tfidf.predict(x_train_tfidf)
y_train_pred_onehot = mlp_onehot.predict(x_train_cv)

rapport de classification

In [53]:
print("count vectorizer model:")
print(classification_report(y_train, y_train_pred_count))

count vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5529
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4230

    accuracy                           1.00     13703
   macro avg       1.00      1.00      1.00     13703
weighted avg       1.00      1.00      1.00     13703



In [54]:
print("tfidf vectorizer model:")
print(classification_report(y_train, y_train_pred_tfidf))

tfidf vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5529
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4230

    accuracy                           1.00     13703
   macro avg       1.00      1.00      1.00     13703
weighted avg       1.00      1.00      1.00     13703



In [55]:
print("one-hot encoding model:")
print(classification_report(y_train, y_train_pred_onehot))

one-hot encoding model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5529
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4230

    accuracy                           1.00     13703
   macro avg       1.00      1.00      1.00     13703
weighted avg       1.00      1.00      1.00     13703



## F. test

prediction (test)

In [56]:
y_test_pred_count = mlp_count.predict(x_test_cv)
y_test_pred_tfidf = mlp_tfidf.predict(x_test_tfidf)
y_test_pred_onehot = mlp_onehot.predict(x_test_cv)

rapport de classification

In [57]:
print("count vectorizer model (test):")
print(classification_report(y_test, y_test_pred_count))

count vectorizer model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2369
           1       0.77      0.71      0.74      1691
           2       0.71      0.76      0.73      1813

    accuracy                           0.74      5873
   macro avg       0.74      0.74      0.74      5873
weighted avg       0.74      0.74      0.74      5873



In [58]:
print("tfidf vectorizer model (test):")
print(classification_report(y_test, y_test_pred_tfidf))

tfidf vectorizer model (test):
              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2369
           1       0.80      0.77      0.79      1691
           2       0.76      0.79      0.77      1813

    accuracy                           0.78      5873
   macro avg       0.78      0.78      0.78      5873
weighted avg       0.78      0.78      0.78      5873



In [59]:
print("one-hot encoding model (test):")
print(classification_report(y_test, y_test_pred_onehot))

one-hot encoding model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2369
           1       0.77      0.71      0.74      1691
           2       0.71      0.76      0.73      1813

    accuracy                           0.74      5873
   macro avg       0.74      0.74      0.74      5873
weighted avg       0.74      0.74      0.74      5873



temps de prediction

In [60]:
start_time = time.time()
mlp_count.predict(x_test_cv)
print(f"count vectorizer prediction time: {time.time() - start_time} seconds")


count vectorizer prediction time: 0.02947688102722168 seconds


In [61]:
start_time = time.time()
mlp_tfidf.predict(x_test_tfidf)
print(f"tfidf vectorizer prediction time: {time.time() - start_time} seconds")

tfidf vectorizer prediction time: 0.029552936553955078 seconds


In [62]:
start_time = time.time()
mlp_onehot.predict(x_test_cv)
print(f"One-Hot Encoding Prediction Time: {time.time() - start_time} seconds")

One-Hot Encoding Prediction Time: 0.013795137405395508 seconds


## G. vectorisation (embeddings de mots)

word2vec

In [63]:
tokenized_text = [sentence.split() for sentence in x_train]

word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

x_train_word2vec = [vectorize_text(text, word2vec_model) for text in tokenized_text]

glove

fastText

In [64]:
fasttext_model = FastText(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

x_train_fasttext = [vectorize_text(text, fasttext_model) for text in tokenized_text]

## H. entrainement/test

In [65]:
def get_average_vector(text, model):
    words = word_tokenize(text)
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [66]:
x_train_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_train])
x_test_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_test])

In [67]:
x_train_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_train])
x_test_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_test])

result

In [68]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [69]:
print("Word2Vec Results:")
train_and_evaluate(x_train_word2vec, x_test_word2vec, y_train, y_test)

Word2Vec Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      5529
           1       0.00      0.00      0.00      3944
           2       0.00      0.00      0.00      4230

    accuracy                           0.40     13703
   macro avg       0.13      0.33      0.19     13703
weighted avg       0.16      0.40      0.23     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.00      0.00      0.00      1813

    accuracy                           0.40      5873
   macro avg       0.13      0.33      0.19      5873
weighted avg       0.16      0.40      0.23      5873

Prediction time: 0.00778 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
print("FastText Results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

FastText Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      5529
           1       0.67      0.00      0.00      3944
           2       0.33      0.02      0.03      4230

    accuracy                           0.40     13703
   macro avg       0.47      0.33      0.20     13703
weighted avg       0.46      0.40      0.24     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.29      0.02      0.03      1813

    accuracy                           0.40      5873
   macro avg       0.23      0.33      0.20      5873
weighted avg       0.25      0.40      0.24      5873

Prediction time: 0.01157 seconds
