# feature extraction and embeddings

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [14]:
df = pd.read_csv('./spooky.csv')

In [15]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [16]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [17]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [18]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [19]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [20]:
y_train.value_counts()

author_encoded
0    5530
2    4231
1    3944
Name: count, dtype: int64

In [21]:
y_test.value_counts()

author_encoded
0    2370
2    1813
1    1691
Name: count, dtype: int64

In [22]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
2    2.333701
1    2.332348
Name: count, dtype: float64

## G. vectorisation (embeddings de mots)

fastText

In [23]:
fasttext_model = FastText(sentences=x_train, vector_size=100, window=5, min_count=1, workers=4)

In [24]:
def apply_fasttest_model(sentence):
    words = sentence.split()
    word_vectors = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(fasttext_model.vector_size)

x_train_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_train])
x_test_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_test])

In [25]:
x_train_fasttext

array([[-2.6414480e-02, -5.6211394e-03,  6.2599243e-03, ...,
         9.3179084e-03, -3.0198617e-02, -1.7895570e-02],
       [-1.5246296e-02,  4.2545707e-03, -9.6049653e-03, ...,
         9.0960839e-06, -4.4681113e-03, -3.6076030e-03],
       [-3.5161825e-03, -9.1566071e-03,  1.5348463e-02, ...,
         6.9344011e-03, -1.8878652e-02, -1.1185233e-02],
       ...,
       [ 1.0763680e-03,  1.0093573e-03,  5.3692912e-04, ...,
        -1.0471374e-03, -1.1869870e-03, -3.0778430e-04],
       [-3.3312928e-04,  2.1707702e-03,  1.3398016e-03, ...,
        -3.2499383e-04,  5.9631170e-04,  1.0990292e-03],
       [-3.0516753e-02,  7.6204189e-03, -1.8517317e-02, ...,
         2.5697768e-04, -9.7685754e-03, -6.6769491e-03]], dtype=float32)

In [26]:
x_train_fasttext.shape

(13705, 100)

## H. entrainement/test

result

In [27]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(20,), max_iter=20, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [28]:
print("fasttext results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

fasttext results:


Train Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.84      0.56      5530
           1       0.74      0.01      0.02      3944
           2       0.42      0.25      0.31      4231

    accuracy                           0.42     13705
   macro avg       0.52      0.37      0.30     13705
weighted avg       0.51      0.42      0.33     13705

Test Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.82      0.55      2370
           1       0.56      0.01      0.01      1691
           2       0.41      0.24      0.30      1813

    accuracy                           0.41      5874
   macro avg       0.46      0.36      0.29      5874
weighted avg       0.45      0.41      0.32      5874

Prediction time: 0.00486 seconds




### CBOW with FastText

In [29]:
fasttext_model = FastText(sentences=x_train, vector_size=100, window=5, min_count=1, workers=4, sg=0)

In [30]:
x_train_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_train])
x_test_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_test])

In [31]:
x_train_fasttext

array([[-0.00013085,  0.0023482 , -0.02081653, ...,  0.00089923,
        -0.03118732, -0.03685814],
       [-0.0023779 , -0.00047567, -0.00875388, ..., -0.00084579,
        -0.00827542, -0.01572775],
       [ 0.00307296,  0.00284655, -0.00779148, ...,  0.00103728,
        -0.01527457, -0.01265171],
       ...,
       [ 0.00107637,  0.00100936,  0.00053693, ..., -0.00104714,
        -0.00118699, -0.00030778],
       [-0.00033313,  0.00217077,  0.0013398 , ..., -0.00032499,
         0.00059631,  0.00109903],
       [-0.0052749 , -0.00165814, -0.01684788, ..., -0.00141991,
        -0.01723676, -0.03045109]], dtype=float32)

In [32]:
x_train_fasttext.shape

(13705, 100)

In [33]:
print("fasttext results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

fasttext results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.84      0.56      5530
           1       0.64      0.00      0.00      3944
           2       0.42      0.25      0.31      4231

    accuracy                           0.42     13705
   macro avg       0.49      0.36      0.29     13705
weighted avg       0.48      0.42      0.32     13705

Test Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.82      0.55      2370
           1       0.50      0.00      0.00      1691
           2       0.40      0.24      0.30      1813

    accuracy                           0.41      5874
   macro avg       0.44      0.35      0.28      5874
weighted avg       0.43      0.41      0.31      5874

Prediction time: 0.00361 seconds


