# feature extraction and embeddings

In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [2]:
df = pd.read_csv('./spooky.csv')

In [3]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [4]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [5]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [6]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [8]:
y_train.value_counts()

author_encoded
0    5530
2    4231
1    3944
Name: count, dtype: int64

In [9]:
y_test.value_counts()

author_encoded
0    2370
2    1813
1    1691
Name: count, dtype: int64

In [10]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
2    2.333701
1    2.332348
Name: count, dtype: float64

## G. vectorisation (embeddings de mots)

fastText

In [18]:
fasttext_model = FastText(sentences=curpos, vector_size=100, window=5, min_count=1, workers=4)

In [28]:
def apply_fasttest_model(sentence):
    words = sentence.split()
    word_vectors = [fasttext_model.wv[word] for word in words if word in fasttext_model.wv]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(fasttext_model.vector_size)

x_train_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_train])
x_test_fasttext = np.array([apply_fasttest_model(sentence) for sentence in x_test])

In [24]:
x_train_fasttext

array([[ 0.26527318,  0.18841845, -0.23513998, ..., -0.73363596,
         0.12699072, -0.05921701],
       [ 0.28420934,  0.32516658, -0.34486327, ..., -0.7260664 ,
         0.10820561,  0.02408916],
       [ 0.3385909 ,  0.45310947, -0.35663137, ..., -0.79677194,
         0.03257429,  0.0967418 ],
       ...,
       [ 0.3034159 ,  0.20143126, -0.22385566, ..., -0.61429125,
         0.06679798, -0.13495952],
       [ 0.22940598,  0.47807384, -0.3363528 , ..., -0.8541437 ,
         0.18164004,  0.16867167],
       [ 0.4007625 ,  0.21672821, -0.43876672, ..., -0.62190765,
         0.06015337, -0.05460074]], dtype=float32)

In [25]:
x_train_fasttext.shape

(13705, 100)

## H. entrainement/test

result

In [34]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(20,), max_iter=20, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [35]:
print("fasttext results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

fasttext results:


Train Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.67      0.57      5530
           1       0.46      0.28      0.35      3944
           2       0.51      0.46      0.49      4231

    accuracy                           0.50     13705
   macro avg       0.49      0.47      0.47     13705
weighted avg       0.49      0.50      0.48     13705

Test Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.66      0.57      2370
           1       0.46      0.28      0.35      1691
           2       0.49      0.46      0.48      1813

    accuracy                           0.49      5874
   macro avg       0.49      0.47      0.47      5874
weighted avg       0.49      0.49      0.48      5874

Prediction time: 0.01271 seconds


