# feature extraction and embeddings

In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [3]:
df = pd.read_csv('./spooky.csv')

In [4]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [5]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [6]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [7]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [8]:
# x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])



splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)

for train_idx, test_idx in splitter.split(df['text'], df['author_encoded']):
    x_train, x_test = df['text'].iloc[train_idx], df['text'].iloc[test_idx]
    y_train, y_test = df['author_encoded'].iloc[train_idx], df['author_encoded'].iloc[test_idx]

obtenir une repartition similaire dans chaque classe du dataset

In [9]:
y_train.value_counts()

author_encoded
0    5530
2    4231
1    3944
Name: count, dtype: int64

In [10]:
y_test.value_counts()

author_encoded
0    2370
2    1813
1    1691
Name: count, dtype: int64

In [11]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
2    2.333701
1    2.332348
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset

In [12]:
# binary false for frequence lexicale, binary true for one-hot encoding
count_vectorizer = CountVectorizer(binary=False,analyzer= 'word', stop_words='english')

In [13]:
x_train_cv = count_vectorizer.fit_transform(x_train)
x_test_cv = count_vectorizer.transform(x_test)

In [14]:
x_train_cv.shape, x_test_cv.shape

((13705, 23082), (5874, 23082))

In [15]:
x_train_cv = pd.DataFrame(data = x_train_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_train_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
x_test_cv = pd.DataFrame(data = x_test_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_test_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


TF-IDF

In [17]:
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [18]:
x_train_tfidf = pd.DataFrame(data = x_train_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_train_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
x_test_tfidf = pd.DataFrame(data = x_test_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_test_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## E. entrainement

cree 3 models du type MLP classifier

In [20]:
mlp_count =  MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

mlp_tfidf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

mlp_onehot = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

entrainement

In [21]:
mlp_count.fit(x_train_cv, y_train)
mlp_tfidf.fit(x_train_tfidf, y_train)
mlp_onehot.fit(x_train_cv, y_train) 

prediction (training)

In [22]:
y_train_pred_count = mlp_count.predict(x_train_cv)
y_train_pred_tfidf = mlp_tfidf.predict(x_train_tfidf)
y_train_pred_onehot = mlp_onehot.predict(x_train_cv)

rapport de classification

In [23]:
def model__prediction(title, y_train, y_train_pred):
    print(title)
    print(classification_report(y_train, y_train_pred))

In [24]:
model__prediction('count vectorizer model:', y_train, y_train_pred_count)

count vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [25]:
model__prediction('tf-idf vectorizer model:', y_train, y_train_pred_tfidf)

tf-idf vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [26]:
model__prediction('one-hot encoding model:', y_train, y_train_pred_onehot)

one-hot encoding model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



## F. test

prediction (test)

In [27]:
y_test_pred_count = mlp_count.predict(x_test_cv)
y_test_pred_tfidf = mlp_tfidf.predict(x_test_tfidf)
y_test_pred_onehot = mlp_onehot.predict(x_test_cv)

rapport de classification

In [28]:
model__prediction('count vectorizer model (test):', y_test, y_test_pred_count)

count vectorizer model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2370
           1       0.77      0.71      0.74      1691
           2       0.70      0.76      0.73      1813

    accuracy                           0.74      5874
   macro avg       0.74      0.73      0.74      5874
weighted avg       0.74      0.74      0.74      5874



In [29]:
model__prediction('tf-idf vectorizer model (test):', y_test, y_test_pred_tfidf)

tf-idf vectorizer model (test):
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      2370
           1       0.83      0.77      0.80      1691
           2       0.79      0.79      0.79      1813

    accuracy                           0.80      5874
   macro avg       0.80      0.80      0.80      5874
weighted avg       0.80      0.80      0.80      5874



In [30]:
model__prediction('one-hot encoding model (test):', y_test, y_test_pred_onehot)

one-hot encoding model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2370
           1       0.77      0.71      0.74      1691
           2       0.70      0.76      0.73      1813

    accuracy                           0.74      5874
   macro avg       0.74      0.73      0.74      5874
weighted avg       0.74      0.74      0.74      5874



temps de prediction

In [31]:
start_time = time.time()
mlp_count.predict(x_test_cv)
print(f"count vectorizer prediction time: {time.time() - start_time} seconds")


count vectorizer prediction time: 2.545112133026123 seconds


In [32]:
start_time = time.time()
mlp_tfidf.predict(x_test_tfidf)
print(f"tfidf vectorizer prediction time: {time.time() - start_time} seconds")

tfidf vectorizer prediction time: 1.8861758708953857 seconds


In [33]:
start_time = time.time()
mlp_onehot.predict(x_test_cv)
print(f"One-Hot Encoding Prediction Time: {time.time() - start_time} seconds")

One-Hot Encoding Prediction Time: 1.6840486526489258 seconds
