# feature extraction and embeddings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import gensim.downloader as api
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [2]:
df = pd.read_csv('./spooky.csv')

In [3]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [4]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [5]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [6]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

In [17]:
x_train

8446     upon these two words therefore i have mainly b...
15332    i had the good fortune to recollect that in th...
4486     each one of a gang so placed is not so much gr...
19359         if he were vanquished i should be a free man
6440     among a multitude of opinions upon this delica...
                               ...                        
7389     at these times he would shew a sardonic humour...
15422    made sign talk as soon as they got over bein s...
2581      there was no discoloration in the cellular tisue
4597                        his forehead was broad and low
1951     as i perused this i felt myself growing gradua...
Name: text, Length: 13705, dtype: object

## G. vectorisation (embeddings de mots)

glove

In [9]:
words = [word for sentence in x_train for word in sentence.split()]
words

['upon',
 'these',
 'two',
 'words',
 'therefore',
 'i',
 'have',
 'mainly',
 'built',
 'my',
 'hopes',
 'of',
 'a',
 'full',
 'solution',
 'of',
 'the',
 'ridle',
 'i',
 'had',
 'the',
 'good',
 'fortune',
 'to',
 'recollect',
 'that',
 'in',
 'the',
 'accentuation',
 'of',
 'this',
 'drama',
 'or',
 'at',
 'least',
 'of',
 'such',
 'portion',
 'of',
 'it',
 'as',
 'is',
 'allotted',
 'to',
 'the',
 'hero',
 'the',
 'tones',
 'of',
 'voice',
 'in',
 'which',
 'i',
 'found',
 'myself',
 'deficient',
 'were',
 'altogether',
 'unecesary',
 'and',
 'the',
 'deep',
 'guttural',
 'was',
 'expected',
 'to',
 'reign',
 'monotonously',
 'throughout',
 'each',
 'one',
 'of',
 'a',
 'gang',
 'so',
 'placed',
 'is',
 'not',
 'so',
 'much',
 'greedy',
 'of',
 'reward',
 'or',
 'anxious',
 'for',
 'escape',
 'as',
 'fearful',
 'of',
 'betrayal',
 'if',
 'he',
 'were',
 'vanquished',
 'i',
 'should',
 'be',
 'a',
 'free',
 'man',
 'among',
 'a',
 'multitude',
 'of',
 'opinions',
 'upon',
 'this',
 '

In [10]:
vocab = list(set(words))
vocab

['tho',
 'skimmed',
 'inroads',
 'extinguishing',
 'connection',
 'lodger',
 'dais',
 'pan',
 'alien',
 'aspects',
 'punctured',
 'incapable',
 'hu',
 'woes',
 'adept',
 'scarce',
 'elaborate',
 'chanels',
 'character',
 'broader',
 'bodied',
 'baiae',
 'sek',
 'tranquilitatis',
 'immediate',
 'puerile',
 'floor',
 'entitled',
 'acidents',
 'depositions',
 'seven',
 'professors',
 'gets',
 'maintains',
 'ratiocination',
 'bred',
 'sepulchres',
 'tunnels',
 'introduction',
 'classifying',
 'thirties',
 'frxwn',
 'lightsome',
 'teling',
 'scraps',
 'fabricate',
 'nxr',
 'windowles',
 'marvel',
 'fancied',
 'drum',
 'paranoiac',
 'genlman',
 'makes',
 'sulphuric',
 'ways',
 'pul',
 'moribund',
 'trencher',
 'triple',
 'meanderings',
 'vul',
 'sospiri',
 'scientifically',
 'sternly',
 'bison',
 'hoisted',
 'props',
 'homogeneity',
 'fetching',
 'paroxysm',
 'barometers',
 'recommend',
 'snufy',
 'garden',
 'tropical',
 'setled',
 'estimating',
 'lapped',
 'pane',
 'camphor',
 'buries',
 'c

In [11]:
word2idx = {word: i for i, word in enumerate(vocab)}
word2idx

{'tho': 0,
 'skimmed': 1,
 'inroads': 2,
 'extinguishing': 3,
 'connection': 4,
 'lodger': 5,
 'dais': 6,
 'pan': 7,
 'alien': 8,
 'aspects': 9,
 'punctured': 10,
 'incapable': 11,
 'hu': 12,
 'woes': 13,
 'adept': 14,
 'scarce': 15,
 'elaborate': 16,
 'chanels': 17,
 'character': 18,
 'broader': 19,
 'bodied': 20,
 'baiae': 21,
 'sek': 22,
 'tranquilitatis': 23,
 'immediate': 24,
 'puerile': 25,
 'floor': 26,
 'entitled': 27,
 'acidents': 28,
 'depositions': 29,
 'seven': 30,
 'professors': 31,
 'gets': 32,
 'maintains': 33,
 'ratiocination': 34,
 'bred': 35,
 'sepulchres': 36,
 'tunnels': 37,
 'introduction': 38,
 'classifying': 39,
 'thirties': 40,
 'frxwn': 41,
 'lightsome': 42,
 'teling': 43,
 'scraps': 44,
 'fabricate': 45,
 'nxr': 46,
 'windowles': 47,
 'marvel': 48,
 'fancied': 49,
 'drum': 50,
 'paranoiac': 51,
 'genlman': 52,
 'makes': 53,
 'sulphuric': 54,
 'ways': 55,
 'pul': 56,
 'moribund': 57,
 'trencher': 58,
 'triple': 59,
 'meanderings': 60,
 'vul': 61,
 'sospiri': 62

In [12]:
idx2word = {i: word for word, i in word2idx.items()}
idx2word

{0: 'tho',
 1: 'skimmed',
 2: 'inroads',
 3: 'extinguishing',
 4: 'connection',
 5: 'lodger',
 6: 'dais',
 7: 'pan',
 8: 'alien',
 9: 'aspects',
 10: 'punctured',
 11: 'incapable',
 12: 'hu',
 13: 'woes',
 14: 'adept',
 15: 'scarce',
 16: 'elaborate',
 17: 'chanels',
 18: 'character',
 19: 'broader',
 20: 'bodied',
 21: 'baiae',
 22: 'sek',
 23: 'tranquilitatis',
 24: 'immediate',
 25: 'puerile',
 26: 'floor',
 27: 'entitled',
 28: 'acidents',
 29: 'depositions',
 30: 'seven',
 31: 'professors',
 32: 'gets',
 33: 'maintains',
 34: 'ratiocination',
 35: 'bred',
 36: 'sepulchres',
 37: 'tunnels',
 38: 'introduction',
 39: 'classifying',
 40: 'thirties',
 41: 'frxwn',
 42: 'lightsome',
 43: 'teling',
 44: 'scraps',
 45: 'fabricate',
 46: 'nxr',
 47: 'windowles',
 48: 'marvel',
 49: 'fancied',
 50: 'drum',
 51: 'paranoiac',
 52: 'genlman',
 53: 'makes',
 54: 'sulphuric',
 55: 'ways',
 56: 'pul',
 57: 'moribund',
 58: 'trencher',
 59: 'triple',
 60: 'meanderings',
 61: 'vul',
 62: 'sospiri'

In [13]:
vocab_size = len(word2idx)
vocab_size

23413

In [18]:
curpos = [sentence.split() for sentence in x_train]
curpos

[['upon',
  'these',
  'two',
  'words',
  'therefore',
  'i',
  'have',
  'mainly',
  'built',
  'my',
  'hopes',
  'of',
  'a',
  'full',
  'solution',
  'of',
  'the',
  'ridle'],
 ['i',
  'had',
  'the',
  'good',
  'fortune',
  'to',
  'recollect',
  'that',
  'in',
  'the',
  'accentuation',
  'of',
  'this',
  'drama',
  'or',
  'at',
  'least',
  'of',
  'such',
  'portion',
  'of',
  'it',
  'as',
  'is',
  'allotted',
  'to',
  'the',
  'hero',
  'the',
  'tones',
  'of',
  'voice',
  'in',
  'which',
  'i',
  'found',
  'myself',
  'deficient',
  'were',
  'altogether',
  'unecesary',
  'and',
  'the',
  'deep',
  'guttural',
  'was',
  'expected',
  'to',
  'reign',
  'monotonously',
  'throughout'],
 ['each',
  'one',
  'of',
  'a',
  'gang',
  'so',
  'placed',
  'is',
  'not',
  'so',
  'much',
  'greedy',
  'of',
  'reward',
  'or',
  'anxious',
  'for',
  'escape',
  'as',
  'fearful',
  'of',
  'betrayal'],
 ['if', 'he', 'were', 'vanquished', 'i', 'should', 'be', 'a',

co-occurance matrix

In [None]:

# Example text
text = """
I love natural language processing and I love deep learning.
Natural language processing is fascinating.
Deep learning is a subset of machine learning.
"""

# Tokenize into sentences and words
sentences = [word_tokenize(sentence.lower()) for sentence in sent_tokenize(text)]
print("Tokenized Sentences:\n", sentences)

Co-occurrence Matrix:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


fastText

## H. entrainement/test

In [None]:
def get_average_vector(text, model):
    words = word_tokenize(text)
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [None]:
x_train_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_train])
x_test_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_test])

In [None]:
x_train_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_train])
x_test_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_test])

result

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [None]:
print("Word2Vec Results:")
train_and_evaluate(x_train_word2vec, x_test_word2vec, y_train, y_test)

Word2Vec Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      5529
           1       0.00      0.00      0.00      3944
           2       0.00      0.00      0.00      4230

    accuracy                           0.40     13703
   macro avg       0.13      0.33      0.19     13703
weighted avg       0.16      0.40      0.23     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.00      0.00      0.00      1813

    accuracy                           0.40      5873
   macro avg       0.13      0.33      0.19      5873
weighted avg       0.16      0.40      0.23      5873

Prediction time: 0.00778 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("FastText Results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

FastText Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      5529
           1       0.67      0.00      0.00      3944
           2       0.33      0.02      0.03      4230

    accuracy                           0.40     13703
   macro avg       0.47      0.33      0.20     13703
weighted avg       0.46      0.40      0.24     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.29      0.02      0.03      1813

    accuracy                           0.40      5873
   macro avg       0.23      0.33      0.20      5873
weighted avg       0.25      0.40      0.24      5873

Prediction time: 0.01157 seconds
