# feature extraction and embeddings

In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [38]:
df = pd.read_csv('./spooky.csv')

In [39]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [40]:
df.shape

(19579, 3)

In [41]:
df = df[: 100]

In [42]:
df.shape

(100, 3)

In [15]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [16]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [17]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [18]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [19]:
y_train.value_counts()

author_encoded
0    28
2    22
1    20
Name: count, dtype: int64

In [20]:
y_test.value_counts()

author_encoded
0    12
1     9
2     9
Name: count, dtype: int64

In [21]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
1    2.222222
2    2.444444
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset

## G. vectorisation (embeddings de mots)

In [22]:
EPOCHS = 50
WINDOW_SIZE = 2
EMBEDDING_DIM = 5
LEARNING_RATE = 0.01

In [23]:
x_train.head()

23    with how deep a spirit of wonder and perplexit...
28    our compases depth gauges and other delicate i...
85    ahead lay sparse grass and scrub blueberry bus...
48    dr johnson as i beheld him was a ful pursy man...
73    the next morning i delivered my letters of int...
Name: text, dtype: object

In [24]:
words = [word for sentence in x_train for word in sentence.split()]
words

['with',
 'how',
 'deep',
 'a',
 'spirit',
 'of',
 'wonder',
 'and',
 'perplexity',
 'was',
 'i',
 'wont',
 'to',
 'regard',
 'him',
 'from',
 'our',
 'remote',
 'pew',
 'in',
 'the',
 'galery',
 'as',
 'with',
 'step',
 'solemn',
 'and',
 'slow',
 'he',
 'ascended',
 'the',
 'pulpit',
 'this',
 'reverend',
 'man',
 'with',
 'countenance',
 'so',
 'demurely',
 'benign',
 'with',
 'robes',
 'so',
 'glossy',
 'and',
 'so',
 'clericaly',
 'flowing',
 'with',
 'wig',
 'so',
 'minutely',
 'powdered',
 'so',
 'rigid',
 'and',
 'so',
 'vast',
 'could',
 'this',
 'be',
 'he',
 'who',
 'of',
 'late',
 'with',
 'sour',
 'visage',
 'and',
 'in',
 'snufy',
 'habiliments',
 'administered',
 'ferule',
 'in',
 'hand',
 'the',
 'draconian',
 'laws',
 'of',
 'the',
 'academy',
 'our',
 'compases',
 'depth',
 'gauges',
 'and',
 'other',
 'delicate',
 'instruments',
 'were',
 'ruined',
 'so',
 'that',
 'henceforth',
 'our',
 'only',
 'reckoning',
 'would',
 'be',
 'gueswork',
 'based',
 'on',
 'our',
 'w

In [25]:
vocab = list(set(words))
vocab

['slovenly',
 'atempts',
 'calamity',
 'gold',
 'little',
 'afore',
 'quality',
 'shield',
 'car',
 'committed',
 'tribe',
 'present',
 'keep',
 'endless',
 'draperies',
 'spy',
 'impunity',
 'would',
 'flowing',
 'rem',
 'fairly',
 'chance',
 'glen',
 'out',
 'met',
 'let',
 'time',
 'noble',
 'amidst',
 'provided',
 'genial',
 'have',
 'sputer',
 'we',
 'warriors',
 'breath',
 'mouth',
 'knew',
 'before',
 'courage',
 'better',
 'conning',
 'moved',
 'certainly',
 'verge',
 'near',
 'finding',
 'bursts',
 'make',
 'nose',
 'obedience',
 'kep',
 'changes',
 'paleness',
 'dyin',
 'he',
 'wiliam',
 'states',
 'murdered',
 'kind',
 'without',
 'extravagant',
 'blueberry',
 'boats',
 'parts',
 'box',
 'often',
 'magazine',
 'beauty',
 'each',
 'ultimate',
 'birth',
 'not',
 'helples',
 'facial',
 'water',
 'galopin',
 'state',
 'wig',
 'gentle',
 'daown',
 'gently',
 'friendship',
 'moments',
 'wars',
 'exist',
 'cold',
 'california',
 'get',
 'must',
 'neither',
 'evening',
 'youth',
 'a

In [26]:
word2idx = {word: i for i, word in enumerate(vocab)}
word2idx

{'slovenly': 0,
 'atempts': 1,
 'calamity': 2,
 'gold': 3,
 'little': 4,
 'afore': 5,
 'quality': 6,
 'shield': 7,
 'car': 8,
 'committed': 9,
 'tribe': 10,
 'present': 11,
 'keep': 12,
 'endless': 13,
 'draperies': 14,
 'spy': 15,
 'impunity': 16,
 'would': 17,
 'flowing': 18,
 'rem': 19,
 'fairly': 20,
 'chance': 21,
 'glen': 22,
 'out': 23,
 'met': 24,
 'let': 25,
 'time': 26,
 'noble': 27,
 'amidst': 28,
 'provided': 29,
 'genial': 30,
 'have': 31,
 'sputer': 32,
 'we': 33,
 'warriors': 34,
 'breath': 35,
 'mouth': 36,
 'knew': 37,
 'before': 38,
 'courage': 39,
 'better': 40,
 'conning': 41,
 'moved': 42,
 'certainly': 43,
 'verge': 44,
 'near': 45,
 'finding': 46,
 'bursts': 47,
 'make': 48,
 'nose': 49,
 'obedience': 50,
 'kep': 51,
 'changes': 52,
 'paleness': 53,
 'dyin': 54,
 'he': 55,
 'wiliam': 56,
 'states': 57,
 'murdered': 58,
 'kind': 59,
 'without': 60,
 'extravagant': 61,
 'blueberry': 62,
 'boats': 63,
 'parts': 64,
 'box': 65,
 'often': 66,
 'magazine': 67,
 'beauty

In [27]:
idx2word = {i: word for word, i in word2idx.items()}
idx2word

{0: 'slovenly',
 1: 'atempts',
 2: 'calamity',
 3: 'gold',
 4: 'little',
 5: 'afore',
 6: 'quality',
 7: 'shield',
 8: 'car',
 9: 'committed',
 10: 'tribe',
 11: 'present',
 12: 'keep',
 13: 'endless',
 14: 'draperies',
 15: 'spy',
 16: 'impunity',
 17: 'would',
 18: 'flowing',
 19: 'rem',
 20: 'fairly',
 21: 'chance',
 22: 'glen',
 23: 'out',
 24: 'met',
 25: 'let',
 26: 'time',
 27: 'noble',
 28: 'amidst',
 29: 'provided',
 30: 'genial',
 31: 'have',
 32: 'sputer',
 33: 'we',
 34: 'warriors',
 35: 'breath',
 36: 'mouth',
 37: 'knew',
 38: 'before',
 39: 'courage',
 40: 'better',
 41: 'conning',
 42: 'moved',
 43: 'certainly',
 44: 'verge',
 45: 'near',
 46: 'finding',
 47: 'bursts',
 48: 'make',
 49: 'nose',
 50: 'obedience',
 51: 'kep',
 52: 'changes',
 53: 'paleness',
 54: 'dyin',
 55: 'he',
 56: 'wiliam',
 57: 'states',
 58: 'murdered',
 59: 'kind',
 60: 'without',
 61: 'extravagant',
 62: 'blueberry',
 63: 'boats',
 64: 'parts',
 65: 'box',
 66: 'often',
 67: 'magazine',
 68: 'be

In [28]:
vocab_size = len(word2idx)
vocab_size

1043

In [29]:
def generate_training_data(corpus, window_size):
    training_data = []
    for sentence in corpus:
        tokens = sentence.split()
        for i, target in enumerate(tokens):
            context_start = max(0, i - window_size)
            context_end = min(len(tokens), i + window_size + 1)
            for j in range(context_start, context_end):
                if i != j:
                    training_data.append((target, tokens[j]))
    return training_data

training_pairs = generate_training_data(x_train, WINDOW_SIZE)
training_pairs

[('with', 'how'),
 ('with', 'deep'),
 ('how', 'with'),
 ('how', 'deep'),
 ('how', 'a'),
 ('deep', 'with'),
 ('deep', 'how'),
 ('deep', 'a'),
 ('deep', 'spirit'),
 ('a', 'how'),
 ('a', 'deep'),
 ('a', 'spirit'),
 ('a', 'of'),
 ('spirit', 'deep'),
 ('spirit', 'a'),
 ('spirit', 'of'),
 ('spirit', 'wonder'),
 ('of', 'a'),
 ('of', 'spirit'),
 ('of', 'wonder'),
 ('of', 'and'),
 ('wonder', 'spirit'),
 ('wonder', 'of'),
 ('wonder', 'and'),
 ('wonder', 'perplexity'),
 ('and', 'of'),
 ('and', 'wonder'),
 ('and', 'perplexity'),
 ('and', 'was'),
 ('perplexity', 'wonder'),
 ('perplexity', 'and'),
 ('perplexity', 'was'),
 ('perplexity', 'i'),
 ('was', 'and'),
 ('was', 'perplexity'),
 ('was', 'i'),
 ('was', 'wont'),
 ('i', 'perplexity'),
 ('i', 'was'),
 ('i', 'wont'),
 ('i', 'to'),
 ('wont', 'was'),
 ('wont', 'i'),
 ('wont', 'to'),
 ('wont', 'regard'),
 ('to', 'i'),
 ('to', 'wont'),
 ('to', 'regard'),
 ('to', 'him'),
 ('regard', 'wont'),
 ('regard', 'to'),
 ('regard', 'him'),
 ('regard', 'from'),
 ('

In [30]:
def one_hot_encode(word, vocab_size):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

In [31]:
w1 = np.random.randn(vocab_size, EMBEDDING_DIM) 
w2 = np.random.randn(EMBEDDING_DIM, vocab_size) 

w1, w2

(array([[ 0.57808348, -0.44941274, -0.59547342, -0.20834122,  0.43269667],
        [ 0.0731849 ,  0.74941475,  0.31552233, -0.11652877,  2.91690762],
        [ 1.15535482, -0.25508097,  0.59011547,  0.94506131, -0.25941199],
        ...,
        [-0.46458744,  0.61611883,  1.10354216, -0.45376386,  1.49453995],
        [ 0.52960461,  1.25314752,  0.44206835,  1.14947591,  0.7322041 ],
        [-2.01788606,  0.75038341, -0.80644126, -0.47293836, -0.06147192]]),
 array([[-0.31828883,  1.46334329, -0.40074627, ..., -0.59590921,
         -0.04209981,  0.07130813],
        [-0.28119764, -1.49093918, -0.57386887, ..., -1.81554218,
         -0.0175976 , -2.65323574],
        [-0.47656384, -0.19985388,  1.31032252, ...,  0.60935749,
         -1.14373644,  0.14179412],
        [-0.33546129, -0.8382469 ,  0.10759369, ..., -0.66683825,
          1.02116027, -0.63798518],
        [ 1.63685262,  0.5169095 ,  0.61919581, ..., -0.27081419,
          0.26120988, -1.39715806]]))

In [32]:
for epoch in range(EPOCHS):
    total_loss = 0
    for target_word, context_word in training_pairs:

        # forward
        target_vec = one_hot_encode(target_word, vocab_size)
        hidden_layer = np.dot(w1.T, target_vec) 
        output_layer = np.dot(w2.T, hidden_layer) 
        # softmax
        predictions = np.exp(output_layer) / np.sum(np.exp(output_layer)) 

        # loss
        target_label = one_hot_encode(context_word, vocab_size)
        loss = -np.sum(target_label * np.log(predictions + 1e-9))
        total_loss += loss

        # backward
        error = predictions - target_label
        w2 -= LEARNING_RATE * np.outer(hidden_layer, error)
        w1 -= LEARNING_RATE * np.outer(target_vec, np.dot(w2, error))
    
    print(f"epoch {epoch}, loss: {loss:.4f}, total loss: {total_loss:.4f}")

epoch 0, loss: 7.1959, total loss: 75490.4016
epoch 1, loss: 6.6823, total loss: 70284.4320
epoch 2, loss: 6.0274, total loss: 68071.3250
epoch 3, loss: 5.2186, total loss: 66463.3203
epoch 4, loss: 4.2679, total loss: 65088.9365
epoch 5, loss: 3.2715, total loss: 63807.7872
epoch 6, loss: 2.4597, total loss: 62578.2193
epoch 7, loss: 1.9973, total loss: 61418.1136
epoch 8, loss: 1.8015, total loss: 60345.3803
epoch 9, loss: 1.7337, total loss: 59359.4786
epoch 10, loss: 1.7130, total loss: 58456.3707
epoch 11, loss: 1.7071, total loss: 57631.0681
epoch 12, loss: 1.7077, total loss: 56876.1846
epoch 13, loss: 1.7133, total loss: 56185.2149
epoch 14, loss: 1.7229, total loss: 55553.8284
epoch 15, loss: 1.7357, total loss: 54978.3114
epoch 16, loss: 1.7513, total loss: 54454.5977
epoch 17, loss: 1.7694, total loss: 53978.3258
epoch 18, loss: 1.7897, total loss: 53545.2147
epoch 19, loss: 1.8116, total loss: 53151.3055
epoch 20, loss: 1.8346, total loss: 52792.9669
epoch 21, loss: 1.8582,

In [34]:
def get_sentence_embedding(sentence, w1, word2idx):
    words = sentence.split()
    vectors = [w1[word2idx[word]] for word in words if word in word2idx]
    if len(vectors) == 0:
        return np.zeros(w1.shape[1]) 
    return np.mean(vectors, axis=0) 

x_train_skip_gram = np.array([get_sentence_embedding(sentence, w1, word2idx) for sentence in x_train])
x_test_skip_gram = np.array([get_sentence_embedding(sentence, w1, word2idx) for sentence in x_test])

## H. entrainement/test

result

In [35]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(20,), max_iter=20, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("train classification report:")
    print(classification_report(y_train, y_train_pred))
    print("test classification report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"prediction time: {time.time() - start_time:.5f} seconds")

In [36]:
print("skip-gram results:")
train_and_evaluate(x_train_skip_gram, x_test_skip_gram, y_train, y_test)

skip-gram results:
train classification report:
              precision    recall  f1-score   support

           0       0.38      0.29      0.33        28
           1       0.25      0.20      0.22        20
           2       0.21      0.32      0.25        22

    accuracy                           0.27        70
   macro avg       0.28      0.27      0.27        70
weighted avg       0.29      0.27      0.27        70

test classification report:
              precision    recall  f1-score   support

           0       0.57      0.33      0.42        12
           1       0.30      0.33      0.32         9
           2       0.38      0.56      0.45         9

    accuracy                           0.40        30
   macro avg       0.42      0.41      0.40        30
weighted avg       0.43      0.40      0.40        30

prediction time: 0.00046 seconds


