# feature extraction and embeddings

In [26]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time


## A. preparation de donnees

In [27]:
df = pd.read_csv('./spooky.csv')

In [28]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [29]:
df.shape

(19579, 3)

In [30]:
df = df[: 100]

In [31]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [32]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [33]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1










## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [34]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [35]:
y_train.value_counts()

author_encoded
0    28
2    22
1    20
Name: count, dtype: int64

In [36]:
y_test.value_counts()

author_encoded
0    12
1     9
2     9
Name: count, dtype: int64

In [37]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
1    2.222222
2    2.444444
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset

## G. vectorisation (embeddings de mots)

In [38]:
EPOCHS = 50
WINDOW_SIZE = 2
EMBEDDING_DIM = 5
LEARNING_RATE = 0.01

In [39]:
x_train.head()

23    with how deep a spirit of wonder and perplexit...
28    our compases depth gauges and other delicate i...
85    ahead lay sparse grass and scrub blueberry bus...
48    dr johnson as i beheld him was a ful pursy man...
73    the next morning i delivered my letters of int...
Name: text, dtype: object

In [40]:
words = [word for sentence in x_train for word in sentence.split()]
words

['with',
 'how',
 'deep',
 'a',
 'spirit',
 'of',
 'wonder',
 'and',
 'perplexity',
 'was',
 'i',
 'wont',
 'to',
 'regard',
 'him',
 'from',
 'our',
 'remote',
 'pew',
 'in',
 'the',
 'galery',
 'as',
 'with',
 'step',
 'solemn',
 'and',
 'slow',
 'he',
 'ascended',
 'the',
 'pulpit',
 'this',
 'reverend',
 'man',
 'with',
 'countenance',
 'so',
 'demurely',
 'benign',
 'with',
 'robes',
 'so',
 'glossy',
 'and',
 'so',
 'clericaly',
 'flowing',
 'with',
 'wig',
 'so',
 'minutely',
 'powdered',
 'so',
 'rigid',
 'and',
 'so',
 'vast',
 'could',
 'this',
 'be',
 'he',
 'who',
 'of',
 'late',
 'with',
 'sour',
 'visage',
 'and',
 'in',
 'snufy',
 'habiliments',
 'administered',
 'ferule',
 'in',
 'hand',
 'the',
 'draconian',
 'laws',
 'of',
 'the',
 'academy',
 'our',
 'compases',
 'depth',
 'gauges',
 'and',
 'other',
 'delicate',
 'instruments',
 'were',
 'ruined',
 'so',
 'that',
 'henceforth',
 'our',
 'only',
 'reckoning',
 'would',
 'be',
 'gueswork',
 'based',
 'on',
 'our',
 'w

In [41]:
vocab = list(set(words))
vocab

['power',
 'rested',
 'brutality',
 'progress',
 'preternatural',
 'proportions',
 'mobile',
 'expenses',
 'dilation',
 'speculation',
 'murdered',
 'awhile',
 'mewed',
 'silent',
 'dr',
 'somewhat',
 'while',
 'monday',
 'through',
 'magnus',
 'structure',
 'even',
 'ful',
 'attractions',
 'kep',
 'back',
 'ultimate',
 'cud',
 'upward',
 'services',
 'delirium',
 'with',
 'sick',
 'advancement',
 'beyond',
 'watches',
 'my',
 'more',
 'heart',
 'results',
 'fal',
 'sufficiently',
 'mentioned',
 'all',
 'care',
 'ply',
 'enterprise',
 'necesary',
 'disapeared',
 'but',
 'though',
 'studies',
 'what',
 'unrelenting',
 'raising',
 'profesion',
 'gods',
 'wonder',
 'for',
 'dreaded',
 'thoughtless',
 'inconvenient',
 'cel',
 'barricaded',
 'intervals',
 'beheld',
 'rigid',
 'getting',
 'von',
 'want',
 'gave',
 'down',
 'interest',
 'time',
 'native',
 'fanciful',
 'fatality',
 'still',
 'inteligence',
 'thy',
 'mud',
 'singularly',
 'awful',
 'either',
 'like',
 'imagination',
 'inopport

In [42]:
word2idx = {word: i for i, word in enumerate(vocab)}
word2idx

{'power': 0,
 'rested': 1,
 'brutality': 2,
 'progress': 3,
 'preternatural': 4,
 'proportions': 5,
 'mobile': 6,
 'expenses': 7,
 'dilation': 8,
 'speculation': 9,
 'murdered': 10,
 'awhile': 11,
 'mewed': 12,
 'silent': 13,
 'dr': 14,
 'somewhat': 15,
 'while': 16,
 'monday': 17,
 'through': 18,
 'magnus': 19,
 'structure': 20,
 'even': 21,
 'ful': 22,
 'attractions': 23,
 'kep': 24,
 'back': 25,
 'ultimate': 26,
 'cud': 27,
 'upward': 28,
 'services': 29,
 'delirium': 30,
 'with': 31,
 'sick': 32,
 'advancement': 33,
 'beyond': 34,
 'watches': 35,
 'my': 36,
 'more': 37,
 'heart': 38,
 'results': 39,
 'fal': 40,
 'sufficiently': 41,
 'mentioned': 42,
 'all': 43,
 'care': 44,
 'ply': 45,
 'enterprise': 46,
 'necesary': 47,
 'disapeared': 48,
 'but': 49,
 'though': 50,
 'studies': 51,
 'what': 52,
 'unrelenting': 53,
 'raising': 54,
 'profesion': 55,
 'gods': 56,
 'wonder': 57,
 'for': 58,
 'dreaded': 59,
 'thoughtless': 60,
 'inconvenient': 61,
 'cel': 62,
 'barricaded': 63,
 'interv

In [43]:
idx2word = {i: word for word, i in word2idx.items()}
idx2word

{0: 'power',
 1: 'rested',
 2: 'brutality',
 3: 'progress',
 4: 'preternatural',
 5: 'proportions',
 6: 'mobile',
 7: 'expenses',
 8: 'dilation',
 9: 'speculation',
 10: 'murdered',
 11: 'awhile',
 12: 'mewed',
 13: 'silent',
 14: 'dr',
 15: 'somewhat',
 16: 'while',
 17: 'monday',
 18: 'through',
 19: 'magnus',
 20: 'structure',
 21: 'even',
 22: 'ful',
 23: 'attractions',
 24: 'kep',
 25: 'back',
 26: 'ultimate',
 27: 'cud',
 28: 'upward',
 29: 'services',
 30: 'delirium',
 31: 'with',
 32: 'sick',
 33: 'advancement',
 34: 'beyond',
 35: 'watches',
 36: 'my',
 37: 'more',
 38: 'heart',
 39: 'results',
 40: 'fal',
 41: 'sufficiently',
 42: 'mentioned',
 43: 'all',
 44: 'care',
 45: 'ply',
 46: 'enterprise',
 47: 'necesary',
 48: 'disapeared',
 49: 'but',
 50: 'though',
 51: 'studies',
 52: 'what',
 53: 'unrelenting',
 54: 'raising',
 55: 'profesion',
 56: 'gods',
 57: 'wonder',
 58: 'for',
 59: 'dreaded',
 60: 'thoughtless',
 61: 'inconvenient',
 62: 'cel',
 63: 'barricaded',
 64: 'in

In [44]:
vocab_size = len(word2idx)
vocab_size

1043

In [45]:
def generate_training_data(corpus, window_size):
    training_data = []
    for sentence in corpus:
        tokens = sentence.split()
        for i, target in enumerate(tokens):
            context_start = max(0, i - window_size)
            context_end = min(len(tokens), i + window_size + 1)
            for j in range(context_start, context_end):
                if i != j:
                    training_data.append((target, tokens[j]))
    return training_data

training_pairs = generate_training_data(x_train, WINDOW_SIZE)
training_pairs

[('with', 'how'),
 ('with', 'deep'),
 ('how', 'with'),
 ('how', 'deep'),
 ('how', 'a'),
 ('deep', 'with'),
 ('deep', 'how'),
 ('deep', 'a'),
 ('deep', 'spirit'),
 ('a', 'how'),
 ('a', 'deep'),
 ('a', 'spirit'),
 ('a', 'of'),
 ('spirit', 'deep'),
 ('spirit', 'a'),
 ('spirit', 'of'),
 ('spirit', 'wonder'),
 ('of', 'a'),
 ('of', 'spirit'),
 ('of', 'wonder'),
 ('of', 'and'),
 ('wonder', 'spirit'),
 ('wonder', 'of'),
 ('wonder', 'and'),
 ('wonder', 'perplexity'),
 ('and', 'of'),
 ('and', 'wonder'),
 ('and', 'perplexity'),
 ('and', 'was'),
 ('perplexity', 'wonder'),
 ('perplexity', 'and'),
 ('perplexity', 'was'),
 ('perplexity', 'i'),
 ('was', 'and'),
 ('was', 'perplexity'),
 ('was', 'i'),
 ('was', 'wont'),
 ('i', 'perplexity'),
 ('i', 'was'),
 ('i', 'wont'),
 ('i', 'to'),
 ('wont', 'was'),
 ('wont', 'i'),
 ('wont', 'to'),
 ('wont', 'regard'),
 ('to', 'i'),
 ('to', 'wont'),
 ('to', 'regard'),
 ('to', 'him'),
 ('regard', 'wont'),
 ('regard', 'to'),
 ('regard', 'him'),
 ('regard', 'from'),
 ('

In [46]:
def one_hot_encode(word, vocab_size):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

In [47]:
w1 = np.random.randn(vocab_size, EMBEDDING_DIM) 
w2 = np.random.randn(EMBEDDING_DIM, vocab_size) 

w1, w2

(array([[-0.50073774,  1.20131646, -0.68844731,  0.06297113, -1.11054822],
        [-0.66371088,  0.14292769, -0.0981082 , -0.22779068, -0.81904009],
        [ 0.01679371, -0.64819968,  0.32137565,  1.38429412,  0.20873892],
        ...,
        [ 1.08977831,  0.5233666 , -1.16549739, -1.11562754, -1.15359575],
        [ 0.42345754,  0.14461652,  0.30713379, -0.37037827, -0.00871301],
        [ 1.33339214,  0.5238869 ,  0.08676185, -0.53353049, -0.1026827 ]]),
 array([[ 0.31915603,  1.28886103, -1.27376526, ..., -1.04617344,
         -0.26995764, -0.40301769],
        [ 0.97934037,  1.07380224,  0.16442658, ...,  0.41718364,
         -0.23961746, -0.7538376 ],
        [ 0.79766276, -1.02709297, -1.0032722 , ...,  0.6263151 ,
         -1.05797448, -0.23748977],
        [-0.33224564,  0.00204144,  1.51353129, ..., -1.27541052,
          1.47410184,  0.83167807],
        [-0.22961347, -1.22638678,  0.39570988, ..., -1.54646573,
         -0.52314531, -0.36297415]]))

In [51]:
for epoch in range(EPOCHS):
    total_loss = 0
    for target_word, context_word in training_pairs:

        # forward
        target_vec = one_hot_encode(target_word, vocab_size)
        hidden_layer = np.dot(w1.T, target_vec) 
        output_layer = np.dot(w2.T, hidden_layer) 
        # softmax
        predictions = np.exp(output_layer) / np.sum(np.exp(output_layer)) 

        # loss
        target_label = one_hot_encode(context_word, vocab_size)
        loss = -np.sum(target_label * np.log(predictions + 1e-9))
        total_loss += loss

        # backward
        error = predictions - target_label
        w2 -= LEARNING_RATE * np.outer(hidden_layer, error)
        w1 -= LEARNING_RATE * np.outer(target_vec, np.dot(w2, error))
    
    print(f"epoch {epoch}, loss: {loss:.4f}, total loss: {total_loss:.4f}")

epoch 0, loss: 7.2467, total loss: 48844.9946
epoch 1, loss: 7.0820, total loss: 48753.1845
epoch 2, loss: 6.9140, total loss: 48663.8961
epoch 3, loss: 6.7459, total loss: 48575.5800
epoch 4, loss: 6.5792, total loss: 48488.4422
epoch 5, loss: 6.4145, total loss: 48402.5376
epoch 6, loss: 6.2523, total loss: 48317.8593
epoch 7, loss: 6.0929, total loss: 48234.3771
epoch 8, loss: 5.9367, total loss: 48152.0517
epoch 9, loss: 5.7837, total loss: 48070.8418
epoch 10, loss: 5.6344, total loss: 47990.7068
epoch 11, loss: 5.4890, total loss: 47911.6080
epoch 12, loss: 5.3477, total loss: 47833.5091
epoch 13, loss: 5.2108, total loss: 47756.3768
epoch 14, loss: 5.0785, total loss: 47680.1802
epoch 15, loss: 4.9510, total loss: 47604.8913
epoch 16, loss: 4.8286, total loss: 47530.4842
epoch 17, loss: 4.7113, total loss: 47456.9353
epoch 18, loss: 4.5994, total loss: 47384.2229
epoch 19, loss: 4.4929, total loss: 47312.3270
epoch 20, loss: 4.3917, total loss: 47241.2291
epoch 21, loss: 4.2960,

In [53]:
for word, embedding in word_embeddings.items():
    print(f"{word}: {embedding}")

power: [ 0.32483899 -0.12430056  0.98913195 -1.16183501  0.94775199]
rested: [ 0.39954275 -1.05373093  0.55086123 -1.30165859 -0.86006355]
brutality: [ 1.20896433 -1.36662211  0.11730195  1.29583003 -0.65991874]
progress: [ 1.15259932  0.03055835  0.66679905 -0.95988845 -0.29029925]
preternatural: [-0.92399203 -0.35479523  1.74298249 -0.62741268 -0.01012132]
proportions: [ 1.05884225 -0.2222698  -0.66687522  0.57371363  0.79437188]
mobile: [ 0.32912229  0.64619763  0.07279556 -1.19983729  1.08486807]
expenses: [ 0.40598303 -0.56785892 -0.24410401 -0.63814528 -1.69380212]
dilation: [ 0.95813023  0.94816904 -0.16153191 -0.4708492  -0.42486715]
speculation: [ 0.73194058 -0.32059302 -0.30277544  0.36464766  1.94391402]
murdered: [ 1.58185978  1.14035944 -0.61416506  0.11768939 -1.03828246]
awhile: [ 0.53958857 -0.30200095 -0.00683438 -0.44350088  0.44851124]
mewed: [ 0.92684711  0.67694733 -1.25380611 -1.25526999  0.7760778 ]
silent: [ 0.3128532  -0.14144373 -0.57367076 -0.56436963  1.4996

In [58]:
def get_sentence_embedding(sentence, w1, word2idx):
    words = sentence.split()
    vectors = [w1[word2idx[word]] for word in words if word in word2idx]
    if len(vectors) == 0:
        return np.zeros(w1.shape[1]) 
    return np.mean(vectors, axis=0) 

x_train_skip_gram = np.array([get_sentence_embedding(sentence, w1, word2idx) for sentence in x_train])
x_test_skip_gram = np.array([get_sentence_embedding(sentence, w1, word2idx) for sentence in x_test])

## H. entrainement/test

result

In [54]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(20,), max_iter=20, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("train classification report:")
    print(classification_report(y_train, y_train_pred))
    print("test classification report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"prediction time: {time.time() - start_time:.5f} seconds")

In [59]:
print("skip-gram results:")
train_and_evaluate(x_train_skip_gram, x_test_skip_gram, y_train, y_test)

skip-gram results:
train classification report:
              precision    recall  f1-score   support

           0       0.49      0.75      0.59        28
           1       0.42      0.25      0.31        20
           2       0.60      0.41      0.49        22

    accuracy                           0.50        70
   macro avg       0.50      0.47      0.46        70
weighted avg       0.50      0.50      0.48        70

test classification report:
              precision    recall  f1-score   support

           0       0.48      1.00      0.65        12
           1       0.50      0.11      0.18         9
           2       0.67      0.22      0.33         9

    accuracy                           0.50        30
   macro avg       0.55      0.44      0.39        30
weighted avg       0.54      0.50      0.41        30

prediction time: 0.00051 seconds


