# feature extraction and embeddings

In [43]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [44]:
df = pd.read_csv('./spooky.csv')

In [45]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [46]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [51]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [52]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1


## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [7]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [8]:
y_train.value_counts()

author_encoded
0    5530
2    4231
1    3944
Name: count, dtype: int64

In [9]:
y_test.value_counts()

author_encoded
0    2370
2    1813
1    1691
Name: count, dtype: int64

In [10]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
2    2.333701
1    2.332348
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset

In [11]:
# binary false for frequence lexicale, binary true for one-hot encoding
count_vectorizer = CountVectorizer(binary=False,analyzer= 'word', stop_words='english')

In [12]:
x_train_cv = count_vectorizer.fit_transform(x_train)
x_test_cv = count_vectorizer.transform(x_test)

In [13]:
x_train_cv.shape, x_test_cv.shape

((13705, 23082), (5874, 23082))

In [14]:
x_train_cv = pd.DataFrame(data = x_train_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_train_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
x_test_cv = pd.DataFrame(data = x_test_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_test_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


TF-IDF

In [16]:
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [17]:
x_train_tfidf = pd.DataFrame(data = x_train_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_train_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
x_test_tfidf = pd.DataFrame(data = x_test_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_test_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## E. entrainement

cree 3 models du type MLP classifier

In [19]:
mlp_count =  MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

mlp_tfidf = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

mlp_onehot = MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

entrainement

In [20]:
mlp_count.fit(x_train_cv, y_train)
mlp_tfidf.fit(x_train_tfidf, y_train)
mlp_onehot.fit(x_train_cv, y_train) 

prediction (training)

In [21]:
y_train_pred_count = mlp_count.predict(x_train_cv)
y_train_pred_tfidf = mlp_tfidf.predict(x_train_tfidf)
y_train_pred_onehot = mlp_onehot.predict(x_train_cv)

rapport de classification

In [22]:
def model__prediction(title, y_train, y_train_pred):
    print(title)
    print(classification_report(y_train, y_train_pred))

In [23]:
model__prediction('count vectorizer model:', y_train, y_train_pred_count)

count vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [24]:
model__prediction('tf-idf vectorizer model:', y_train, y_train_pred_tfidf)

tf-idf vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [25]:
model__prediction('one-hot encoding model:', y_train, y_train_pred_onehot)

one-hot encoding model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



## F. test

prediction (test)

In [26]:
y_test_pred_count = mlp_count.predict(x_test_cv)
y_test_pred_tfidf = mlp_tfidf.predict(x_test_tfidf)
y_test_pred_onehot = mlp_onehot.predict(x_test_cv)

rapport de classification

In [27]:
model__prediction('count vectorizer model (test):', y_test, y_test_pred_count)

count vectorizer model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2370
           1       0.77      0.71      0.74      1691
           2       0.70      0.76      0.73      1813

    accuracy                           0.74      5874
   macro avg       0.74      0.73      0.74      5874
weighted avg       0.74      0.74      0.74      5874



In [28]:
model__prediction('tf-idf vectorizer model (test):', y_test, y_test_pred_tfidf)

tf-idf vectorizer model (test):
              precision    recall  f1-score   support

           0       0.79      0.82      0.80      2370
           1       0.83      0.77      0.80      1691
           2       0.79      0.79      0.79      1813

    accuracy                           0.80      5874
   macro avg       0.80      0.80      0.80      5874
weighted avg       0.80      0.80      0.80      5874



In [29]:
model__prediction('one-hot encoding model (test):', y_test, y_test_pred_onehot)

one-hot encoding model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2370
           1       0.77      0.71      0.74      1691
           2       0.70      0.76      0.73      1813

    accuracy                           0.74      5874
   macro avg       0.74      0.73      0.74      5874
weighted avg       0.74      0.74      0.74      5874



temps de prediction

In [30]:
start_time = time.time()
mlp_count.predict(x_test_cv)
print(f"count vectorizer prediction time: {time.time() - start_time} seconds")


count vectorizer prediction time: 1.506387710571289 seconds


In [31]:
start_time = time.time()
mlp_tfidf.predict(x_test_tfidf)
print(f"tfidf vectorizer prediction time: {time.time() - start_time} seconds")

tfidf vectorizer prediction time: 1.01865553855896 seconds


In [32]:
start_time = time.time()
mlp_onehot.predict(x_test_cv)
print(f"One-Hot Encoding Prediction Time: {time.time() - start_time} seconds")

One-Hot Encoding Prediction Time: 1.5098521709442139 seconds


## G. vectorisation (embeddings de mots)

In [33]:
# hyperparameters
WINDOW_SIZE = 2
EMBEDDING_DIM = 5
LEARNING_RATE = 0.01
EPOCHS = 50

In [34]:
words = [word for sentence in x_train for word in sentence.split()]
words

['upon',
 'these',
 'two',
 'words',
 'therefore',
 'i',
 'have',
 'mainly',
 'built',
 'my',
 'hopes',
 'of',
 'a',
 'full',
 'solution',
 'of',
 'the',
 'ridle',
 'i',
 'had',
 'the',
 'good',
 'fortune',
 'to',
 'recollect',
 'that',
 'in',
 'the',
 'accentuation',
 'of',
 'this',
 'drama',
 'or',
 'at',
 'least',
 'of',
 'such',
 'portion',
 'of',
 'it',
 'as',
 'is',
 'allotted',
 'to',
 'the',
 'hero',
 'the',
 'tones',
 'of',
 'voice',
 'in',
 'which',
 'i',
 'found',
 'myself',
 'deficient',
 'were',
 'altogether',
 'unecesary',
 'and',
 'the',
 'deep',
 'guttural',
 'was',
 'expected',
 'to',
 'reign',
 'monotonously',
 'throughout',
 'each',
 'one',
 'of',
 'a',
 'gang',
 'so',
 'placed',
 'is',
 'not',
 'so',
 'much',
 'greedy',
 'of',
 'reward',
 'or',
 'anxious',
 'for',
 'escape',
 'as',
 'fearful',
 'of',
 'betrayal',
 'if',
 'he',
 'were',
 'vanquished',
 'i',
 'should',
 'be',
 'a',
 'free',
 'man',
 'among',
 'a',
 'multitude',
 'of',
 'opinions',
 'upon',
 'this',
 '

In [35]:
vocab = list(set(words))
vocab

['subtlest',
 'bog',
 'necronomicon',
 'frosts',
 'rumbled',
 'company',
 'glory',
 'epiphanes',
 'spotted',
 'damascus',
 'undiscovered',
 'previously',
 'voices',
 'jura',
 'timers',
 'unkind',
 'big',
 'feasibility',
 'despondency',
 'tragedy',
 'proboscis',
 'democracy',
 'faintly',
 'destroyer',
 'acquiring',
 'overpowering',
 'commemorates',
 'heh',
 'overdue',
 'contented',
 'lies',
 'confide',
 'proportioned',
 'repeople',
 'existing',
 'visibility',
 'helped',
 'remoter',
 'greyed',
 'setting',
 'greeks',
 'parterres',
 'starting',
 'unrecaled',
 'unmixed',
 'smiling',
 'allah',
 'forbidding',
 'cows',
 'flogings',
 'conjure',
 'gouty',
 'courtesy',
 'necesary',
 'winning',
 'latour',
 'spectral',
 'volcanically',
 'affixed',
 'porphyry',
 'foreigner',
 'mosaiques',
 'unfortunate',
 'cease',
 'magnanimity',
 'poetic',
 'grind',
 'overwhelmingly',
 'hoax',
 'imbedded',
 'inspirited',
 'analogies',
 'shot',
 'misfortunes',
 'syrian',
 'charms',
 'slackened',
 'warmly',
 'bent',


In [36]:
word2idx = {word: i for i, word in enumerate(vocab)}
word2idx

{'subtlest': 0,
 'bog': 1,
 'necronomicon': 2,
 'frosts': 3,
 'rumbled': 4,
 'company': 5,
 'glory': 6,
 'epiphanes': 7,
 'spotted': 8,
 'damascus': 9,
 'undiscovered': 10,
 'previously': 11,
 'voices': 12,
 'jura': 13,
 'timers': 14,
 'unkind': 15,
 'big': 16,
 'feasibility': 17,
 'despondency': 18,
 'tragedy': 19,
 'proboscis': 20,
 'democracy': 21,
 'faintly': 22,
 'destroyer': 23,
 'acquiring': 24,
 'overpowering': 25,
 'commemorates': 26,
 'heh': 27,
 'overdue': 28,
 'contented': 29,
 'lies': 30,
 'confide': 31,
 'proportioned': 32,
 'repeople': 33,
 'existing': 34,
 'visibility': 35,
 'helped': 36,
 'remoter': 37,
 'greyed': 38,
 'setting': 39,
 'greeks': 40,
 'parterres': 41,
 'starting': 42,
 'unrecaled': 43,
 'unmixed': 44,
 'smiling': 45,
 'allah': 46,
 'forbidding': 47,
 'cows': 48,
 'flogings': 49,
 'conjure': 50,
 'gouty': 51,
 'courtesy': 52,
 'necesary': 53,
 'winning': 54,
 'latour': 55,
 'spectral': 56,
 'volcanically': 57,
 'affixed': 58,
 'porphyry': 59,
 'foreigner'

In [37]:
idx2word = {i: word for word, i in word2idx.items()}
idx2word

{0: 'subtlest',
 1: 'bog',
 2: 'necronomicon',
 3: 'frosts',
 4: 'rumbled',
 5: 'company',
 6: 'glory',
 7: 'epiphanes',
 8: 'spotted',
 9: 'damascus',
 10: 'undiscovered',
 11: 'previously',
 12: 'voices',
 13: 'jura',
 14: 'timers',
 15: 'unkind',
 16: 'big',
 17: 'feasibility',
 18: 'despondency',
 19: 'tragedy',
 20: 'proboscis',
 21: 'democracy',
 22: 'faintly',
 23: 'destroyer',
 24: 'acquiring',
 25: 'overpowering',
 26: 'commemorates',
 27: 'heh',
 28: 'overdue',
 29: 'contented',
 30: 'lies',
 31: 'confide',
 32: 'proportioned',
 33: 'repeople',
 34: 'existing',
 35: 'visibility',
 36: 'helped',
 37: 'remoter',
 38: 'greyed',
 39: 'setting',
 40: 'greeks',
 41: 'parterres',
 42: 'starting',
 43: 'unrecaled',
 44: 'unmixed',
 45: 'smiling',
 46: 'allah',
 47: 'forbidding',
 48: 'cows',
 49: 'flogings',
 50: 'conjure',
 51: 'gouty',
 52: 'courtesy',
 53: 'necesary',
 54: 'winning',
 55: 'latour',
 56: 'spectral',
 57: 'volcanically',
 58: 'affixed',
 59: 'porphyry',
 60: 'foreig

In [38]:
vocab_size = len(vocab)
vocab_size

23413

In [39]:
def generate_training_data(corpus, window_size):
    training_data = []
    for sentence in corpus:
        tokens = sentence.split()
        for i, target in enumerate(tokens):
            context_start = max(0, i - window_size)
            context_end = min(len(tokens), i + window_size + 1)
            for j in range(context_start, context_end):
                if i != j:
                    training_data.append((target, tokens[j]))
    return training_data

training_pairs = generate_training_data(x_train, WINDOW_SIZE)
training_pairs

[('upon', 'these'),
 ('upon', 'two'),
 ('these', 'upon'),
 ('these', 'two'),
 ('these', 'words'),
 ('two', 'upon'),
 ('two', 'these'),
 ('two', 'words'),
 ('two', 'therefore'),
 ('words', 'these'),
 ('words', 'two'),
 ('words', 'therefore'),
 ('words', 'i'),
 ('therefore', 'two'),
 ('therefore', 'words'),
 ('therefore', 'i'),
 ('therefore', 'have'),
 ('i', 'words'),
 ('i', 'therefore'),
 ('i', 'have'),
 ('i', 'mainly'),
 ('have', 'therefore'),
 ('have', 'i'),
 ('have', 'mainly'),
 ('have', 'built'),
 ('mainly', 'i'),
 ('mainly', 'have'),
 ('mainly', 'built'),
 ('mainly', 'my'),
 ('built', 'have'),
 ('built', 'mainly'),
 ('built', 'my'),
 ('built', 'hopes'),
 ('my', 'mainly'),
 ('my', 'built'),
 ('my', 'hopes'),
 ('my', 'of'),
 ('hopes', 'built'),
 ('hopes', 'my'),
 ('hopes', 'of'),
 ('hopes', 'a'),
 ('of', 'my'),
 ('of', 'hopes'),
 ('of', 'a'),
 ('of', 'full'),
 ('a', 'hopes'),
 ('a', 'of'),
 ('a', 'full'),
 ('a', 'solution'),
 ('full', 'of'),
 ('full', 'a'),
 ('full', 'solution'),
 ('

In [40]:
def one_hot_encode(word, vocab_size):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

In [41]:
W1 = np.random.uniform(-1, 1, (vocab_size, EMBEDDING_DIM)) 
W2 = np.random.uniform(-1, 1, (EMBEDDING_DIM, vocab_size)) 

In [42]:
for epoch in range(EPOCHS):
    total_loss = 0
    for target_word, context_word in training_pairs:

        target_vec = one_hot_encode(target_word, vocab_size)
        hidden_layer = np.dot(target_vec, W1) 
        output_layer = np.dot(hidden_layer, W2) 
        predictions = np.exp(output_layer) / np.sum(np.exp(output_layer)) 

        target_label = one_hot_encode(context_word, vocab_size)
        loss = -np.sum(target_label * np.log(predictions + 1e-9))
        total_loss += loss

        error = predictions - target_label
        W2 -= LEARNING_RATE * np.outer(hidden_layer, error)
        W1 -= LEARNING_RATE * np.outer(target_vec, np.dot(W2, error))

    if epoch % 500 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

KeyboardInterrupt: 

In [None]:
word_embeddings = {word: W1[word2idx[word]] for word in vocab}
word_embeddings

In [None]:
for word, embedding in word_embeddings.items():
    print(f"{word}: {embedding}")

word2vec

In [None]:
tokenized_text = [sentence.split() for sentence in x_train]

word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

x_train_word2vec = [vectorize_text(text, word2vec_model) for text in tokenized_text]

glove

fastText

In [None]:
fasttext_model = FastText(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

x_train_fasttext = [vectorize_text(text, fasttext_model) for text in tokenized_text]

## H. entrainement/test

In [None]:
def get_average_vector(text, model):
    words = word_tokenize(text)
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [None]:
x_train_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_train])
x_test_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_test])

In [None]:
x_train_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_train])
x_test_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_test])

result

In [None]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [None]:
print("Word2Vec Results:")
train_and_evaluate(x_train_word2vec, x_test_word2vec, y_train, y_test)

Word2Vec Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      5529
           1       0.00      0.00      0.00      3944
           2       0.00      0.00      0.00      4230

    accuracy                           0.40     13703
   macro avg       0.13      0.33      0.19     13703
weighted avg       0.16      0.40      0.23     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.00      0.00      0.00      1813

    accuracy                           0.40      5873
   macro avg       0.13      0.33      0.19      5873
weighted avg       0.16      0.40      0.23      5873

Prediction time: 0.00778 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
print("FastText Results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

FastText Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      5529
           1       0.67      0.00      0.00      3944
           2       0.33      0.02      0.03      4230

    accuracy                           0.40     13703
   macro avg       0.47      0.33      0.20     13703
weighted avg       0.46      0.40      0.24     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.29      0.02      0.03      1813

    accuracy                           0.40      5873
   macro avg       0.23      0.33      0.20      5873
weighted avg       0.25      0.40      0.24      5873

Prediction time: 0.01157 seconds
