# feature extraction and embeddings

In [41]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from gensim.models.keyedvectors import KeyedVectors
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random
import time

## A. preparation de donnees

In [42]:
df = pd.read_csv('./spooky.csv')

In [43]:
df.head()

Unnamed: 0,id,text,author
0,id26305,this proces however afforded me no means of as...,EAP
1,id17569,it never once occurred to me that the fumbling...,HPL
2,id11008,in his left hand was a gold snuff box from whi...,EAP
3,id27763,how lovely is spring as we looked from windsor...,MWS
4,id12958,finding nothing else not even gold the superin...,HPL


In [44]:
df['text'].isnull().sum()

0

## B. encodage de la variable a predire (facultatif)

In [45]:
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['author'])

In [46]:
df.head()

Unnamed: 0,id,text,author,author_encoded
0,id26305,this proces however afforded me no means of as...,EAP,0
1,id17569,it never once occurred to me that the fumbling...,HPL,1
2,id11008,in his left hand was a gold snuff box from whi...,EAP,0
3,id27763,how lovely is spring as we looked from windsor...,MWS,2
4,id12958,finding nothing else not even gold the superin...,HPL,1


## C. construction des bases d’entraînement et de test

 tarining & test dataset

In [55]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df['author_encoded'], test_size=0.3, random_state=0, stratify = df['author_encoded'])

obtenir une repartition similaire dans chaque classe du dataset

In [56]:
y_train.value_counts()

author_encoded
0    5530
2    4231
1    3944
Name: count, dtype: int64

In [57]:
y_test.value_counts()

author_encoded
0    2370
2    1813
1    1691
Name: count, dtype: int64

In [58]:
y_train.value_counts()/y_test.value_counts()

author_encoded
0    2.333333
2    2.333701
1    2.332348
Name: count, dtype: float64

## D. methodes de vectorisation

vectoriser tarining & test dataset

In [59]:
# binary false for frequence lexicale, binary true for one-hot encoding
count_vectorizer = CountVectorizer(binary=False,analyzer= 'word', stop_words='english')

In [60]:
x_train_cv = count_vectorizer.fit_transform(x_train)
x_test_cv = count_vectorizer.transform(x_test)

In [61]:
x_train_cv.shape, x_test_cv.shape

((13705, 23082), (5874, 23082))

In [62]:
x_train_cv = pd.DataFrame(data = x_train_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_train_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
x_test_cv = pd.DataFrame(data = x_test_cv.toarray(), columns = count_vectorizer.get_feature_names_out())
x_test_cv.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


TF-IDF

In [64]:
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [65]:
x_train_tfidf = pd.DataFrame(data = x_train_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_train_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
x_test_tfidf = pd.DataFrame(data = x_test_tfidf.toarray(), columns = tfidf_vectorizer.get_feature_names_out())
x_test_tfidf.head()

Unnamed: 0,ab,abandon,abandoned,abandoning,abandonment,abaout,abasement,abashment,abate,abated,...,zit,zobna,zobnarian,zodiacal,zokar,zone,zones,zopyrus,zubmizion,zuro
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## E. entrainement

cree 3 models du type MLP classifier

In [67]:
# one hidden layer with 100 neurons
mlp_count =  MLPClassifier(hidden_layer_sizes=(100,), max_iter=100, solver='adam', random_state=1)

# two hidden layers with 50 and 40 neurons
mlp_tfidf = MLPClassifier(hidden_layer_sizes=(50, 40), max_iter=100, solver='adam', random_state=1)

# three hidden layers with 50, 40 and 30 neurons
mlp_onehot = MLPClassifier(hidden_layer_sizes=(50, 40, 30), max_iter=100, solver='adam', random_state=1)

entrainement

In [68]:
mlp_count.fit(x_train_cv, y_train)
mlp_tfidf.fit(x_train_tfidf, y_train)
mlp_onehot.fit(x_train_cv, y_train) 

prediction (training)

In [69]:
y_train_pred_count = mlp_count.predict(x_train_cv)
y_train_pred_tfidf = mlp_tfidf.predict(x_train_tfidf)
y_train_pred_onehot = mlp_onehot.predict(x_train_cv)

rapport de classification

In [70]:
def model__prediction(title, y_train, y_train_pred):
    print(title)
    print(classification_report(y_train, y_train_pred))

In [71]:
model__prediction('count vectorizer model:', y_train, y_train_pred_count)

count vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [72]:
model__prediction('tf-idf vectorizer model:', y_train, y_train_pred_tfidf)

tf-idf vectorizer model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



In [73]:
model__prediction('one-hot encoding model:', y_train, y_train_pred_onehot)

one-hot encoding model:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5530
           1       1.00      1.00      1.00      3944
           2       1.00      1.00      1.00      4231

    accuracy                           1.00     13705
   macro avg       1.00      1.00      1.00     13705
weighted avg       1.00      1.00      1.00     13705



## F. test

prediction (test)

In [74]:
y_test_pred_count = mlp_count.predict(x_test_cv)
y_test_pred_tfidf = mlp_tfidf.predict(x_test_tfidf)
y_test_pred_onehot = mlp_onehot.predict(x_test_cv)

rapport de classification

In [75]:
model__prediction('count vectorizer model (test):', y_test, y_test_pred_count)

count vectorizer model (test):
              precision    recall  f1-score   support

           0       0.74      0.74      0.74      2370
           1       0.77      0.71      0.74      1691
           2       0.70      0.76      0.73      1813

    accuracy                           0.74      5874
   macro avg       0.74      0.73      0.74      5874
weighted avg       0.74      0.74      0.74      5874



In [76]:
model__prediction('tf-idf vectorizer model (test):', y_test, y_test_pred_tfidf)

tf-idf vectorizer model (test):
              precision    recall  f1-score   support

           0       0.79      0.83      0.81      2370
           1       0.84      0.78      0.81      1691
           2       0.79      0.80      0.80      1813

    accuracy                           0.81      5874
   macro avg       0.81      0.80      0.81      5874
weighted avg       0.81      0.81      0.81      5874



In [78]:
model__prediction('one-hot encoding model (test):', y_test, y_test_pred_onehot)

one-hot encoding model (test):
              precision    recall  f1-score   support

           0       0.76      0.75      0.75      2370
           1       0.79      0.73      0.76      1691
           2       0.72      0.77      0.75      1813

    accuracy                           0.75      5874
   macro avg       0.75      0.75      0.75      5874
weighted avg       0.75      0.75      0.75      5874



temps de prediction

In [79]:
start_time = time.time()
mlp_count.predict(x_test_cv)
print(f"count vectorizer prediction time: {time.time() - start_time} seconds")


count vectorizer prediction time: 2.5244197845458984 seconds


In [80]:
start_time = time.time()
mlp_tfidf.predict(x_test_tfidf)
print(f"tfidf vectorizer prediction time: {time.time() - start_time} seconds")

tfidf vectorizer prediction time: 1.8447535037994385 seconds


In [81]:
start_time = time.time()
mlp_onehot.predict(x_test_cv)
print(f"One-Hot Encoding Prediction Time: {time.time() - start_time} seconds")

One-Hot Encoding Prediction Time: 1.9984557628631592 seconds


## G. vectorisation (embeddings de mots)

In [55]:
# hyperparameters
WINDOW_SIZE = 2
EMBEDDING_DIM = 5
LEARNING_RATE = 0.01
EPOCHS = 50

In [42]:
words = [word for sentence in x_train for word in sentence.split()]
words

['inside',
 'wrapped',
 'discoloured',
 'parchment',
 'huge',
 'key',
 'tarnished',
 'silver',
 'covered',
 'cryptical',
 'arabesques',
 'legible',
 'explanation',
 'none',
 'somewhere',
 'ought',
 'strayed',
 'far',
 'away',
 'places',
 'belonged',
 'inexcusably',
 'late',
 'material',
 'world',
 'continued',
 'dupin',
 'abounds',
 'strict',
 'analogies',
 'immaterial',
 'thus',
 'color',
 'truth',
 'given',
 'rhetorical',
 'dogma',
 'metaphor',
 'simile',
 'may',
 'made',
 'strengthen',
 'argument',
 'well',
 'embellish',
 'description',
 'alas',
 'ye',
 'last',
 'happiness',
 'ever',
 'enjoyed',
 'weeks',
 'destroyed',
 'visible',
 'houses',
 'could',
 'tell',
 'state',
 'road',
 'traffic',
 'light',
 'hereabouts',
 'great',
 'reliance',
 'however',
 'extreme',
 'delicacy',
 'discrimination',
 'matters',
 'appertaining',
 'rules',
 'etiquette',
 'long',
 'pre',
 'eminently',
 'distinguished',
 'time',
 'totally',
 'destroyed',
 'three',
 'different',
 'periods',
 'three',
 'successi

In [43]:
vocab = list(set(words))
vocab

['tantalisingly',
 'powerfully',
 'spicy',
 'spades',
 'clatter',
 'dipt',
 'split',
 'quintessence',
 'manly',
 'positive',
 'harmoniously',
 'parfumerie',
 'affecting',
 'somnambula',
 'rioters',
 'rope',
 'barring',
 'bar',
 'almaviva',
 'suggests',
 'metallic',
 'constitute',
 'actual',
 'deserted',
 'deceived',
 'inclemency',
 'wrestled',
 'rencontre',
 'pitchforks',
 'lovea',
 'flown',
 'noses',
 'erroneously',
 'swilled',
 'convenience',
 'determinate',
 'hermitess',
 'palm',
 'enthralled',
 'exchanging',
 'slatternly',
 'unlearned',
 'circumstantial',
 'worsted',
 'headache',
 'totum',
 'tasted',
 'armies',
 'eyelashes',
 'wronged',
 'box',
 'rochester',
 'delicious',
 'bewilderment',
 'competing',
 'tolled',
 'slouched',
 'scope',
 'weary',
 'circumlocution',
 'gerrit',
 'burgomasters',
 'happening',
 'xari',
 'philosophers',
 'seventh',
 'gamekeeper',
 'charioted',
 'luther',
 'madeline',
 'tearing',
 'possibility',
 'sham',
 'answers',
 'centre',
 'massed',
 'domicile',
 'un

In [44]:
word2idx = {word: i for i, word in enumerate(vocab)}
word2idx

{'tantalisingly': 0,
 'powerfully': 1,
 'spicy': 2,
 'spades': 3,
 'clatter': 4,
 'dipt': 5,
 'split': 6,
 'quintessence': 7,
 'manly': 8,
 'positive': 9,
 'harmoniously': 10,
 'parfumerie': 11,
 'affecting': 12,
 'somnambula': 13,
 'rioters': 14,
 'rope': 15,
 'barring': 16,
 'bar': 17,
 'almaviva': 18,
 'suggests': 19,
 'metallic': 20,
 'constitute': 21,
 'actual': 22,
 'deserted': 23,
 'deceived': 24,
 'inclemency': 25,
 'wrestled': 26,
 'rencontre': 27,
 'pitchforks': 28,
 'lovea': 29,
 'flown': 30,
 'noses': 31,
 'erroneously': 32,
 'swilled': 33,
 'convenience': 34,
 'determinate': 35,
 'hermitess': 36,
 'palm': 37,
 'enthralled': 38,
 'exchanging': 39,
 'slatternly': 40,
 'unlearned': 41,
 'circumstantial': 42,
 'worsted': 43,
 'headache': 44,
 'totum': 45,
 'tasted': 46,
 'armies': 47,
 'eyelashes': 48,
 'wronged': 49,
 'box': 50,
 'rochester': 51,
 'delicious': 52,
 'bewilderment': 53,
 'competing': 54,
 'tolled': 55,
 'slouched': 56,
 'scope': 57,
 'weary': 58,
 'circumlocuti

In [45]:
idx2word = {i: word for word, i in word2idx.items()}
idx2word

{0: 'tantalisingly',
 1: 'powerfully',
 2: 'spicy',
 3: 'spades',
 4: 'clatter',
 5: 'dipt',
 6: 'split',
 7: 'quintessence',
 8: 'manly',
 9: 'positive',
 10: 'harmoniously',
 11: 'parfumerie',
 12: 'affecting',
 13: 'somnambula',
 14: 'rioters',
 15: 'rope',
 16: 'barring',
 17: 'bar',
 18: 'almaviva',
 19: 'suggests',
 20: 'metallic',
 21: 'constitute',
 22: 'actual',
 23: 'deserted',
 24: 'deceived',
 25: 'inclemency',
 26: 'wrestled',
 27: 'rencontre',
 28: 'pitchforks',
 29: 'lovea',
 30: 'flown',
 31: 'noses',
 32: 'erroneously',
 33: 'swilled',
 34: 'convenience',
 35: 'determinate',
 36: 'hermitess',
 37: 'palm',
 38: 'enthralled',
 39: 'exchanging',
 40: 'slatternly',
 41: 'unlearned',
 42: 'circumstantial',
 43: 'worsted',
 44: 'headache',
 45: 'totum',
 46: 'tasted',
 47: 'armies',
 48: 'eyelashes',
 49: 'wronged',
 50: 'box',
 51: 'rochester',
 52: 'delicious',
 53: 'bewilderment',
 54: 'competing',
 55: 'tolled',
 56: 'slouched',
 57: 'scope',
 58: 'weary',
 59: 'circumlo

In [47]:
vocab_size = len(vocab)
vocab_size

21939

In [50]:
def generate_training_data(corpus, window_size):
    training_data = []
    for sentence in corpus:
        tokens = sentence.split()
        for i, target in enumerate(tokens):
            context_start = max(0, i - window_size)
            context_end = min(len(tokens), i + window_size + 1)
            for j in range(context_start, context_end):
                if i != j:
                    training_data.append((target, tokens[j]))
    return training_data

training_pairs = generate_training_data(x_train, WINDOW_SIZE)
training_pairs

[('inside', 'wrapped'),
 ('inside', 'discoloured'),
 ('wrapped', 'inside'),
 ('wrapped', 'discoloured'),
 ('wrapped', 'parchment'),
 ('discoloured', 'inside'),
 ('discoloured', 'wrapped'),
 ('discoloured', 'parchment'),
 ('discoloured', 'huge'),
 ('parchment', 'wrapped'),
 ('parchment', 'discoloured'),
 ('parchment', 'huge'),
 ('parchment', 'key'),
 ('huge', 'discoloured'),
 ('huge', 'parchment'),
 ('huge', 'key'),
 ('huge', 'tarnished'),
 ('key', 'parchment'),
 ('key', 'huge'),
 ('key', 'tarnished'),
 ('key', 'silver'),
 ('tarnished', 'huge'),
 ('tarnished', 'key'),
 ('tarnished', 'silver'),
 ('tarnished', 'covered'),
 ('silver', 'key'),
 ('silver', 'tarnished'),
 ('silver', 'covered'),
 ('silver', 'cryptical'),
 ('covered', 'tarnished'),
 ('covered', 'silver'),
 ('covered', 'cryptical'),
 ('covered', 'arabesques'),
 ('cryptical', 'silver'),
 ('cryptical', 'covered'),
 ('cryptical', 'arabesques'),
 ('cryptical', 'legible'),
 ('arabesques', 'covered'),
 ('arabesques', 'cryptical'),
 ('

In [51]:
def one_hot_encode(word, vocab_size):
    vec = np.zeros(vocab_size)
    vec[word2idx[word]] = 1
    return vec

In [54]:
W1 = np.random.uniform(-1, 1, (vocab_size, EMBEDDING_DIM)) 
W2 = np.random.uniform(-1, 1, (EMBEDDING_DIM, vocab_size)) 

In [None]:
for epoch in range(EPOCHS):
    total_loss = 0
    for target_word, context_word in training_pairs:

        target_vec = one_hot_encode(target_word, vocab_size)
        hidden_layer = np.dot(target_vec, W1) 
        output_layer = np.dot(hidden_layer, W2) 
        predictions = np.exp(output_layer) / np.sum(np.exp(output_layer)) 

        target_label = one_hot_encode(context_word, vocab_size)
        loss = -np.sum(target_label * np.log(predictions + 1e-9))
        total_loss += loss

        error = predictions - target_label
        W2 -= LEARNING_RATE * np.outer(hidden_layer, error)
        W1 -= LEARNING_RATE * np.outer(target_vec, np.dot(W2, error))

    if epoch % 500 == 0:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

In [None]:
word_embeddings = {word: W1[word2idx[word]] for word in vocab}
word_embeddings

In [None]:
for word, embedding in word_embeddings.items():
    print(f"{word}: {embedding}")

word2vec

In [63]:
tokenized_text = [sentence.split() for sentence in x_train]

word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

def vectorize_text(text, model):
    vectors = [model.wv[word] for word in text if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else [0] * model.vector_size

x_train_word2vec = [vectorize_text(text, word2vec_model) for text in tokenized_text]

glove

fastText

In [64]:
fasttext_model = FastText(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

x_train_fasttext = [vectorize_text(text, fasttext_model) for text in tokenized_text]

## H. entrainement/test

In [65]:
def get_average_vector(text, model):
    words = word_tokenize(text)
    vectors = [model.wv[word] for word in words if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)

In [66]:
x_train_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_train])
x_test_word2vec = np.vstack([get_average_vector(text, word2vec_model) for text in x_test])

In [67]:
x_train_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_train])
x_test_fasttext = np.vstack([get_average_vector(text, fasttext_model) for text in x_test])

result

In [68]:
def train_and_evaluate(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=0)
    mlp.fit(X_train, y_train)
    
    y_train_pred = mlp.predict(X_train)
    y_test_pred = mlp.predict(X_test)
    
    print("Train Classification Report:")
    print(classification_report(y_train, y_train_pred))
    print("Test Classification Report:")
    print(classification_report(y_test, y_test_pred))
    
    start_time = time.time()
    mlp.predict(X_test)
    print(f"Prediction time: {time.time() - start_time:.5f} seconds")

In [69]:
print("Word2Vec Results:")
train_and_evaluate(x_train_word2vec, x_test_word2vec, y_train, y_test)

Word2Vec Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      5529
           1       0.00      0.00      0.00      3944
           2       0.00      0.00      0.00      4230

    accuracy                           0.40     13703
   macro avg       0.13      0.33      0.19     13703
weighted avg       0.16      0.40      0.23     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      1.00      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.00      0.00      0.00      1813

    accuracy                           0.40      5873
   macro avg       0.13      0.33      0.19      5873
weighted avg       0.16      0.40      0.23      5873

Prediction time: 0.00778 seconds


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [70]:
print("FastText Results:")
train_and_evaluate(x_train_fasttext, x_test_fasttext, y_train, y_test)

FastText Results:
Train Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      5529
           1       0.67      0.00      0.00      3944
           2       0.33      0.02      0.03      4230

    accuracy                           0.40     13703
   macro avg       0.47      0.33      0.20     13703
weighted avg       0.46      0.40      0.24     13703

Test Classification Report:
              precision    recall  f1-score   support

           0       0.40      0.98      0.57      2369
           1       0.00      0.00      0.00      1691
           2       0.29      0.02      0.03      1813

    accuracy                           0.40      5873
   macro avg       0.23      0.33      0.20      5873
weighted avg       0.25      0.40      0.24      5873

Prediction time: 0.01157 seconds
