In [2]:
import importlib
import data_preprocessing  # Make sure it's already imported
importlib.reload(data_preprocessing)  # Force reload

import pandas as pd
import numpy as np
import data_preprocessing
from data_preprocessing import clean_text, correct_spelling, replace_emoji, lemmatize_text

In [3]:
df = pd.read_parquet("dataset.parquet")
label_df = df[df['label'] == True]

import pandas as pd
from sklearn.model_selection import train_test_split

df_train_label, df_test_label = train_test_split(label_df, test_size=0.2,random_state=42)

# data preprocessing, skip lemmanization for bert and sbert
def process_text_pipeline(text, country):
    text = clean_text(text)
    text = correct_spelling(text, country)
    text = replace_emoji(text, country)
    text = lemmatize_text(text, country)
    return text

# process for df_train
processed_texts = []
# Iterate through the DataFrame rows
for index, row in df_train_label.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

# Assign lists back to the DataFrame
df_train_label['processed_text_nsc'] = processed_texts

# Repeat for df_test
processed_texts = []

for index, row in df_test_label.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

df_test_label['processed_text_nsc'] = processed_texts


In [None]:
import gensim.downloader as api

# Download and load the pretrained model
model = api.load("word2vec-google-news-300")


In [7]:
import spacy

# Load language models
nlp_en = spacy.load('en_core_web_sm')
nlp_fr = spacy.load('fr_core_news_sm')

# Tokenization function
def spacy_tokenize(text, lang='en'):
    nlp = nlp_fr if lang == 'fr' else nlp_en
    return [token.text for token in nlp(text)]

# Apply tokenization based on country_name
df_train_label['tokenized_text'] = df_train_label.apply(
    lambda row: spacy_tokenize(row['processed_text_nsc'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)
df_test_label['tokenized_text'] = df_test_label.apply(
    lambda row: spacy_tokenize(row['processed_text_nsc'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)

In [10]:
def get_word2vec_embedding(tokens, model):
    """
    Compute the average Word2Vec embedding for a list of tokens.
    
    Parameters:
        tokens (list): List of tokenized words from a sentence.
        model (gensim KeyedVectors): Pretrained Word2Vec model.
    
    Returns:
        numpy array: Averaged word vector of fixed size.
    """
    valid_vectors = [model[token] for token in tokens if token in model]  # ✅ FIXED

    if valid_vectors:
        return np.mean(valid_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)  # Ensure fixed-size output

# Apply function to DataFrame
df_train_label["word2vec_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, model))
df_test_label["word2vec_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, model))


In [38]:
from sklearn.cluster import KMeans
import numpy as np

df_train_label['data_word2vec_embedding'] = df_train_label['data_word2vec_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))
# Stack embeddings properly
X = np.vstack(df_train_label['data_word2vec_embedding'].values)

# Apply K-Means clustering
k = 11  # Number of clusters, 10 topics add out of topic
kmeans = KMeans(n_clusters=k, random_state=42)
df_train_label['cluster'] = kmeans.fit_predict(X)

In [39]:
# Group by cluster and topic_id to see distribution
cluster_topic_distribution = df_train_label.groupby(['cluster', 'topic_id']).size().reset_index(name='count')

# Find the most common topic_id in each cluster
cluster_to_topic = cluster_topic_distribution.sort_values('count', ascending=False).drop_duplicates('cluster')

# Create a mapping from cluster to topic_id
cluster_topic_mapping = dict(zip(cluster_to_topic['cluster'], cluster_to_topic['topic_id']))

# Assign the matched topic_id back to the DataFrame
df_train_label['matched_topic_id'] = df_train_label['cluster'].map(cluster_topic_mapping)

# Check the mapping
print(df_train_label[['quote_text', 'cluster', 'topic_id', 'matched_topic_id']].head())

                                              quote_text  cluster  topic_id  \
38030  The face cream was easy to pump out and I didn...        4     550.0   
50679  Très bonne idée la recharge, pour partir en vo...       10     556.0   
35584  J’utilise ce produit depuis 2 mois maintenant ...        1     546.0   
44679  This set of 3 masks comes in a fully recycled ...        5     602.0   
33645  Points forts    Format automatique pratique, s...       10     546.0   

       matched_topic_id  
38030             550.0  
50679             602.0  
35584             602.0  
44679             602.0  
33645             602.0  


In [40]:
# predict for df_test
df_test_label['data_word2vec_embedding'] = df_test_label['data_word2vec_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))
X = np.vstack(df_test_label['data_word2vec_embedding'].values)#.astype(np.float64)
df_test_label['cluster'] = kmeans.predict(X)
# map to the topic id
df_test_label['matched_topic_id'] = df_test_label['cluster'].map(cluster_topic_mapping)

In [22]:
train_accuracy = np.sum(df_train_label['matched_topic_id'] == df_train_label['topic_id'])/len(df_train_label)
test_accuracy = np.sum(df_test_label['matched_topic_id'] == df_test_label['topic_id'])/len(df_test_label)
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

train accuracy: 0.2907732921254266
test accuracy: 0.28689157867628123


**tune the pretrained model**

In [23]:
from gensim.models import Word2Vec

# Convert KeyedVectors to a trainable Word2Vec model
word2vec_model = Word2Vec(vector_size=300, window=5, min_count=1)  # Keep same vector size
word2vec_model.build_vocab_from_freq(model.key_to_index)  # Load pretrained vocab
word2vec_model.build_vocab(df_train_label['tokenized_text'], update=True)  # Update vocab with new sentences


In [26]:
# this rate can be tuned
word2vec_model.train(df_train_label['tokenized_text'], total_examples=len(df_train_label['tokenized_text']), epochs=5)

(1204061, 1707595)

In [28]:
# Convert fine-tuned Word2Vec model to KeyedVectors
fine_tuned_keyed_vectors = word2vec_model.wv

df_train_label["fine_word2vec_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fine_tuned_keyed_vectors))
df_test_label["fine_word2vec_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fine_tuned_keyed_vectors))

In [34]:
train_accuracy = np.sum(df_train_label['matched_topic_id'] == df_train_label['topic_id'])/len(df_train_label)
test_accuracy = np.sum(df_test_label['matched_topic_id'] == df_test_label['topic_id'])/len(df_test_label)
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

train accuracy: 0.20198313051316721
test accuracy: 0.1916044295647695


**what if we just use the one trained by data?**

In [36]:
data_w2v_model = Word2Vec(sentences=df_train_label['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

In [37]:
data_keyed_vectors = data_w2v_model.wv

df_train_label["data_word2vec_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))
df_test_label["data_word2vec_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))

In [41]:
train_accuracy = np.sum(df_train_label['matched_topic_id'] == df_train_label['topic_id'])/len(df_train_label)
test_accuracy = np.sum(df_test_label['matched_topic_id'] == df_test_label['topic_id'])/len(df_test_label)
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

train accuracy: 0.3167213959178417
test accuracy: 0.3139325263971156


**let's try by tuning it with the full data set, perform classification on the labeled ones**

In [43]:
df = pd.read_parquet("dataset.parquet")

import pandas as pd
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2,random_state=42, stratify=df['label'] )


In [None]:
# process for df_train
processed_texts = []
# Iterate through the DataFrame rows
for index, row in df_train.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

# Assign lists back to the DataFrame
df_train['processed_text_nsc'] = processed_texts

# Repeat for df_test
processed_texts = []

for index, row in df_test.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

df_test['processed_text_nsc'] = processed_texts

In [53]:
# Apply tokenization based on country_name
df_train['tokenized_text'] = df_train.apply(
    lambda row: spacy_tokenize(row['processed_text_nsc'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)
df_test['tokenized_text'] = df_test.apply(
    lambda row: spacy_tokenize(row['processed_text_nsc'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)

In [54]:
data_w2v_model = Word2Vec(sentences=df_train['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

In [55]:
data_keyed_vectors = data_w2v_model.wv

df_train_label = df_train[df_train['label'] == True]
df_test_label = df_test[df_test['label'] == True]

df_train_label["data_word2vec_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))
df_test_label["data_word2vec_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_label["data_word2vec_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_label["data_word2vec_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, data_keyed_vectors))


In [None]:
from sklearn.cluster import KMeans
import numpy as np

df_train_label['fasttext_embedding'] = df_train_label['fasttext_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))
# Stack embeddings properly
X = np.vstack(df_train_label['fasttext_embedding'].values)

# Apply K-Means clustering
k = 11  # Number of clusters, 10 topics add out of topic
kmeans = KMeans(n_clusters=k, random_state=42)
df_train_label['cluster'] = kmeans.fit_predict(X)


# Group by cluster and topic_id to see distribution
cluster_topic_distribution = df_train_label.groupby(['cluster', 'topic_id']).size().reset_index(name='count')

# Find the most common topic_id in each cluster
cluster_to_topic = cluster_topic_distribution.sort_values('count', ascending=False).drop_duplicates('cluster')

# Create a mapping from cluster to topic_id
cluster_topic_mapping = dict(zip(cluster_to_topic['cluster'], cluster_to_topic['topic_id']))

# Assign the matched topic_id back to the DataFrame
df_train_label['matched_topic_id'] = df_train_label['cluster'].map(cluster_topic_mapping)

# predict for df_test
df_test_label['fasttext_embedding'] = df_test_label['fasttext_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))
X = np.vstack(df_test_label['fasttext_embedding'].values)#.astype(np.float64)
df_test_label['cluster'] = kmeans.predict(X)
# map to the topic id
df_test_label['matched_topic_id'] = df_test_label['cluster'].map(cluster_topic_mapping)

In [57]:
train_accuracy = np.sum(df_train_label['matched_topic_id'] == df_train_label['topic_id'])/len(df_train_label)
test_accuracy = np.sum(df_test_label['matched_topic_id'] == df_test_label['topic_id'])/len(df_test_label)
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

train accuracy: 0.3141459017448973
test accuracy: 0.31496265773886173


**try fast text**

In [60]:
from gensim.models import FastText

fasttext_model = FastText(sentences=df_train_label['processed_text_nsc'], vector_size=300, window=5, min_count=5, workers=4, epochs=10)

In [62]:
fasttext_vectors = fasttext_model.wv

df_train_label["fasttext_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fasttext_vectors))
df_test_label["fasttext_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fasttext_vectors))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train_label["fasttext_embedding"] = df_train_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fasttext_vectors))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_label["fasttext_embedding"] = df_test_label["tokenized_text"].apply(lambda tokens: get_word2vec_embedding(tokens, fasttext_vectors))


In [64]:
train_accuracy = np.sum(df_train_label['matched_topic_id'] == df_train_label['topic_id'])/len(df_train_label)
test_accuracy = np.sum(df_test_label['matched_topic_id'] == df_test_label['topic_id'])/len(df_test_label)
print('train accuracy:', train_accuracy)
print('test accuracy:', test_accuracy)

train accuracy: 0.2061039211898783
test accuracy: 0.2142673190831831
