# Data Preprocessing

**detect language**

In [86]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0  # Ensure consistent results

# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except:
        return 'unknown'  # Handle cases where detection fails

# Apply language detection
df['detected_language'] = df['quote_text'].apply(detect_language)

# Compare detected language with the country
mismatched_language = df[(df['country_name'] == 'France') & (df['detected_language'] == 'en')]

# Display mismatched examples
print(mismatched_language[['quote_text', 'country_name', 'detected_language']].head())


                                              quote_text country_name  \
3694   A PORTER LA JOURNEE POUR TRAVAILLER , EN SOIRE...       France   
4749   L'OFFRE NE CORRESPOND PAS A UN LOT DE TROIS ET...       France   
5320                                        Tarif normal       France   
5343   Points faibles    -NE SÉPARE PAS LES CILS    -...       France   
10515        Packaging simple, list inkey plutôt propre.       France   

      detected_language  
3694                 en  
4749                 en  
5320                 en  
5343                 en  
10515                en  


In [89]:
mismatched_language.to_csv('mismatched_language.csv')

**Train test split**

In [13]:
import pandas as pd

#load the data
df = pd.read_parquet("dataset.parquet")

# do train-test split
# since the distribution of languages, labels and topics are quite uniform
# I just do simple train-test split for now
import pandas as pd
from sklearn.model_selection import train_test_split


# Perform train-test split on the full DataFrame
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Check dataset sizes
print(f"Training set size: {df_train.shape}")
print(f"Test set size: {df_test.shape}")


Training set size: (88911, 10)
Test set size: (22228, 10)


In [14]:
# check the distribution on each data set
print("Label distribution in training set:")
print(df_train['label'].value_counts(normalize=True))

print("\nLabel distribution in test set:")
print(df_test['label'].value_counts(normalize=True))

print("Language distribution in training set:")
print(df_train['country_name'].value_counts(normalize=True))

print("\nLanguage distribution in test set:")
print(df_test['country_name'].value_counts(normalize=True))

print("Topic distribution in training set:")
print(df_train['topic_id'].value_counts(normalize=True))

print("\nTopic distribution in test set:")
print(df_test['topic_id'].value_counts(normalize=True))

Label distribution in training set:
label
False    0.82532
True     0.17468
Name: proportion, dtype: float64

Label distribution in test set:
label
False    0.82531
True     0.17469
Name: proportion, dtype: float64
Language distribution in training set:
country_name
United Kingdom    0.510567
France            0.489433
Name: proportion, dtype: float64

Language distribution in test set:
country_name
United Kingdom    0.513227
France            0.486773
Name: proportion, dtype: float64
Topic distribution in training set:
topic_id
554.0    0.103215
602.0    0.103001
547.0    0.102765
544.0    0.102401
546.0    0.102144
550.0    0.102101
552.0    0.101930
543.0    0.101544
556.0    0.101330
600.0    0.079569
Name: proportion, dtype: float64

Topic distribution in test set:
topic_id
543.0    0.106677
552.0    0.105152
550.0    0.104474
546.0    0.104304
544.0    0.103288
547.0    0.101847
602.0    0.100915
554.0    0.100068
556.0    0.095238
600.0    0.078038
Name: proportion, dtype: float

**clean the text**

In [34]:
# check null and replace them with empty string
# and also strip the text
def clean_text(text):
    text = str(text).strip() if pd.notna(text) else ""  # Handle NaN & strip spaces
    return text

**correct wrong spelling**

In [None]:
!pip install pyspellchecker

In [17]:
from spellchecker import SpellChecker

# Load spell checkers for English and French
spell_en = SpellChecker(language="en")
spell_fr = SpellChecker(language="fr")

# function to correct spelling
def correct_tokens(tokens, spell):
    """
    Corrects spelling of word tokens while keeping punctuation untouched.
    """
    corrected_tokens = []
    for token in tokens:
        if token.isalpha():  # Only check spelling for words
            corrected_word = spell.correction(token)
            corrected_tokens.append(corrected_word if corrected_word else token)  # Keep original if no correction
        else:
            corrected_tokens.append(token)  # Leave punctuation untouched
    return corrected_tokens

# function to assign spellcheck according to country
def correct_spelling(text, country):
    spell = spell_fr if country == 'France' else spell_en  # Choose correct spell checker
    
    tokens = text.split()  # Tokenize text
    corrected_tokens = correct_tokens(tokens, spell)  # Apply spell checking
    return " ".join(corrected_tokens)  # Convert back to string

# Test with sample rows
sample_rows = df.sample(n=5, random_state=42)

for _, row in sample_rows.iterrows():
    original_text = row['quote_text']
    country = row['country_name']
    corrected_text = correct_spelling(original_text, country)

    print(f"Country: {country}")
    print(f"Original : {original_text}")
    print(f"Corrected: {corrected_text}")
    print("-" * 50)  # Separator for readability


Country: France
Original : Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
Corrected: Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
--------------------------------------------------
Country: United Kingdom
Original : but i found that this one was SO much better!!
Corrected: but i found that this one was SO much better!!
--------------------------------------------------
Country: United Kingdom
Original : It makes it ooze as soon as you uncap it and it makes a mess everywhere.
Corrected: It makes it ooze as soon as you uncap it and it makes a mess everywhere.
--------------------------------------------------
Country: United Kingdom
Original : I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and gives the most gorgeous glow and pillow talk hue to the skin!!
Corrected: I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and give

**replace emoji with the corresponding meaning in each language**

In [None]:
!pip install emoji

In [22]:
import emoji

# Function to replace emoji with text based on country
def replace_emoji(text, country):
    lang = "fr" if country == "France" else "en"  # Choose French or English
    return emoji.demojize(text, language=lang).replace(":", "").replace("_", " ")  # Clean up output


Country: France
Original : - Flacon à la contenance généreuse.
With Emoji Text: - Flacon à la contenance généreuse.
--------------------------------------------------
Country: France
Original : Je conseille ce produit pour les peaux très sèches et fragiles.
With Emoji Text: Je conseille ce produit pour les peaux très sèches et fragiles.
--------------------------------------------------
Country: United Kingdom
Original : I have shaken the moisturiser vigorously before use, and it makes a clicking sound, suggesting there might be a small sphere inside for mixing.
With Emoji Text: I have shaken the moisturiser vigorously before use, and it makes a clicking sound, suggesting there might be a small sphere inside for mixing.
--------------------------------------------------
Country: United Kingdom
Original : I have no complaints about this product.
With Emoji Text: I have no complaints about this product.
--------------------------------------------------
Country: United Kingdom
Original :

In [23]:
# Function to check if a text contains emojis
def contains_emoji(text):
    text = str(text)  # Convert to string to avoid errors
    return any(char in emoji.EMOJI_DATA for char in text)

# Print rows containing emojis
emoji_rows = df[df['quote_text'].apply(contains_emoji)]
print(emoji_rows[['quote_text']])

# try to see if we have replace them
for _, row in emoji_rows.head().iterrows():
    original_text = row['quote_text']
    country = row['country_name']
    replaced_text = replace_emoji(original_text, country)

    print(f"Country: {country}")
    print(f"Original : {original_text}")
    print(f"With Emoji Text: {replaced_text}")
    print("-" * 50)


                                               quote_text
22      I really loved the packaging of this product i...
108     The bottle is so pretty 😍 it's got a bit of a ...
233     🥰❤️It has a unique aroma, I love it, and it is...
445     but compared to my other selection of aftersha...
1287    From the intriguing  packaging topped with a r...
...                                                   ...
110763                       ❤️ 5 starts for me for sure!
110860  Brosse arrivée cassée, à la base de la brosse ...
110903                    Et notre coiffeuse en a aussi 🤩
110958                                Just as described 😎
110987                     very fast delivery excellent 😊

[1049 rows x 1 columns]
Country: United Kingdom
Original : I really loved the packaging of this product it's unique and beautiful 😍.
With Emoji Text: I really loved the packaging of this product it's unique and beautiful smiling face with heart-eyes.
--------------------------------------------------
C

**lemmatization**

In [None]:
!pip install spacy

In [32]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
!pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h

In [33]:
import spacy

# Load English & French models
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

# Function to lemmatize text based on country
def lemmatize_text(text, country):
    nlp = nlp_fr if country == "France" else nlp_en  # Choose model
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])  # Get lemmatized words

# Select a few random samples
sample_rows = df.sample(n=5, random_state=42)

for _, row in sample_rows.iterrows():
    original_text = row['quote_text']
    country = row['country_name']
    lemmatized_text = lemmatize_text(original_text, country)

    print(f"Country: {country}")
    print(f"Original : {original_text}")
    print(f"Lemmatized: {lemmatized_text}")
    print("-" * 50)


Country: France
Original : Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
Lemmatized: idéal pour créer un variété de coiffure sans tirer ni abîmer le cheveu
--------------------------------------------------
Country: United Kingdom
Original : but i found that this one was SO much better!!
Lemmatized: but I find that this one be so much well ! !
--------------------------------------------------
Country: United Kingdom
Original : It makes it ooze as soon as you uncap it and it makes a mess everywhere.
Lemmatized: it make it ooze as soon as you uncap it and it make a mess everywhere .
--------------------------------------------------
Country: United Kingdom
Original : I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and gives the most gorgeous glow and pillow talk hue to the skin!!
Lemmatized: I be in love with the beauty light wand and this one be no different it ’ the perfect highlight shade and give the

**n-gram**

In [65]:
from itertools import islice

# Function to generate n-grams
def generate_ngrams(text, n=2):
    words = text.split()
    ngrams = [" ".join(words[i:i+n]) for i in range(len(words)-n+1)]
    return ngrams

# Function to expand text with n-grams
def add_ngrams(text, max_n=3):
    words = text.split()
    all_ngrams = words[:]  # Keep original unigrams
    
    for n in range(2, max_n+1):  # Generate bigrams, trigrams
        all_ngrams.extend([" ".join(words[i:i+n]) for i in range(len(words)-n+1)])

    return " ".join(all_ngrams)  # Join all n-grams into a single string




In [70]:
test_text = "This perfume smells amazing and lasts long"

print("Original Text:", test_text)

ngram_text = add_ngrams(test_text, max_n=3)
print("\n After N-Grams Processing:", ngram_text)


Original Text: This perfume smells amazing and lasts long

 After N-Grams Processing: This perfume smells amazing and lasts long This perfume perfume smells smells amazing amazing and and lasts lasts long This perfume smells perfume smells amazing smells amazing and amazing and lasts and lasts long


In [69]:
# Select a small random sample
df_sample = df.sample(n=5, random_state=42).copy()

# Apply pipeline on the sample (only process text, exclude embeddings for now)
df_sample['processed_text'] = df_sample.apply(
    lambda row: add_ngrams(row['quote_text'], 2), axis=1
)

# Display results
print(df_sample[[ 'processed_text']])


                                          processed_text
67236  Idéal pour créer une variété de coiffures sans...
76279  but i found that this one was SO much better!!...
8525   It makes it ooze as soon as you uncap it and i...
18197  I am in love with the beauty light wands and t...
74427  Je ne regrette pas du tout mon achat. Je ne ne...


**create a pipeline**

In [79]:
def process_text_pipeline(text, country):
    text = clean_text(text)
    text = correct_spelling(text, country)
    text = replace_emoji(text, country)
    text = lemmatize_text(text, country)
    text = add_ngrams(text, 2) # create bigram for now
    return text

In [80]:
processed_texts = []

# Iterate through the DataFrame rows
for index, row in df_train.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

# Assign lists back to the DataFrame
df_train['processed_text_nsc'] = processed_texts


In [81]:
# Repeat for df_test
processed_texts = []

for index, row in df_test.iterrows():
    processed_text = process_text_pipeline(row['quote_text'], row['country_name'])
    processed_texts.append(processed_text)

df_test['processed_text_nsc'] = processed_texts

In [82]:
df_train.to_csv('df_train.csv', index=False)
df_test.to_csv('df_test.csv', index = False)

**create a version without spellcheck, as spellcheck might result in brandname etc being replaced? should not be, as if it return None we keep the original ones.**

# some fine tunning attempt on data preprocessing

**out of vocabulary detection**

In [None]:
from spellchecker import SpellChecker
import pandas as pd

# Load spell checkers for English and French
spell_en = SpellChecker(language="en")
spell_fr = SpellChecker(language="fr")

# Dictionary to collect unrecognized words
unrecognized_words = []

# Function to correct tokens and collect unrecognized words
def correct_tokens(tokens, spell, original_text, country):
    corrected_tokens = []
    for token in tokens:
        if token.isalpha():  # Only check spelling for words
            corrected_word = spell.correction(token)
            if corrected_word is None:
                # Collect unrecognized token with original quote and country
                unrecognized_words.append({
                    'Unrecognized_Word': token,
                    'Original_Quote': original_text,
                    'Country': country
                })
            corrected_tokens.append(corrected_word if corrected_word else token)  # Keep original if no correction
        else:
            corrected_tokens.append(token)  # Leave punctuation untouched
    return corrected_tokens

# Function to assign spellchecker based on country
def correct_spelling(text, country):
    spell = spell_fr if country == 'France' else spell_en  # Choose correct spell checker
    tokens = text.split()  # Tokenize text
    corrected_tokens = correct_tokens(tokens, spell, text, country)  # Apply spell checking and collect unrecognized words
    return " ".join(corrected_tokens)  # Convert back to string

# Apply the correction function to the dataset (assuming 'df' is your DataFrame)
_ = df.apply(lambda row: correct_spelling(row['quote_text'], row['country_name']), axis=1)

# Convert unrecognized words into a DataFrame
unrecognized_df = pd.DataFrame(unrecognized_words)

# Display the first few unrecognized words
print(unrecognized_df.head())


In [None]:
# Function to detect short quotes (1 or 2 words)
def is_short_quote(text):
    return 1 <= len(text.strip().split()) <= 2

# Filter short quotes
short_quotes = df[df['quote_text'].apply(is_short_quote)]

# Display a few examples
print(short_quotes[['quote_text']].head())


In [None]:
unrecognized_df.to_csv('unrecognized_words.csv')
short_quotes.to_csv('short_quotes.csv')

# try different embedding

In [25]:
import pandas as pd
df_train = pd.read_csv('df_train.csv')
df_test = pd.read_csv('df_test.csv')

**Do Kmeans, where topic id are given by majority vote of clusters**

In [56]:
from sklearn.cluster import KMeans
import numpy as np

# Prepare SBERT embeddings
# Convert embeddings from lists to NumPy arrays
df_train['word2vec_embedding'] = df_train['word2vec_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))
# Stack embeddings properly
X = np.vstack(df_train['word2vec_embedding'].values)

# Apply K-Means clustering
k = 11  # Number of clusters, 10 topics add out of topic
kmeans = KMeans(n_clusters=k, random_state=42)
df_train['cluster'] = kmeans.fit_predict(X)

In [57]:
# Check cluster assignments
print(df_train[['quote_text', 'cluster', 'topic_id']].head())

# Group by cluster and topic_id to see distribution
cluster_topic_distribution = df_train.groupby(['cluster', 'topic_id']).size().reset_index(name='count')

# Find the most common topic_id in each cluster
cluster_to_topic = cluster_topic_distribution.sort_values('count', ascending=False).drop_duplicates('cluster')

# Create a mapping from cluster to topic_id
cluster_topic_mapping = dict(zip(cluster_to_topic['cluster'], cluster_to_topic['topic_id']))

# Assign the matched topic_id back to the DataFrame
df_train['matched_topic_id'] = df_train['cluster'].map(cluster_topic_mapping)

# Check the mapping
print(df_train[['quote_text', 'cluster', 'topic_id', 'matched_topic_id']].head())


                                          quote_text  cluster  topic_id
0  J'ai d'abord cru à une lotion nettoyante, hydr...        0     552.0
1  Je regrette juste que la recharge soit en plas...        0     554.0
2  The packaging comes in a cute sturdy box, and ...        8     602.0
3                                      Inadmissible.        6       NaN
4  J'ai une dermathose séboréique, et le shampooi...        0       NaN
                                          quote_text  cluster  topic_id  \
0  J'ai d'abord cru à une lotion nettoyante, hydr...        0     552.0   
1  Je regrette juste que la recharge soit en plas...        0     554.0   
2  The packaging comes in a cute sturdy box, and ...        8     602.0   
3                                      Inadmissible.        6       NaN   
4  J'ai une dermathose séboréique, et le shampooi...        0       NaN   

   matched_topic_id  
0             556.0  
1             556.0  
2             543.0  
3             602.0  
4      

## sbert

In [27]:
# Sentence-BERT (SBERT)
from sentence_transformers import SentenceTransformer

# Load pre-trained SBERT model
sbert_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

def get_sbert_embedding(text):
    """Generates sentence embedding using SBERT."""
    return sbert_model.encode(text)


  from .autonotebook import tqdm as notebook_tqdm


In [28]:
def embedding_text (text, function):
    return function(text)

In [None]:
embedded_texts = []

# Iterate through the DataFrame rows
for index, row in df_train.iterrows():
    embedded_text = embedding_text(row['processed_text'],get_sbert_embedding)
    embedded_texts.append(embedded_text)

# Assign lists back to the DataFrame
df_train['sbert_embedding'] = embedded_texts

In [29]:
embedded_texts = []

# Iterate through the DataFrame rows
for index, row in df_test.iterrows():
    embedded_text = embedding_text(row['processed_text'],get_sbert_embedding)
    embedded_texts.append(embedded_text)

# Assign lists back to the DataFrame
df_test['sbert_embedding'] = embedded_texts

In [46]:
print(df_test['sbert_embedding'])

0        [0.31978327, 0.14027788, 0.08224525, 0.0696379...
1        [0.093407236, 0.1766506, 0.046390157, 0.009621...
2        [0.5187371, 0.14435184, -0.37031856, -0.191911...
3        [0.14612241, 0.2545923, 0.3480164, 0.17786138,...
4        [-0.029019503, 0.034514595, -0.14274204, -0.01...
                               ...                        
22223    [0.08779375, 0.022377515, 0.18306349, 0.192810...
22224    [-0.11827949, 0.011398915, 0.10562416, 0.21336...
22225    [0.07385143, 0.12553537, 0.030955251, -0.06214...
22226    [0.058854192, 0.008282426, 0.09277359, -0.2619...
22227    [0.1882922, 0.1822469, 0.2885761, 0.09011534, ...
Name: sbert_embedding, Length: 22228, dtype: object


In [42]:
# Filter labeled data (with known topic_id)
labeled_data = df_train[df_train['topic_id'].notnull()]

# Define the conditions
true_positive = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
true_negative = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])
false_positive = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
false_negative = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])

# Calculate accuracy
correct_predictions = true_positive | true_negative
combined_accuracy = correct_predictions.mean()

# Display accuracy
print(f"Overall Accuracy (Topic Matching + Labels): {combined_accuracy:.2%}")

# Optional: Display counts for TP, TN, FP, FN
print(f"True Positive (TP): {true_positive.sum()}")
print(f"True Negative (TN): {true_negative.sum()}")
print(f"False Positive (FP): {false_positive.sum()}")
print(f"False Negative (FN): {false_negative.sum()}")


Overall Accuracy (Topic Matching + Labels): 57.20%
True Positive (TP): 2079
True Negative (TN): 24629
False Positive (FP): 6529
False Negative (FN): 13452


In [53]:
#df_test['sbert_embedding'] = df_test['sbert_embedding'].apply(lambda x: np.fromstring(x.strip("[]"), sep=" ") if isinstance(x, str) else np.array(x))

# Stack embeddings into a NumPy array
X = np.vstack(df_test['sbert_embedding'].values).astype(np.float64)

# Apply K-Means prediction
df_test['cluster'] = kmeans.predict(X)

df_test['matched_topic_id'] = df_test['cluster'].map(cluster_topic_mapping)

In [54]:
labeled_data = df_test[df_train['topic_id'].notnull()]

# Define the conditions
true_positive = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
true_negative = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])
false_positive = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
false_negative = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])

# Calculate accuracy
correct_predictions = true_positive | true_negative
combined_accuracy = correct_predictions.mean()

# Display accuracy
print(f"Overall Accuracy (Topic Matching + Labels): {combined_accuracy:.2%}")

# Optional: Display counts for TP, TN, FP, FN
print(f"True Positive (TP): {true_positive.sum()}")
print(f"True Negative (TN): {true_negative.sum()}")
print(f"False Positive (FP): {false_positive.sum()}")
print(f"False Negative (FN): {false_negative.sum()}")


Overall Accuracy (Topic Matching + Labels): 77.26%
True Positive (TP): 283
True Negative (TN): 8777
False Positive (FP): 897
False Negative (FN): 1769


  labeled_data = df_test[df_train['topic_id'].notnull()]


In [55]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Convert matching results into binary form
y_true = (df_test['label'] == True)  # True labels
y_pred = (df_test['matched_topic_id'] == df_test['topic_id'])  # Model's predictions

# Compute confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred) # TP/(TP+FP)
recall = recall_score(y_true, y_pred)  # This is TP / (TP + FN)
f1 = f1_score(y_true, y_pred)

# Display results
print(f"Confusion Matrix:\n TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall (Correct Topic Retrieval Rate): {recall:.2%}")
print(f"F1-Score: {f1:.2%}")


Confusion Matrix:
 TP: 544, FP: 1669, FN: 3339, TN: 16676
Accuracy: 77.47%
Precision: 24.58%
Recall (Correct Topic Retrieval Rate): 14.01%
F1-Score: 17.85%


In [23]:
df_train.to_csv('df_train.csv')

## bert

In [None]:
# BERT (Base Model + CLS Token)
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
bert_model = BertModel.from_pretrained('bert-base-multilingual-cased')

def get_bert_cls_embedding(text):
    """Generates sentence embedding using the CLS token from BERT."""
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # CLS token embedding

# Example usage
text = "This is a sample text."
bert_cls_embedding = get_bert_cls_embedding(text)

# Display the shape of the embedding
print("BERT CLS Token Embedding Shape:", bert_cls_embedding.shape)


## word2vec

In [3]:
! pip install smart_open



In [12]:
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [17]:
# Count NaN values in the processed_text column
nan_count = df_train['processed_text'].isna().sum()

print(f"Number of NaN values in 'processed_text': {nan_count}")

# Replace NaN with an empty string
df_train['processed_text'] = df_train['processed_text'].fillna('')


Number of NaN values in 'processed_text': 2


In [18]:
import spacy

# Load language models
nlp_en = spacy.load('en_core_web_sm')
nlp_fr = spacy.load('fr_core_news_sm')

# Tokenization function
def spacy_tokenize(text, lang='en'):
    nlp = nlp_fr if lang == 'fr' else nlp_en
    return [token.text for token in nlp(text)]

# Apply tokenization based on country_name
df_train['tokenized_text'] = df_train.apply(
    lambda row: spacy_tokenize(row['processed_text'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)

# Check tokenized output
print(df_train[['processed_text', 'tokenized_text']].head())


                                      processed_text  \
0  je avoir de abord croire à un lotion nettoyant...   
1  je regretter juste que le recharge être en pla...   
2  the packaging come in a cute sturdy box , and ...   
3                      Inadmissible . Inadmissible .   
4  je avoir un dermatose séboréiqu , et le shampo...   

                                      tokenized_text  
0  [je, avoir, de, abord, croire, à, un, lotion, ...  
1  [je, regretter, juste, que, le, recharge, être...  
2  [the, packaging, come, in, a, cute, sturdy, bo...  
3                 [Inadmissible, ., Inadmissible, .]  
4  [je, avoir, un, dermatose, séboréiqu, ,, et, l...  


In [19]:
# Word2Vec Embedding
import smart_open
from gensim.models import Word2Vec
import numpy as np

# Tokenize the processed text
#df_train['tokenized_text'] = df_train['processed_text'].apply(word_tokenize)

# Train the Word2Vec model
# from scratch
w2v_model = Word2Vec(sentences=df_train['tokenized_text'], vector_size=100, window=5, min_count=1, workers=4)

# Function to get the average Word2Vec embedding for each quote
def get_word2vec_embedding(tokens, model):
    # Filter out tokens not in the model's vocabulary
    valid_tokens = [token for token in tokens if token in model.wv]
    
    if valid_tokens:
        return np.mean(model.wv[valid_tokens], axis=0)
    else:
        # Return a zero vector if no valid tokens are found
        return np.zeros(model.vector_size)

# Apply the function to get embeddings
df_train['word2vec_embedding'] = df_train['tokenized_text'].apply(lambda tokens: get_word2vec_embedding(tokens, w2v_model))

# Display some embeddings
print(df_train[['processed_text', 'word2vec_embedding']].head())


                                      processed_text  \
0  je avoir de abord croire à un lotion nettoyant...   
1  je regretter juste que le recharge être en pla...   
2  the packaging come in a cute sturdy box , and ...   
3                      Inadmissible . Inadmissible .   
4  je avoir un dermatose séboréiqu , et le shampo...   

                                  word2vec_embedding  
0  [0.8978163, 0.121954896, 0.30679128, 1.1866511...  
1  [1.5041182, 0.74539626, 0.66749245, 0.8765296,...  
2  [-1.3043804, -1.1020962, -0.007926899, -0.8447...  
3  [0.5184143, -0.92831206, -0.88179547, -0.12716...  
4  [0.76718295, -0.15469556, 0.87029487, 1.191067...  


In [22]:
# Filter labeled data (with known topic_id)
labeled_data = df_train[df_train['topic_id'].notnull()]

# Define the conditions
true_positive = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
true_negative = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])
false_positive = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
false_negative = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])

# Calculate accuracy
correct_predictions = true_positive | true_negative
combined_accuracy = correct_predictions.mean()

# Display accuracy
print(f"Overall Accuracy (Topic Matching + Labels): {combined_accuracy:.2%}")

# Optional: Display counts for TP, TN, FP, FN
print(f"True Positive (TP): {true_positive.sum()}")
print(f"True Negative (TN): {true_negative.sum()}")
print(f"False Positive (FP): {false_positive.sum()}")
print(f"False Negative (FN): {false_negative.sum()}")


Overall Accuracy (Topic Matching + Labels): 59.53%
True Positive (TP): 2332
True Negative (TN): 25464
False Positive (FP): 5694
False Negative (FN): 13199


In [30]:
df_test['tokenized_text'] = df_test.apply(
    lambda row: spacy_tokenize(row['processed_text'], 'fr' if row['country_name'] == 'France' else 'en'),
    axis=1
)
df_test['word2vec_embedding'] = df_test['tokenized_text'].apply(lambda tokens: get_word2vec_embedding(tokens, w2v_model))


In [58]:
# predict for df_test
X = np.vstack(df_test['word2vec_embedding'].values)
df_test['cluster'] = kmeans.predict(X)
# map to the topic id
df_test['matched_topic_id'] = df_test['cluster'].map(cluster_topic_mapping)

In [59]:
# Filter labeled data (with known topic_id)
labeled_data = df_test[df_test['topic_id'].notnull()]

# Define the conditions
true_positive = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
true_negative = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])
false_positive = (labeled_data['label'] == False) & (labeled_data['matched_topic_id'] == labeled_data['topic_id'])
false_negative = (labeled_data['label'] == True) & (labeled_data['matched_topic_id'] != labeled_data['topic_id'])

# Calculate accuracy
correct_predictions = true_positive | true_negative
combined_accuracy = correct_predictions.mean()

# Display accuracy
print(f"Overall Accuracy (Topic Matching + Labels): {combined_accuracy:.2%}")

# Optional: Display counts for TP, TN, FP, FN
print(f"True Positive (TP): {true_positive.sum()}")
print(f"True Negative (TN): {true_negative.sum()}")
print(f"False Positive (FP): {false_positive.sum()}")
print(f"False Negative (FN): {false_negative.sum()}")

Overall Accuracy (Topic Matching + Labels): 59.96%
True Positive (TP): 608
True Negative (TN): 6469
False Positive (FP): 1450
False Negative (FN): 3275


In [60]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix

# Convert matching results into binary form
y_true = (df_test['label'] == True)  # True labels
y_pred = (df_test['matched_topic_id'] == df_test['topic_id'])  # Model's predictions

# Compute confusion matrix
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

# Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred) # TP/(TP+FP)
recall = recall_score(y_true, y_pred)  # This is TP / (TP + FN)
f1 = f1_score(y_true, y_pred)

# Display results
print(f"Confusion Matrix:\n TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f"Accuracy: {accuracy:.2%}")
print(f"Precision: {precision:.2%}")
print(f"Recall (Correct Topic Retrieval Rate): {recall:.2%}")
print(f"F1-Score: {f1:.2%}")


Confusion Matrix:
 TP: 608, FP: 1450, FN: 3275, TN: 16895
Accuracy: 78.74%
Precision: 29.54%
Recall (Correct Topic Retrieval Rate): 15.66%
F1-Score: 20.47%


**try different embedding, know the key structure and intuition behind!!**

In [70]:
import numpy as np

try:
    data = np.load('word2vec-google-news-300.model.vectors.npy', allow_pickle=True)
    print(data.shape)  # Check the shape of the array
except Exception as e:
    print(f"Error loading .npy file: {e}")


Error loading .npy file: cannot reshape array of size 696254432 into shape (3000000,300)


In [None]:
# Load pre-trained model (e.g., Google News vectors)
import gensim

# Load pre-trained Word2Vec model.
model = gensim.models.Word2Vec.load("word2vec-google-news-300.model")
#from gensim.models import KeyedVectors
#w2v_model_pretrained = KeyedVectors.load_word2vec_format('word2vec-google-news-300.bin', binary = True)

# Fine-tune on your data
w2v_model_pretrained.build_vocab(df_train['tokenized_text'], update=True)
w2v_model_pretrained.train(df_train['tokenized_text'], total_examples=w2v_model_pretrained.corpus_count, epochs=5)
