**Train test split**

In [13]:
import pandas as pd

# Read the full dataset from disk.
df = pd.read_parquet("dataset.parquet")

# Hold out 20% of the rows as a test set. The split is stratified on `label`
# so both partitions keep the same class ratio; language and topic are left
# to the random shuffle for now.
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Report the shape (rows, columns) of each partition.
print(f"Training set size: {df_train.shape}")
print(f"Test set size: {df_test.shape}")


Training set size: (88911, 10)
Test set size: (22228, 10)


In [14]:
# Compare the normalized value counts of each key column between the two
# splits to confirm that they have similar distributions.
# NOTE: the "Language" report is derived from the `country_name` column.
for heading, column in (
    ("Label", 'label'),
    ("Language", 'country_name'),
    ("Topic", 'topic_id'),
):
    print(f"{heading} distribution in training set:")
    print(df_train[column].value_counts(normalize=True))

    print(f"\n{heading} distribution in test set:")
    print(df_test[column].value_counts(normalize=True))

Label distribution in training set:
label
False    0.82532
True     0.17468
Name: proportion, dtype: float64

Label distribution in test set:
label
False    0.82531
True     0.17469
Name: proportion, dtype: float64
Language distribution in training set:
country_name
United Kingdom    0.510567
France            0.489433
Name: proportion, dtype: float64

Language distribution in test set:
country_name
United Kingdom    0.513227
France            0.486773
Name: proportion, dtype: float64
Topic distribution in training set:
topic_id
554.0    0.103215
602.0    0.103001
547.0    0.102765
544.0    0.102401
546.0    0.102144
550.0    0.102101
552.0    0.101930
543.0    0.101544
556.0    0.101330
600.0    0.079569
Name: proportion, dtype: float64

Topic distribution in test set:
topic_id
543.0    0.106677
552.0    0.105152
550.0    0.104474
546.0    0.104304
544.0    0.103288
547.0    0.101847
602.0    0.100915
554.0    0.100068
556.0    0.095238
600.0    0.078038
Name: proportion, dtype: float

**clean the text**

In [34]:
# Normalize a raw cell value into a plain, whitespace-trimmed string.
def clean_text(text):
    """
    Return ``text`` as a stripped string, mapping missing values to "".

    Anything pandas treats as missing (None, NaN, ...) becomes the empty
    string; every other value is converted with ``str`` and has its leading
    and trailing whitespace removed.
    """
    if pd.notna(text):
        return str(text).strip()
    return ""

**correct wrong spelling**

In [None]:
!pip install pyspellchecker

In [17]:
from spellchecker import SpellChecker

# Build one spell checker per supported language at cell level so that
# correct_spelling() can pick between them without re-creating them per call.
spell_en = SpellChecker(language="en")
spell_fr = SpellChecker(language="fr")

# Spell-correct a list of tokens with the given checker.
def correct_tokens(tokens, spell):
    """
    Return a new list in which purely alphabetic tokens are spell-corrected.

    tokens: iterable of string tokens.
    spell:  object exposing ``correction(word)``; a falsy result (for
            example None) means "no suggestion" and the token is kept as-is.

    Tokens that are not entirely alphabetic (numbers, punctuation, words with
    punctuation attached) are passed through unchanged.
    """
    def _fix(token):
        if not token.isalpha():
            return token  # Not a plain word: leave untouched
        suggestion = spell.correction(token)
        return suggestion or token  # Fall back to the original token

    return [_fix(token) for token in tokens]

# Spell-correct a text with the checker that matches its country.
def correct_spelling(text, country):
    """
    Return ``text`` with its whitespace-separated tokens spell-corrected.

    The French checker is used when ``country`` is 'France'; every other
    country falls back to the English checker. Because the text is split on
    whitespace and re-joined with single spaces, runs of whitespace are
    collapsed in the result.
    """
    if country == 'France':
        checker = spell_fr
    else:
        checker = spell_en

    words = correct_tokens(text.split(), checker)
    return " ".join(words)

# Spot-check the spell checker on a small, reproducible sample of rows.
sample_rows = df.sample(n=5, random_state=42)

for original_text, country in zip(sample_rows['quote_text'], sample_rows['country_name']):
    corrected_text = correct_spelling(original_text, country)

    print(
        f"Country: {country}",
        f"Original : {original_text}",
        f"Corrected: {corrected_text}",
        "-" * 50,  # Separator for readability
        sep="\n",
    )


Country: France
Original : Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
Corrected: Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
--------------------------------------------------
Country: United Kingdom
Original : but i found that this one was SO much better!!
Corrected: but i found that this one was SO much better!!
--------------------------------------------------
Country: United Kingdom
Original : It makes it ooze as soon as you uncap it and it makes a mess everywhere.
Corrected: It makes it ooze as soon as you uncap it and it makes a mess everywhere.
--------------------------------------------------
Country: United Kingdom
Original : I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and gives the most gorgeous glow and pillow talk hue to the skin!!
Corrected: I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and give

**replace emoji with the corresponding meaning in each language**

In [None]:
!pip install emoji

In [22]:
import emoji

# Function to replace emoji with text based on country
def replace_emoji(text, country):
    """
    Replace every emoji in ``text`` with its textual name.

    The French emoji names are used when ``country`` is "France"; any other
    country uses the English names. Underscores inside an emoji name are
    turned into spaces. Unlike a blanket ``.replace(":", "")`` /
    ``.replace("_", " ")`` on the whole string, colons and underscores that
    belong to the ordinary text are left untouched, and a single space is
    inserted where an emoji name would otherwise be glued to a neighbouring
    word or to another emoji name.

    Assumes ``text`` is a string that does not already contain the two
    private-use sentinel characters used internally as delimiters.
    """
    lang = "fr" if country == "France" else "en"  # Choose French or English

    # Private-use code points act as delimiters so that emoji names can be
    # told apart from ':' characters that are part of the original text.
    open_mark, close_mark = "\ue000", "\ue001"
    marked = emoji.demojize(text, delimiters=(open_mark, close_mark), language=lang)

    # The first piece is plain text; each later piece is
    # "<emoji name><close_mark><plain text following the emoji>".
    plain, *emoji_chunks = marked.split(open_mark)
    pieces = [plain]
    prev_char = plain[-1:]  # "" when nothing precedes the first emoji

    for chunk in emoji_chunks:
        name, _, trailing = chunk.partition(close_mark)
        description = name.replace("_", " ")

        if prev_char.isalnum():
            pieces.append(" ")  # keep the name apart from the previous word/emoji
        pieces.append(description)
        if trailing[:1].isalnum():
            pieces.append(" ")  # keep the name apart from the following word
        pieces.append(trailing)

        prev_char = (trailing or description)[-1:]

    return "".join(pieces)


Country: France
Original : - Flacon à la contenance généreuse.
With Emoji Text: - Flacon à la contenance généreuse.
--------------------------------------------------
Country: France
Original : Je conseille ce produit pour les peaux très sèches et fragiles.
With Emoji Text: Je conseille ce produit pour les peaux très sèches et fragiles.
--------------------------------------------------
Country: United Kingdom
Original : I have shaken the moisturiser vigorously before use, and it makes a clicking sound, suggesting there might be a small sphere inside for mixing.
With Emoji Text: I have shaken the moisturiser vigorously before use, and it makes a clicking sound, suggesting there might be a small sphere inside for mixing.
--------------------------------------------------
Country: United Kingdom
Original : I have no complaints about this product.
With Emoji Text: I have no complaints about this product.
--------------------------------------------------
Country: United Kingdom
Original :

In [23]:
# Detect whether a value contains at least one emoji character.
def contains_emoji(text):
    """
    Return True if any single character of ``str(text)`` is a key of
    ``emoji.EMOJI_DATA``, otherwise False.

    The value is converted with ``str`` first, so non-string inputs (numbers,
    None, ...) are accepted.
    """
    known_emoji = emoji.EMOJI_DATA
    for char in str(text):
        if char in known_emoji:
            return True
    return False

# Collect every row whose quote contains at least one emoji character.
emoji_rows = df[df['quote_text'].apply(contains_emoji)]
print(emoji_rows[['quote_text']])

# Show the first few of those rows before and after emoji replacement.
for original_text, country in emoji_rows.head()[['quote_text', 'country_name']].itertuples(index=False, name=None):
    replaced_text = replace_emoji(original_text, country)

    print(
        f"Country: {country}",
        f"Original : {original_text}",
        f"With Emoji Text: {replaced_text}",
        "-" * 50,
        sep="\n",
    )


                                               quote_text
22      I really loved the packaging of this product i...
108     The bottle is so pretty 😍 it's got a bit of a ...
233     🥰❤️It has a unique aroma, I love it, and it is...
445     but compared to my other selection of aftersha...
1287    From the intriguing  packaging topped with a r...
...                                                   ...
110763                       ❤️ 5 starts for me for sure!
110860  Brosse arrivée cassée, à la base de la brosse ...
110903                    Et notre coiffeuse en a aussi 🤩
110958                                Just as described 😎
110987                     very fast delivery excellent 😊

[1049 rows x 1 columns]
Country: United Kingdom
Original : I really loved the packaging of this product it's unique and beautiful 😍.
With Emoji Text: I really loved the packaging of this product it's unique and beautiful smiling face with heart-eyes.
--------------------------------------------------
C

**lemmatization**

In [None]:
!pip install spacy

In [32]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
!pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m46.2 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h

In [33]:
import spacy

# Load the small English and French spaCy pipelines once at cell level;
# lemmatize_text() below selects one of them per call.
nlp_en = spacy.load("en_core_web_sm")
nlp_fr = spacy.load("fr_core_news_sm")

# Lemmatize a text with the language model that matches its country.
def lemmatize_text(text, country):
    """
    Return the lemmas of ``text`` joined by single spaces.

    The French model is used when ``country`` is "France"; every other
    country falls back to the English model. Each token produced by the
    model contributes its ``lemma_`` to the output.
    """
    if country == "France":
        model = nlp_fr
    else:
        model = nlp_en

    lemmas = (token.lemma_ for token in model(text))
    return " ".join(lemmas)

# Spot-check the lemmatizer on a small, reproducible sample of rows.
sample_rows = df.sample(n=5, random_state=42)

for original_text, country in zip(sample_rows['quote_text'], sample_rows['country_name']):
    lemmatized_text = lemmatize_text(original_text, country)

    print(
        f"Country: {country}",
        f"Original : {original_text}",
        f"Lemmatized: {lemmatized_text}",
        "-" * 50,
        sep="\n",
    )


Country: France
Original : Idéal pour créer une variété de coiffures sans tirer ni abîmer les cheveux
Lemmatized: idéal pour créer un variété de coiffure sans tirer ni abîmer le cheveu
--------------------------------------------------
Country: United Kingdom
Original : but i found that this one was SO much better!!
Lemmatized: but I find that this one be so much well ! !
--------------------------------------------------
Country: United Kingdom
Original : It makes it ooze as soon as you uncap it and it makes a mess everywhere.
Lemmatized: it make it ooze as soon as you uncap it and it make a mess everywhere .
--------------------------------------------------
Country: United Kingdom
Original : I am in love with the beauty light wands and this one is no different it’s the perfect highlight shade and gives the most gorgeous glow and pillow talk hue to the skin!!
Lemmatized: I be in love with the beauty light wand and this one be no different it ’ the perfect highlight shade and give the

**create a pipeline**

In [54]:
def process_text_pipeline(text, country):
    text = clean_text(text)
    text = correct_spelling(text, country)
    text = replace_emoji(text, country)
    text = lemmatize_text(text, country)
    return text

In [None]:
# Run the text pipeline over both splits.
#
# process_text_pipeline returns ONE string per row, so its result is stored in
# a single `processed_text` column. Assigning it to two columns
# (['processed_text', 'embedding']) cannot work: the pipeline produces no
# embedding, and the column count would not match the assigned values.
# Embeddings have to be computed in a separate step.
def add_processed_text(frame):
    """
    Return a copy of ``frame`` with a ``processed_text`` column added.

    Each row's ``quote_text`` is run through process_text_pipeline using the
    row's ``country_name``. The input frame is not modified; ``assign``
    builds a new DataFrame.
    """
    processed = frame.apply(
        lambda row: process_text_pipeline(row['quote_text'], row['country_name']),
        axis=1,
    )
    return frame.assign(processed_text=processed)


# Apply pipeline to df_train
df_train = add_processed_text(df_train)

# Apply pipeline to df_test
df_test = add_processed_text(df_test)

# Display sample results
print("Training set sample:")
print(df_train[['quote_text', 'processed_text']].head())

print("\nTest set sample:")
print(df_test[['quote_text', 'processed_text']].head())
