In [11]:
import importlib
import data_preprocessing  # reload() needs an already-imported module object
# Re-execute data_preprocessing.py so that edits made on disk take effect
# without restarting the kernel. Names previously bound with
# `from data_preprocessing import ...` are NOT refreshed by this call; that
# import statement has to be re-run afterwards.
importlib.reload(data_preprocessing)  # returns the module, shown as the cell output

<module 'data_preprocessing' from '/home/onyxia/work/data_preprocessing.py'>

In [None]:
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
!pip install https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl


In [12]:
import pandas as pd
import numpy as np
import data_preprocessing
from data_preprocessing import clean_text, correct_spelling, replace_emoji

In [13]:
from sklearn.model_selection import train_test_split

# Read the full dataset, then hold out 20% of the rows as the test split.
# The fixed random_state keeps the split identical across runs.
df = pd.read_parquet("dataset.parquet")
df_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Text preprocessing for SBERT input; lemmatization is skipped for SBERT.
def process_text_pipeline(text, country):
    """Run one quote through the preprocessing steps used before SBERT encoding.

    Applies, in order, ``clean_text``, ``correct_spelling`` and
    ``replace_emoji`` (all imported from ``data_preprocessing``). The last two
    steps also receive ``country``.

    Args:
        text: Raw quote text.
        country: Country name forwarded to the spelling and emoji steps
            (presumably used to pick the language -- confirm in
            ``data_preprocessing``).

    Returns:
        The value returned by ``replace_emoji`` for the processed text.
    """
    cleaned = clean_text(text)
    spell_checked = correct_spelling(cleaned, country)
    return replace_emoji(spell_checked, country)

def _process_quotes(frame):
    """Return the preprocessed text for every row of ``frame``, in row order.

    Each row's ``quote_text`` and ``country_name`` are passed to
    ``process_text_pipeline``. Only these two columns are iterated (via
    ``zip``) instead of materialising a full Series per row with
    ``iterrows``, which is slower and not needed here.

    Args:
        frame: DataFrame with ``quote_text`` and ``country_name`` columns.

    Returns:
        A list with one processed text per row of ``frame``.
    """
    return [
        process_text_pipeline(text, country)
        for text, country in zip(frame['quote_text'], frame['country_name'])
    ]


# Apply the same preprocessing to both splits through one helper so the
# train and test code paths cannot drift apart.
df_train['processed_text'] = _process_quotes(df_train)
df_test['processed_text'] = _process_quotes(df_test)

In [17]:
!pip install sentence_transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.6/766.6 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl (363.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hDownloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [3

In [18]:
# Sentence-BERT (SBERT) setup
from sentence_transformers import SentenceTransformer

# Name of the pre-trained checkpoint handed to SentenceTransformer.
SBERT_MODEL_NAME = "paraphrase-multilingual-MiniLM-L12-v2"

# Instantiate the model once; the embedding helper below reuses this object.
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

def get_sbert_embedding(text):
    """Encode ``text`` with the module-level ``sbert_model``.

    Args:
        text: Input forwarded unchanged to ``sbert_model.encode``.

    Returns:
        Whatever ``sbert_model.encode`` returns for ``text``.
    """
    embedding = sbert_model.encode(text)
    return embedding

def _embed_processed_texts(frame):
    """Return one SBERT embedding per row of ``frame``, in row order.

    All values of the ``processed_text`` column are handed to
    ``get_sbert_embedding`` in a single call, so the model can batch them
    instead of being invoked once per row.

    NOTE(review): batched encoding is expected to match per-row encoding up
    to floating-point noise -- confirm if bit-exact reproducibility with
    earlier runs matters.

    Args:
        frame: DataFrame with a ``processed_text`` column.

    Returns:
        A list with one embedding vector per row of ``frame``.
    """
    texts = frame['processed_text'].tolist()
    embeddings = get_sbert_embedding(texts)
    # Split the batch result into one vector per row so that each DataFrame
    # cell holds a single embedding, as with the previous per-row loop.
    return list(embeddings)


# Embed both splits through the same helper.
df_train['sbert_embedding'] = _embed_processed_texts(df_train)
df_test['sbert_embedding'] = _embed_processed_texts(df_test)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# CSV export kept unchanged for any existing consumer. Note that CSV stores
# array-valued cells (the embedding column) as their string representation,
# which is lossy and has to be re-parsed by hand on load.
df_train.to_csv('df_train_sbert.csv')
df_test.to_csv('df_test_sbert.csv')

# Also persist to Parquet so the embedding vectors can be loaded back as
# numeric arrays.
# NOTE(review): assumes the installed Parquet engine accepts array-valued
# cells (pyarrow does) -- confirm for this environment.
df_train.to_parquet('df_train_sbert.parquet')
df_test.to_parquet('df_test_sbert.parquet')