## Install Libraries, Load and Merge Data

In [17]:
!pip install deep_translator langdetect sentence-transformers pdfplumber easyocr pdf2image scikit-learn faiss-cpu



In [18]:
import pandas as pd

# Read both CSV inputs; the listed columns are forced to str.
df_user = pd.read_csv("/content/HCK_HEC_USER.csv", dtype={"USER_ID": str})
df_xp = pd.read_csv(
    "/content/HCK_HEC_XP.csv",
    dtype={"USER_ID": str, "MISSION_DSC": str},
)

# One row per USER_ID: its distinct, non-null MISSION_DSC values joined by newlines.
df_xp_agg = (
    df_xp.groupby("USER_ID")["MISSION_DSC"]
    .agg(lambda descs: "\n".join(descs.dropna().unique()))
    .reset_index()
)

# Left join keeps every row of df_user; users with no match get an empty description.
df_merged = pd.merge(df_user[["USER_ID"]], df_xp_agg, on="USER_ID", how="left")
df_merged["MISSION_DSC"] = df_merged["MISSION_DSC"].fillna("")


## Translation Logic

In [19]:
from deep_translator import GoogleTranslator
from nltk.tokenize import sent_tokenize
from concurrent.futures import ThreadPoolExecutor
import nltk

# Fetch the tokenizer data under both resource names, without console output.
for _resource in ('punkt', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# Module-level translator used by translate_chunk: source language is
# auto-detected, output language is English.
translator = GoogleTranslator(source='auto', target='en')

def force_split_text(text, max_len):
    """Cut ``text`` into consecutive slices of at most ``max_len`` characters.

    Slices are taken at fixed offsets with no regard for word or sentence
    boundaries; the final slice may be shorter. An empty ``text`` gives ``[]``.
    """
    pieces = []
    for start in range(0, len(text), max_len):
        pieces.append(text[start:start + max_len])
    return pieces

def translate_chunk(chunk):
    """Translate one text chunk with the module-level ``translator``; never raises
    for translation failures.

    Args:
        chunk: Text to translate.

    Returns:
        The translated string, or ``chunk`` unchanged when translation raises
        or does not produce a string.
    """
    import copy
    import logging

    try:
        # This function is run on several worker threads at once. Give each
        # call a private translator copy so no mutable per-request state is
        # shared between threads.
        # NOTE(review): deep_translator's GoogleTranslator appears to store the
        # outgoing text on the instance before sending the request -- confirm
        # against the installed version.
        translated = copy.deepcopy(translator).translate(chunk)
    except Exception as exc:
        # Deliberate best-effort: keep the source text, but leave a trace so
        # untranslated rows can be noticed instead of silently slipping through.
        logging.getLogger(__name__).warning(
            "Translation failed; keeping the original text (%r)", exc
        )
        return chunk
    # Fall back to the input if the backend yields None or another non-string,
    # because callers join the results as text.
    return translated if isinstance(translated, str) else chunk

def safe_translate_parallel(text, max_chunk_size=4500, max_workers=4, language='french'):
    """Translate ``text`` by splitting it into size-bounded chunks translated in parallel.

    Sentences are packed greedily, in their original order, into chunks of at
    most ``max_chunk_size`` characters. A single sentence longer than the limit
    is hard-split by ``force_split_text``. Chunks are passed to
    ``translate_chunk`` on a thread pool and the results are joined with single
    spaces.

    Args:
        text: Text to translate. Non-strings and blank strings yield ``''``.
        max_chunk_size: Maximum number of characters per chunk.
        max_workers: Size of the thread pool.
        language: Language name handed to ``sent_tokenize`` for sentence
            splitting (defaults to the previously hard-coded ``'french'``).

    Returns:
        The chunk translations joined with single spaces, or ``''`` when there
        is nothing to translate.
    """
    if not isinstance(text, str) or not text.strip():
        return ''

    chunks, current_chunk = [], ''
    for sentence in sent_tokenize(text, language=language):
        sentence = sentence.strip()
        if not sentence:
            continue
        if len(sentence) > max_chunk_size:
            # Flush pending text first; otherwise the oversized sentence's
            # pieces would be emitted ahead of the text that precedes it.
            if current_chunk:
                chunks.append(current_chunk)
                current_chunk = ''
            chunks.extend(force_split_text(sentence, max_chunk_size))
            continue
        candidate = f"{current_chunk} {sentence}".strip()
        if len(candidate) > max_chunk_size:
            chunks.append(current_chunk)
            current_chunk = sentence
        else:
            current_chunk = candidate
    if current_chunk:
        chunks.append(current_chunk)

    if not chunks:
        return ''

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map yields results in the same order as the input chunks.
        results = list(executor.map(translate_chunk, chunks))

    # Guard the join: if a worker produced a non-string, keep the source chunk.
    return ' '.join(
        translated if isinstance(translated, str) else original
        for original, translated in zip(chunks, results)
    )



In [20]:
# Translate every merged description to English, one row at a time. Chunks whose
# translation raises are left as their original text by translate_chunk.
df_merged['MISSION_DSC_ENGLISH'] = df_merged['MISSION_DSC'].apply(safe_translate_parallel)

## Preprocessing the Description Column

In [21]:
import re
import unicodedata

def preprocess_for_embedding(text):
    """Normalize text before sentence splitting and embedding.

    Steps: NFKC normalization, non-breaking spaces to plain spaces, removal of
    control characters other than tab and newline, collapsing of space/tab runs
    to one space, and stripping of leading/trailing whitespace. Newlines inside
    the text are kept.

    Args:
        text: Value to clean.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    text = unicodedata.normalize('NFKC', text)

    # NFKC already maps U+00A0 to a plain space; kept as an explicit safeguard.
    text = text.replace('\xa0', ' ')

    # Drop control characters but keep \t (\x09) and \n (\x0A). Tabs must survive
    # this step so the collapse below turns them into a space; deleting them
    # outright would fuse the words on either side ("a\tb" -> "ab").
    text = re.sub(r'[\x00-\x08\x0B-\x1F\x7F]', '', text)

    # Collapse runs of spaces/tabs (not newlines) into a single space
    text = re.sub(r'[ \t]+', ' ', text)

    # Strip leading and trailing whitespace; interior newlines are preserved
    return text.strip()


In [22]:
# Clean the translated text; this overwrites MISSION_DSC_ENGLISH with the normalized version.
df_merged['MISSION_DSC_ENGLISH'] = df_merged['MISSION_DSC_ENGLISH'].apply(preprocess_for_embedding)

## Generating and Storing the Embeddings

In [23]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer


# Split each English description into a list of sentences
# (sent_tokenize is called with its default language).
df_merged['SENTENCES'] = df_merged['MISSION_DSC_ENGLISH'].apply(
    lambda description: sent_tokenize(description.strip())
)

# Sentence-embedding model used by embed_sentences
model = SentenceTransformer('all-MiniLM-L6-v2')

# Function to embed list of sentences
def embed_sentences(sent_list):
    """Encode sentences with the module-level ``model`` and L2-normalize each vector.

    Args:
        sent_list: List of sentence strings; may be empty.

    Returns:
        A list with one embedding (a list of floats) per sentence, each scaled
        to unit L2 norm, or ``[]`` when ``sent_list`` is empty. An all-zero
        embedding is returned as zeros rather than NaN.
    """
    if not sent_list:
        return []
    embs = model.encode(sent_list, convert_to_numpy=True)
    norms = np.linalg.norm(embs, axis=1, keepdims=True)
    # Avoid 0/0 -> NaN rows: dividing a zero vector by 1 leaves it as zeros.
    norms[norms == 0] = 1
    return (embs / norms).tolist()


In [24]:
# Encode each row's sentence list; rows with no sentences get an empty list.
df_merged['EMBEDDING'] = df_merged['SENTENCES'].apply(embed_sentences)

## Running all functions

## Save the Output for Later Use (or Replace This Step with a `CREATE TABLE`)

In [25]:
# Persist the whole frame (including the nested-list EMBEDDING column) as a pickle.
# NOTE: only unpickle this file from a trusted location; loading a pickle can execute code.
df_merged.to_pickle("/content/mission_df_sentence_embeddings.pkl")
