In [None]:
# imports
import pandas as pd
from langdetect import detect
import openai
import os
import time # Used to pause the API call function to avoid exceeding rate limit
import json
import math
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, \
    M2M100ForConditionalGeneration, M2M100Tokenizer

STEP BY STEP:
- Create pipeline comprising: data preprocess for desired number of rows and columns
- Creation of dataframe with reviews translated as rows:
    - Gold Standard column: call to OpenAI API gpt-3.5-turbo-0125 notebook
    - Helsinki model column: call to Helsinki OPUS-MT notebook
    - Facebook model column: call to Facebook NLLB-200-distilled-600M notebook
    - Facebook model column: call to Facebook M2M100-418M notebook
- Evaluation of models' performance:
    - Call to three different notebooks with three metrics.

Columns of non_en dataset: product_name, review_title, review_text, review_rating, language

In [2]:
# Step 1: Data Preprocessing
def preprocess_data(filename, cols):
    """
    Preprocesses the input data.
    
    Args:
    - filename (str): Path to the input CSV file.
    - cols (list of str): Selection of columns from the input data.
    
    Returns:
    - pd.DataFrame: non English dataframe.
    """

    df = pd.read_csv(filename, delimiter=',', index_col=None, header=0)
    df = df[cols].copy()

    # Apply detect function with exception handling
    def detect_language(text):
        try:
            return detect(text)
        except Exception:
            return None
    # Create df column using a langdetect library to detect language of input text
    df['language'] = df['review_text'].apply(lambda text : detect_language(text) if pd.notnull(text) else None)

    # For better understanding
    distinct_lang = df.groupby('language', as_index=False).count()
    print(f"Number of distinct languages in reduced dataframe: {len(distinct_lang['language'])}")
    print(distinct_lang['language'])
    
    # Separe english and non-english texts
    df_non_en = df[df['language'] != 'en']

    return df, df_non_en
    

In [51]:
# Step 2: Translation using OpenAI API gpt-3.5-turbo-0125 (our Gold Standard model)
def translate_openaigpt(reviews, langs):
    """
    Translates reviews using the OpenAI API gpt-3.5-turbo-0125 model.
    Makes use of subfunctions.

    Args:
    - reviews (list): List of reviews to translate.
    - langs (list): list of reviews' languages.
    
    Returns:
    - list: Translated reviews.
    """

    # Read API key file
    f = open('../APIopenAI.txt','r')
    api_key = f.read()

    # Function makes OpenAI API call
    def batch_translation(batch_texts):
        '''
        Params: 
            batch_texts is an array of texts (str) that need to be translated. Works as prompt.
            batch_size the number of texts contained in the array.
        Function:
            Make an openai API call with instructions to translate to English all text within the array.
        Returns: response array in JSON format.
        '''

        # General system instructions
        system_instructions = f"You will be provided with an array of texts. You have to translate to \
            English the full text. Reply with all full completions in JSON format. The output format \
            should follow the next conditions:  \
            JSON dictionary have as key translations and have as value another dictionary, this second \
            dictionary will have as key the <original text given by user> and as values the \
            <translated text you generated>. Output format example: <\'translations\': \
            <original text 1: translated text1, original text 2: translated text 2, ...>>"
            
        # Call API only for selected texts
        response = openai.OpenAI(api_key=api_key).chat.completions.create(
            model="gpt-3.5-turbo-0125",
            response_format={ "type": "json_object" },
            messages=[
                {"role": "system", "content": system_instructions},
                {"role": "user", "content": batch_texts}
            ],
            #max_tokens=128,  # Increase max_tokens to retrieve more than one token
            n=1,
            stop=None
        )
        print(f"HEY HEY this is the gpt response {response.choices[0].message.content}\n")
        # Response is in JSON format
        return response.choices[0].message.content, response.usage.prompt_tokens, response.usage.completion_tokens

    # Function creates a set of translated batches
    def review_translation(input_col):
        '''
        Main function. 
        Returns translated texts' list.
        '''
        tokens = 0
        # Call function with API call, returns an array of translated text
        trans_json, prompt_tokens, completion_tokens = batch_translation(str(reviews))
        trans_json = json.loads(trans_json)
        tokens += prompt_tokens
        tokens +=completion_tokens
        print(f"tokens used are now {tokens}\n")
        # Transform JSON dict to list of texts
        trans_text = list(trans_json['translations'].values())
        return trans_text

    # Main function call for columns to translate
    batch_set = review_translation(reviews)
    return batch_set

In [33]:
# Step 3: Translation using Helsinki model
def translate_helsinki(non_en_data):
    """
    Translates reviews using the Helsinki/OPUS-MT models.
    
    Args:
    - reviews (list): List of reviews to translate.
    
    Returns:
    - list: Translated reviews.
    """

    # Group by language the non English dataframe
    grouped_data = non_en_data.groupby(['language']).apply(lambda x: x.sort_values(['language'], ascending=True))
    grouped_data_counts = non_en_data.groupby(['language']).size().reset_index(name='counts')
    print(f"ordered by language {grouped_data.head(5)}")
    
    # Boolean function definition to select romance languages
    def languages_contain(language):
        languages = ['it', 'ca', 'rm', 'es', 'ro', 'gl', 'co', 'wa', 'pt', 'oc', 'an', 'id', 'fr', 'ht', 'roa', 'en']
        return language in languages

    # Initialize dictionary to store translated texts by language
    translated_texts_by_language = {}

    # Loop through all texts to be translated
    for i in range(len(grouped_data)):
        text = grouped_data['review_title'].iloc[i]
        language = grouped_data['language'].iloc[i]
        
        if languages_contain(language):
            model_checkpoint = 'Helsinki-NLP/opus-mt-roa-en'
        else:
            model_checkpoint = f'Helsinki-NLP/opus-mt-{language}-en'
        
        # Check if the model checkpoint is already loaded for the language
        if language not in translated_texts_by_language:
            # Load the translation model for this language using pipeline
            translator = pipeline("translation", model=model_checkpoint)
            translated_texts_by_language[language] = []
            print(f"model is {model_checkpoint}\n")

        # Translate the text using the loaded model
        translation = translator(text)
        translated_text = translation[0]['translation_text']
        translated_texts_by_language[language].append(translated_text)

        # If all texts in this language have been translated, delete the model from memory
        if len(translated_texts_by_language[language]) == grouped_data_counts['counts'].iloc[i]:
            del translator

    return translated_texts_by_language


In [None]:
# Step 4: Translation using Facebook Facebook M2M100-1.2B
def translate_facebook_m2m(reviews, langs):
    """
    Translates reviews using the Facebook model M2M100-1.2B.
    
    Args:
    - reviews (list): List of reviews to translate.
    
    Returns:
    - list: Translated reviews.
    """

    # Initialize lists to store translated texts
    translated_texts = []

    model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_1.2B")
    tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_1.2B")
    for i in range(len(reviews)):
        tokenizer.src_lang = langs[i]
        encoded_text = tokenizer(reviews[i], return_tensors="pt")
        generated_tokens = model.generate(**encoded_text, forced_bos_token_id=tokenizer.get_lang_id("en"))
        translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        # Append translated text to the list
        translated_texts.append(translated_text)

    return translated_texts
        

In [None]:
# Step 5: Translation using Facebook NLLB-200-distilled-600M
def translate_facebook_nllb(reviews):
    """
    Translates reviews using the Facebook model NLLB-200-distilled-600M.
    
    Args:
    - reviews (list): List of reviews to translate.
    
    Returns:
    - list: Translated reviews.
    """
    
    # Read the token from the JSON file
    with open("..\.huggingface\config.json", "r") as file:
        token_data = json.load(file)

    # Extract the token value
    huggingface_token = token_data["huggingface_token"]

    # Define the dictionary mapping languages to language codes (provisional)
    language_code_dict = {
        'en': 'eng_Latn',
        'it': 'ita_Latn',
        'es': 'spa_Latn',
        'fr': 'fra_Latn',
        'de': 'deu_Latn',
        'ja': 'jpn_Japn',
        'tr': 'tur_Latn',
        'pt': 'por_Latn'
    }

    def read_texts_and_language_codes(df):
        # Initialize lists to store texts and language codes
        texts = []
        language_codes = []
        
        # Iterate over rows of the dataframe
        for index, row in df.iterrows():
            # Append text to the list
            texts.append(row['review_title'])
            
            # Map language code to language code from dictionary and append to the list
            language_codes.append(language_code_dict[row['language']])
        
        return texts, language_codes
    
    texts, language_codes = read_texts_and_language_codes(df_non_en)
 
    # Definition of general variables for all
    # Define the model checkpoint
    model_checkpoint = "facebook/nllb-200-distilled-600M"

    # Initialize model
    model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, token=huggingface_token)

    # Initialize lists to store translated texts
    translated_texts = []

    # Iterate through the texts list
    i = 0 # Temporal iterator for accessing languages_codes list
    for text in texts:
        # Define source language from language codes list
        src_lang = str(language_codes[i])

        # Initialize tokenizer for input language
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, token=huggingface_token, src_lang=src_lang)

        # Tokenize the text
        inputs = tokenizer(text, return_tensors="pt")
        
        # Generate translation
        translated_tokens = model.generate(
            **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["eng_Latn"], max_length=30
        )
        
        # Decode translated tokens
        translated_text = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
        
        # Append translated text to the list
        translated_texts.append(translated_text)

    # Print translated texts
    for original, translated in zip(texts, translated_texts):
        print("Original Text:", original)
        print("Translated Text:", translated)
        print()

    return translated_texts

In [None]:
# Step 5: Evaluation of models' performance
def evaluate_performance(bleu, bertscore, geval):
    """
    Evaluates the performance of models using three metrics.
    
    Args:
    - metric1_result (float): Result of metric 1.
    - metric2_result (float): Result of metric 2.
    - metric3_result (float): Result of metric 3.
    
    Returns:
    - dict: Performance evaluation metrics.
    """
    # Your evaluation code here
    pass


In [52]:
%%time
# MAIN PIPELINE
# Input: raw csv dataset of shape (6823, 11)
# Steps: 
#       Create dataframe of shape (6823, 4)
#       Identify row language by creating new column
#       Translate rows using three methods adding new columns
#       Evaluate translations using three metrics
# Output: Evaluation metrics results for each method and metric


# STEP 1: Data Preprocessing
# Load dataset 
filename = './data/raw/amazon_uk_dataset.csv'
cols = ['product_name','review_title','review_text','review_rating']

complete_data, non_en_data = preprocess_data(filename, cols)
print(f"portion of English data is {len(complete_data)} wrt non English {len(non_en_data)}\n")
print(f"preprocessed non English data is {non_en_data.head(10)}\n")
to_translate_col = non_en_data['review_title']
lang_col = non_en_data['language']


# STEP 1.2: Batch creation for the next steps
num_batches = 27 # 3 times the RPM, as 3 is RPM
# Create 'batch_id' column using pd.cut
non_en_data['batch_id'] = pd.cut(non_en_data.index, bins=num_batches, labels=range(1, num_batches + 1))

# STEP 2: Translation using OpenAI GPT-3.5 model (Gold Standard)
goldstd_filename = './data/preprocessed/goldstd_data.csv'

# Check if Gold Standard column has been computed
if not os.path.exists(goldstd_filename):
    # Empty list to fill with OpenAIGPT translations
    translated_openaigpt = []

    for batch_num in range(1, num_batches + 1):
        # Select rows corresponding to the batch number
        df_aux = non_en_data[non_en_data['batch_id'] == batch_num]
        print(f"check new df {len(df_aux)}\n")
        # Apply translation function
        translated_batch = translate_openaigpt(df_aux['review_title'], df_aux['language'])
        print(f"translated batch is {translated_batch}\n")
        translated_openaigpt.extend(translated_batch)

    non_en_data['translated_openaigpt'] = translated_openaigpt
    non_en_data.to_csv(goldstd_filename, index=False)
    print(f"translated df saved!\n")
else:
    non_en_data = pd.read_csv(goldstd_filename, delimiter=',', index_col=None, header=0)

'''   
# STEP 3: Translation using Helsinki model
translated_helsinki = translate_helsinki(to_translate_col, lang_col)
non_en_data['translated_helsinki'] = translated_helsinki

# STEP 4: Translation using Facebook models
translated_fbnllb = translate_facebook_nllb(to_translate_col, lang_col)
non_en_data['translated_facebook_nllb'] = translated_facebook_nllb

# STEP 5: Evaluation of models' performance
# matrix ? 
evaluation_results = evaluate_performance(metric1_result, metric2_result, metric3_result)
'''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


check new df 111

HEY HEY this is the gpt response {
    "translations": {
        "PERFETTE!!": "PERFECT!!",
        "delusione": "Disappointment",
        "Molto belle": "Very beautiful",
        "Molto carine e comode": "Very cute and comfortable",
        "Bellissime....peccato per il numero": "Beautiful....pity about the size",
        "sehr schöner Schuh,": "Very nice shoe,",
        "Super leicht und bequem.": "Super light and comfortable.",
        "Schnelle Lieferung, tolle Ware": "Fast delivery, great product",
        "Schöne Schuhe": "Beautiful shoes",
        "la coincidencia del objeto real y el anunciado": "the coincidence of the real and the advertised object"
    }
}

tokens used are now 404

translated batch is ['PERFECT!!', 'Disappointment', 'Very beautiful', 'Very cute and comfortable', 'Beautiful....pity about the size', 'Very nice shoe,', 'Super light and comfortable.', 'Fast delivery, great product', 'Beautiful shoes', 'the coincidence of the real and the adverti

ValueError: Length of values (270) does not match length of index (2994)

In [65]:
len(translated_openaigpt)

270