In [1]:
!pip install transformers datasets torch
!pip install -q accelerate -U
!pip install datasets




#### Finetunig de GPT2

In [2]:
import pandas as pd
import numpy as np
import re
import os

In [3]:
from datasets import Dataset, DatasetDict

# Chemin vers votre dataset local
file_path = 'fine-tuning data.csv'

# Charger le dataset
df = pd.read_csv(file_path)

# Combiner la question et la réponse pour en faire une seule colonne
df['text'] = "Question: " + df['question'] + " Réponse: " + df['answer']

# Supprimer les colonnes inutiles
df = df[['text']]

# Créer un Dataset Huggingface à partir du dataframe
dataset = Dataset.from_pandas(df)

# Diviser le dataset en ensembles d'entraînement et de test (80% train, 20% test)
train_test_split = dataset.train_test_split(test_size=0.2)

# Accéder aux ensembles d'entraînement et de test
data_train = train_test_split['train']
data_test = train_test_split['test']

# Créer un DatasetDict pour stocker les ensembles
dataset_dict = DatasetDict({'train': data_train, 'test': data_test})

# Convertir les ensembles en DataFrames Pandas pour visualiser les données
train_df = data_train.to_pandas()
test_df = data_test.to_pandas()

# Visualiser les premières lignes de l'ensemble d'entraînement
print("Ensemble d'entraînement :")
print(train_df.head())

# Visualiser les premières lignes de l'ensemble de test
print("Ensemble de test :")
print(test_df.head())


Ensemble d'entraînement :
                                                text
0  Question: What should you do after rotating th...
1  Question: What should be done if corrosion is ...
2  Question: What should be done when re-connecti...
3  Question: What type of screw is item 11 in the...
4  Question: Can the cigar lighter socket be used...
Ensemble de test :
                                                text
0  Question: What should be done to the rear axle...
1  Question: What is the displacement of the Deut...
2  Question: What should be done annually or ever...
3  Question: What safety measures should be taken...
4  Question: How do you shift gears? Réponse: Shi...


In [4]:
# Écrire les données d'entraînement dans 'Q_A_train.txt'
with open('Q_A_train.txt', 'w') as file:
    for _, row in train_df.iterrows():
        text = row['text']
        file.write(f"{text}\n\n")

In [5]:
# Write the test data to 'Q_A_test.txt'
with open('Q_A_test.txt', 'w') as file:
    for row in data_test:
        text = row['text']
        file.write(f"{text}\n\n")

In [6]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments

In [7]:
def load_dataset(file_path, tokenizer, block_size = 128):
    dataset = TextDataset(
        tokenizer = tokenizer,
        file_path = file_path,
        block_size = block_size,
    )
    return dataset

In [8]:
def load_data_collator(tokenizer, mlm = False):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=mlm,
    )
    return data_collator

In [9]:
from transformers import Trainer, TrainingArguments
def train(train_file_path,model_name,
          output_dir,
          overwrite_output_dir,
          per_device_train_batch_size,
          num_train_epochs,
          save_steps):
  tokenizer = GPT2Tokenizer.from_pretrained(model_name)
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  tokenizer.save_pretrained(output_dir)

  model = GPT2LMHeadModel.from_pretrained(model_name)

  model.save_pretrained(output_dir)

  training_args = TrainingArguments(
          output_dir=output_dir,
          overwrite_output_dir=overwrite_output_dir,
          per_device_train_batch_size=per_device_train_batch_size,
          num_train_epochs=num_train_epochs,
      )

  trainer = Trainer(
          model=model,
          args=training_args,
          data_collator=data_collator,
          train_dataset=train_dataset,
  )

  trainer.train()
  trainer.save_model()
  #dakshi kamel kayn fl folder : chat_model

In [10]:
train_file_path = "Q_A_train.txt"
model_name = 'gpt2'
overwrite_output_dir = False
per_device_train_batch_size = 8
num_train_epochs = 60.0
save_steps = 50000
output_dir = 'Chat_Model/'

In [11]:
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps
)



Step,Training Loss
500,2.0013
1000,1.5314
1500,1.2858
2000,1.0978
2500,0.9447
3000,0.8229
3500,0.7263
4000,0.645
4500,0.5801
5000,0.5275


In [12]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer

In [13]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Fonction pour charger le modèle
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    model.to("cuda")  # Assurez-vous que le modèle est sur le GPU si disponible
    return model

# Fonction pour charger le tokenizer
def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

# Fonction pour générer du texte
def generate_text(model_path, sequence, max_length):
    # Charger le modèle et le tokenizer
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Tokenizer la séquence d'entrée
    ids = tokenizer.encode(sequence, return_tensors='pt').to("cuda")  # Assurez-vous que les tensors sont sur le GPU

    # Générer le texte
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=tokenizer.eos_token_id,
        top_k=50,
        top_p=0.95,
    )

    # Décoder les prédictions
    generated_text = tokenizer.decode(final_outputs[0], skip_special_tokens=True)

    # Extraire uniquement la réponse
    # Supposons que la réponse commence après le mot "Réponse:"
    answer_start = generated_text.find("Réponse:") + len("Réponse:")
    answer_text = generated_text[answer_start:].strip()

    # Extraire la réponse jusqu'à la fin ou jusqu'à la prochaine question
    next_question_index = answer_text.find("Question:")
    if next_question_index != -1:
        answer_text = answer_text[:next_question_index].strip()

    return answer_text

# Définir la séquence de question
sequence = "What could happen if the machine is operated or maintained improperly?"
max_len = 50
model_path = "Chat_Model"  # Chemin vers votre modèle fine-tuné

# Générer le texte
generated_text = generate_text(model_path, sequence, max_len)
print(generated_text)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Death or serious injury could result.


In [14]:
sequence = "Question: What is the purpose of this manual?"
max_len = 50
model_path = "Chat_Model"  
generated_text = generate_text(model_path, sequence, max_len)
print(generated_text)

To provide operation and maintenance information for the SW405K wheel loader.


In [15]:
sequence = "What is the phone number of SANY ?"
max_len = 50
model_path = "Chat_Model"  


generated_text = generate_text(model_path, sequence, max_len)
print(generated_text)


470-552-SANY (7269)


In [15]:
sequence = "Who should you send the completed form to?"
max_len = 50
model_path = "Chat_Model"  # Chemin vers votre modèle fine-tuné

# Générer le texte
generated_text = generate_text(model_path, sequence, max_len)
print(generated_text)


A SANY dealer should be contacted.


#### Helsinki from English to darija

In [16]:
from transformers import pipeline

# Create the pipeline and specify device=0 to use the first GPU
pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Now the model will run on GPU
result = pipe("Translate: Hello")
print(result)




[{'generated_text': 'ترجم: سلام'}]


In [17]:
# Test the model with some English text
text = "Hello, how are you today?"

# Generate translation from English to Darija (Moroccan Arabic)
translation = pipe(text)

# Print the translated text
print(translation[0]['generated_text'])

سلام, كيداير اليوم?


In [18]:
# Input Darija text
darija_text = "I am tired"

# Try to generate the English translation
translation = pipe(darija_text)

# Print the translated txt (if it works)
print(translation[0]['generated_text'])

عيّيت


### Merging of GPT2 and Helsinki from enlish to darija

In [19]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load both models and tokenizers
gpt2_model = GPT2LMHeadModel.from_pretrained("Chat_Model").to("cuda")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("Chat_Model")
translation_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Function to generate answers and translate them
# Function to generate answers and translate them
def handle_input(input_text):
    # Step 1: Generate the answer using GPT-2
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt').to("cuda")
    
    # Generate the answer (note: `stop_sequence` removed)
    output = gpt2_model.generate(
        input_ids, 
        max_length=50, 
        pad_token_id=gpt2_tokenizer.eos_token_id, 
        num_return_sequences=1, 
        early_stopping=True  # Enable early stopping to reduce over-generation
    )
    generated_answer = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer after "Réponse:" without including unintended extra questions
    answer_start = generated_answer.find("Réponse:") + len("Réponse:")
    answer_text = generated_answer[answer_start:].split("Question:")[0].strip()  # Ensure splitting at the next "Question:"

    # Step 2: Translate the original question and the generated answer into Darija
    translated_question = translation_pipe(input_text)[0]['generated_text']
    translated_answer = translation_pipe(answer_text)[0]['generated_text']
    
    # Return both the original answer and the translated version
    return {
        "answer": answer_text,
        "translated_question": translated_question,
        "translated_answer": translated_answer
    }


In [20]:
# Example usage:
user_input = "What is the purpose of this manual?"
result = handle_input(user_input)

print("Answer:", result["answer"])
print("Translated Question:", result["translated_question"])
print("Translated Answer:", result["translated_answer"])




Answer: To provide operation and maintenance information for the SW405K wheel loader.
Translated Question: شنو لهدف من هاد لكتيّاب?
Translated Answer: باش نوفرو المعلومات ديال التشغيل و الصيانة للوادر SW405K.


In [20]:
# Example usage:
user_input = "Who should you send the completed form to?"
result = handle_input(user_input)

print("Answer:", result["answer"])
print("Translated Question:", result["translated_question"])
print("Translated Answer:", result["translated_answer"])




Answer: SANY
Translated Question: علامن خسّك توسّل الاستمارة كاملة?
Translated Answer: ساني


In [22]:
# Example usage:
user_input = "What is the phone number of SANY ?"
result = handle_input(user_input)

print("Answer:", result["answer"])
print("Translated Question:", result["translated_question"])
print("Translated Answer:", result["translated_answer"])


Answer: 470-552-SANY (7269)
Translated Question: شنو نمرة ديال ساني?
Translated Answer: 470-552-ساني (7269)


### Helsinki from Darija to english

In [21]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2")

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [22]:
# Translate the word "salam" from Darija to English
translation = pipe("salam")

# Output the result
print(translation[0]['translation_text'])


hello


In [25]:
# Translate the word "salam" from Darija to English
translation = pipe("سلام كيداير لباس عليك")

# Output the result
print(translation[0]['translation_text'])



hey how are you


In [23]:
# Translate the word "salam" from Darija to English
translation = pipe("شنو لهدف من هاد لكتاب" )

# Output the result
print(translation[0]['translation_text'])

what is the purpose of this book


### Final Merging of all the models

In [24]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load models and tokenizers
gpt2_model = GPT2LMHeadModel.from_pretrained("Chat_Model").to("cuda")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("Chat_Model")

# Pipeline for Darija to English translation
darija_to_english_pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2", device=0)

# Pipeline for English to Darija translation
english_to_darija_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Function to handle input in Darija
def handle_input_darija(input_text):
    # Step 1: Translate the input question from Darija to English
    translated_question = darija_to_english_pipe(input_text)[0]['translation_text']
    
    # Step 2: Generate the answer using GPT-2
    input_ids = gpt2_tokenizer.encode(f"Question: {translated_question}", return_tensors='pt').to("cuda")
    
    output = gpt2_model.generate(
        input_ids, 
        max_length=50, 
        pad_token_id=gpt2_tokenizer.eos_token_id, 
        num_return_sequences=1, 
        early_stopping=True
    )
    generated_answer = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer after "Réponse:" and clean extra questions
    answer_start = generated_answer.find("Réponse:") + len("Réponse:")
    answer_text = generated_answer[answer_start:].split("Question:")[0].strip()

    # Step 3: Translate the generated answer from English back to Darija
    translated_answer = english_to_darija_pipe(answer_text)[0]['generated_text']
    
    # Return the original question and translated answer
    return {
        "original_question_in_darija": input_text,
        "translated_question_in_english": translated_question,
        "generated_answer_in_english": answer_text,
        "translated_answer_in_darija": translated_answer
    }



In [25]:
# Example usage
darija_input = "شنو الهدف من هاد لكتاب؟"  # "What is the purpose of this manual?" in Darija
result = handle_input_darija(darija_input)

print("Original Question (Darija):", result["original_question_in_darija"])
print("Translated Question (English):", result["translated_question_in_english"])
print("Generated Answer (English):", result["generated_answer_in_english"])
print("Translated Answer (Darija):", result["translated_answer_in_darija"])


Original Question (Darija): شنو الهدف من هاد لكتاب؟
Translated Question (English): What's the goal of a book?
Generated Answer (English): To provide an overview of the subject matter and set the foundation for new information.
Translated Answer (Darija): باش تعطي لمحات على الموضوع وترسي الأساس لماعلومات الجديدة.


In [29]:
# Example usage
darija_input = "شنو رقم الهاتف ديال ساني؟"  # "What is the purpose of this manual?" in Darija
result = handle_input_darija(darija_input)

print("Original Question (Darija):", result["original_question_in_darija"])
print("Translated Question (English):", result["translated_question_in_english"])
print("Generated Answer (English):", result["generated_answer_in_english"])
print("Translated Answer (Darija):", result["translated_answer_in_darija"])

Original Question (Darija): شنو رقم الهاتف ديال ساني؟
Translated Question (English): What's the phone number for Sanne?
Generated Answer (English): 470-552-SANY (7269)
Translated Answer (Darija): 470-552-ساني (7269)


### USER INTERFACE

In [None]:
pip install ipywidgets


In [1]:
import ipywidgets as widgets
from IPython.display import display
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Load models and tokenizers
gpt2_model = GPT2LMHeadModel.from_pretrained("Chat_Model").to("cuda")
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("Chat_Model")

# Pipeline for Darija to English translation
darija_to_english_pipe = pipeline("translation", model="lachkarsalim/LatinDarija_English-v2", device=0)

# Pipeline for English to Darija translation
english_to_darija_pipe = pipeline("text2text-generation", model="lachkarsalim/Helsinki-translation-English_Moroccan-Arabic", device=0)

# Function to handle input in Darija
def handle_input_darija(input_text):
    # Translate the input question from Darija to English
    translated_question = darija_to_english_pipe(input_text)[0]['translation_text']

    # Generate the answer using GPT-2
    input_ids = gpt2_tokenizer.encode(f"Question: {translated_question}", return_tensors='pt').to("cuda")
    output = gpt2_model.generate(
        input_ids,
        max_length=50,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        num_return_sequences=1,
        early_stopping=True
    )
    generated_answer = gpt2_tokenizer.decode(output[0], skip_special_tokens=True)

    # Extract the answer after "Réponse:"
    answer_start = generated_answer.find("Réponse:") + len("Réponse:")
    answer_text = generated_answer[answer_start:].split("Question:")[0].strip()

    # Translate the generated answer from English back to Darija
    translated_answer = english_to_darija_pipe(answer_text)[0]['generated_text']

    return translated_answer

# Create a text input box for Darija question
input_box = widgets.Text(
    value='',
    placeholder='Enter your question in Darija',
    description='Darija Question:',
    disabled=False
)

# Create an output display
output_box = widgets.Output()

# Function to process the input when the button is clicked
def on_button_click(b):
    with output_box:
        output_box.clear_output()  # Clear the previous output
        darija_question = input_box.value
        if darija_question:
            with output_box:
                print("Processing... Please wait.")
            answer = handle_input_darija(darija_question)
            with output_box:
                output_box.clear_output()
                print(f"Answer in Darija: {answer}")

# Create a button to trigger the processing
button = widgets.Button(
    description='Submit',
    disabled=False,
    button_style='success',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Submit your question',
    icon='check'
)

# Attach the button click event to the processing function
button.on_click(on_button_click)

# Display the input box, button, and output
display(input_box)
display(button)
display(output_box)




Text(value='', description='Darija Question:', placeholder='Enter your question in Darija')

Button(button_style='success', description='Submit', icon='check', style=ButtonStyle(), tooltip='Submit your q…

Output()