In [1]:
import gc

import pandas as pd
from tqdm import tqdm
# Excel workbook that the per-language cells below read their question sheets from.
file_path = "questions.xlsx"

In [2]:
# !pip install transformers==4.49.0

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 
# Seed torch's global random number generator before anything else runs.
torch.manual_seed(0)

# Single identifier shared by the tokenizer load and the model load.
model_path = "microsoft/Phi-4-mini-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# NOTE(review): trust_remote_code=True presumably lets code shipped with the
# checkpoint run locally; confirm it is still required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto",
)

2025-06-04 11:38:14.703994: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Arabic

In [4]:
# Worksheet that holds the Arabic questions (must match the sheet's exact name).
sheet_name = 'arabic'

# Load just that worksheet from the workbook, using the openpyxl engine.
df_a = pd.read_excel(
    io=file_path,
    engine='openpyxl',
    sheet_name=sheet_name,
)

# Plain-text preview of the leading rows.
print(df_a.head(n=5))

   id                         arabic
0   1               كيف تتشكل السحب؟
1   2        كيف انقرضت الديناصورات؟
2   3               ما هو الديناصور؟
3   4  لماذا تختلف ألوان عيون البشر؟
4   5     ما الفرق بين الذئب والكلب؟


In [5]:
# Keep only the question text, under the language-neutral column name "text".
# errors='ignore' makes the cell safe to re-run: once 'id' has been dropped, a
# second execution would otherwise raise KeyError. rename() already skips
# labels that are absent, so the whole statement is idempotent.
df_a = df_a.drop(columns=['id'], errors='ignore').rename(columns={"arabic": "text"})

In [6]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Keyword arguments passed along with every pipe(...) call in generate_answer.
# NOTE(review): temperature is a sampling setting but do_sample is False, so
# it presumably has no effect on decoding; confirm and drop it if so.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

# Function to generate answer for each question
def generate_answer(q):
    """Ask the model a single question and return the text it generates.

    A two-message chat is built: a fixed Arabic system instruction (in
    English: "Answer the following question in five to six sentences.
    Answer the question in Arabic.") followed by ``q`` as the user turn.
    The chat is sent through the module-level ``pipe`` together with the
    module-level ``generation_args``.

    Args:
        q: The question, used verbatim as the user message content.

    Returns:
        The ``'generated_text'`` entry of the first pipeline result. If
        anything in the call or the result lookup raises ``Exception``,
        the string ``"Error: <message>"`` is returned instead, so one
        failing row does not abort a whole ``progress_apply`` pass.
    """
    system_turn = {"role": "system", "content": "أجب عن السؤال التالي في خمس إلى ست جمل. أجب على السؤال باللغة العربية."}
    user_turn = {"role": "user", "content": q}
    try:
        # The lookup stays inside the try so a malformed result is also reported as text.
        first_result = pipe([system_turn, user_turn], **generation_args)[0]
        answer = first_result['generated_text']
    except Exception as err:
        answer = f"Error: {err}"
    return answer

# Make three passes over the Arabic questions, saving each pass to its own CSV.
# NOTE(review): generation_args sets do_sample=False, so the three passes
# presumably yield the same text; confirm that repeating them is intended.
for i in (1, 2, 3):
    # Register progress_apply with a bar captioned for this pass.
    tqdm.pandas(desc=f"Generating phi4mini_arabic_{i}")
    # Work on a copy so df_a never gains the answer column.
    df_copy = df_a.copy()
    # The generated answers are stored under the column name 'classify'.
    df_copy['classify'] = df_copy['text'].progress_apply(generate_answer)
    # utf-8-sig writes a BOM, presumably so spreadsheet tools detect the encoding.
    df_copy.to_csv(f"phi4mini_arabic_{i}.csv", encoding='utf-8-sig', index=False)

Device set to use cuda:0
Generating phi4mini_arabic_1:  28%|██▊       | 11/40 [00:52<01:53,  3.93s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Generating phi4mini_arabic_1: 100%|██████████| 40/40 [04:24<00:00,  6.60s/it]
Generating phi4mini_arabic_2: 100%|██████████| 40/40 [04:23<00:00,  6.59s/it]
Generating phi4mini_arabic_3: 100%|██████████| 40/40 [04:27<00:00,  6.68s/it]


# English

In [7]:
# Collect unreferenced Python objects, then ask torch to release its cached
# CUDA memory before the English runs start. The model itself stays loaded,
# since `model` is still referenced. `gc` is imported in the notebook's first
# import cell rather than re-imported here.
gc.collect()
torch.cuda.empty_cache()

In [8]:
# Worksheet that holds the English questions (must match the sheet's exact name).
sheet_name = 'english'

# Load just that worksheet from the workbook, using the openpyxl engine.
df_e = pd.read_excel(
    io=file_path,
    engine='openpyxl',
    sheet_name=sheet_name,
)

# Plain-text preview of the leading rows.
print(df_e.head(n=5))

   id                                           english
0   1                            How are clouds formed?
1   2             How did the dinosaurs become extinct?
2   3                               What is a dinosaur?
3   4          Why do humans have different eye colors?
4   5  What is the difference between a wolf and a dog?


In [9]:
# Keep only the question text, under the language-neutral column name "text".
# errors='ignore' makes the cell safe to re-run: once 'id' has been dropped, a
# second execution would otherwise raise KeyError. rename() already skips
# labels that are absent, so the whole statement is idempotent.
df_e = df_e.drop(columns=['id'], errors='ignore').rename(columns={"english": "text"})

In [10]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Keyword arguments passed along with every pipe(...) call in generate_answer.
# NOTE(review): temperature is a sampling setting but do_sample is False, so
# it presumably has no effect on decoding; confirm and drop it if so.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

# Function to generate answer for each question
def generate_answer(q):
    """Ask the model a single question and return the text it generates.

    A two-message chat is built: a fixed English system instruction asking
    for a five-to-six sentence answer, followed by ``q`` as the user turn.
    The chat is sent through the module-level ``pipe`` together with the
    module-level ``generation_args``.

    Args:
        q: The question, used verbatim as the user message content.

    Returns:
        The ``'generated_text'`` entry of the first pipeline result. If
        anything in the call or the result lookup raises ``Exception``,
        the string ``"Error: <message>"`` is returned instead, so one
        failing row does not abort a whole ``progress_apply`` pass.
    """
    system_turn = {"role": "system", "content": "Answer the following question in five to six sentences."}
    user_turn = {"role": "user", "content": q}
    try:
        # The lookup stays inside the try so a malformed result is also reported as text.
        first_result = pipe([system_turn, user_turn], **generation_args)[0]
        answer = first_result['generated_text']
    except Exception as err:
        answer = f"Error: {err}"
    return answer

# Make three passes over the English questions, saving each pass to its own CSV.
# NOTE(review): generation_args sets do_sample=False, so the three passes
# presumably yield the same text; confirm that repeating them is intended.
for i in range(1, 4):
    # Work on a copy so df_e never gains the answer column.
    df_copy = df_e.copy()
    # Caption the progress bar with the English run name so it matches the CSV
    # written below (it previously carried the Arabic label).
    tqdm.pandas(desc=f"Generating phi4mini_english_{i}")
    # The generated answers are stored under the column name 'classify'.
    df_copy['classify'] = df_copy['text'].progress_apply(generate_answer)
    # utf-8-sig writes a BOM, presumably so spreadsheet tools detect the encoding.
    df_copy.to_csv(f"phi4mini_english_{i}.csv", index=False, encoding='utf-8-sig')

Device set to use cuda:0
Generating phi4mini_arabic_1: 100%|██████████| 40/40 [02:25<00:00,  3.63s/it]
Generating phi4mini_arabic_2: 100%|██████████| 40/40 [02:25<00:00,  3.63s/it]
Generating phi4mini_arabic_3: 100%|██████████| 40/40 [02:24<00:00,  3.61s/it]


# Hebrew

In [15]:
# Collect unreferenced Python objects, then ask torch to release its cached
# CUDA memory before the Hebrew runs start. The model itself stays loaded,
# since `model` is still referenced. `gc` is imported in the notebook's first
# import cell rather than re-imported here.
gc.collect()
torch.cuda.empty_cache()

In [12]:
# Worksheet that holds the Hebrew questions (must match the sheet's exact name).
sheet_name = 'hebrew'

# Load just that worksheet from the workbook, using the openpyxl engine.
df_h = pd.read_excel(
    io=file_path,
    engine='openpyxl',
    sheet_name=sheet_name,
)

# Plain-text preview of the leading rows.
print(df_h.head(n=5))

   id                             hebrew
0   1                  איך נוצרים עננים?
1   2              איך הדינזוארים נכחדו?
2   3                    מה זה דינוזאור?
3   4  למה יש לבני אדם צבעי עיניים שונים
4   5            מה ההבדל בין זאב לכלב ?


In [13]:
# Keep only the question text, under the language-neutral column name "text".
# errors='ignore' makes the cell safe to re-run: once 'id' has been dropped, a
# second execution would otherwise raise KeyError. rename() already skips
# labels that are absent, so the whole statement is idempotent.
df_h = df_h.drop(columns=['id'], errors='ignore').rename(columns={"hebrew": "text"})

In [14]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
pipe = pipeline(task="text-generation", tokenizer=tokenizer, model=model)

# Keyword arguments passed along with every pipe(...) call in generate_answer.
# NOTE(review): temperature is a sampling setting but do_sample is False, so
# it presumably has no effect on decoding; confirm and drop it if so.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.0,
    do_sample=False,
)

# Function to generate answer for each question
def generate_answer(q):
    """Ask the model a single question and return the text it generates.

    A two-message chat is built: a fixed Hebrew system instruction (in
    English: "Answer the following question in five to six sentences.")
    followed by ``q`` as the user turn. The chat is sent through the
    module-level ``pipe`` together with the module-level
    ``generation_args``.

    Args:
        q: The question, used verbatim as the user message content.

    Returns:
        The ``'generated_text'`` entry of the first pipeline result. If
        anything in the call or the result lookup raises ``Exception``,
        the string ``"Error: <message>"`` is returned instead, so one
        failing row does not abort a whole ``progress_apply`` pass.
    """
    system_turn = {"role": "system", "content": "ענה על השאלה הבאה בחמישה עד שישה משפטים."}
    user_turn = {"role": "user", "content": q}
    try:
        # The lookup stays inside the try so a malformed result is also reported as text.
        first_result = pipe([system_turn, user_turn], **generation_args)[0]
        answer = first_result['generated_text']
    except Exception as err:
        answer = f"Error: {err}"
    return answer

# Make three passes over the Hebrew questions, saving each pass to its own CSV.
# NOTE(review): generation_args sets do_sample=False, so the three passes
# presumably yield the same text; confirm that repeating them is intended.
for i in range(1, 4):
    # Work on a copy so df_h never gains the answer column.
    df_copy = df_h.copy()
    # Caption the progress bar with the Hebrew run name so it matches the CSV
    # written below (it previously carried the Arabic label).
    tqdm.pandas(desc=f"Generating phi4mini_hebrew_{i}")
    # The generated answers are stored under the column name 'classify'.
    df_copy['classify'] = df_copy['text'].progress_apply(generate_answer)
    # utf-8-sig writes a BOM, presumably so spreadsheet tools detect the encoding.
    df_copy.to_csv(f"phi4mini_hebrew_{i}.csv", index=False, encoding='utf-8-sig')

Device set to use cuda:0
Generating phi4mini_arabic_1: 100%|██████████| 40/40 [07:32<00:00, 11.30s/it]
Generating phi4mini_arabic_2: 100%|██████████| 40/40 [07:37<00:00, 11.44s/it]
Generating phi4mini_arabic_3: 100%|██████████| 40/40 [07:42<00:00, 11.56s/it]
