## 1- Import libraries:

In [1]:
import pandas as pd
import numpy as np
import json

## 2- Handle french dataframes:

In [2]:
# Rename this list to be clear it holds tables, not raw text rows
dataframes_list = []

# --- Load French Data ---
try:
    df_parquet = pd.read_parquet('../data/raw/fr_train.parquet', engine='pyarrow')
    print("Parquet Columns:", df_parquet.columns)

    # 1. Extract sentence1 and sentence2
    part1 = df_parquet[['sentence1']].rename(columns={'sentence1': 'text'})
    part2 = df_parquet[['sentence2']].rename(columns={'sentence2': 'text'})

    # 2. Combine them
    df_clean_parquet = pd.concat([part1, part2], ignore_index=True)
    df_clean_parquet=df_clean_parquet[:50000]  # Limit to first 50,000 rows
    # 3. CRITICAL FIX: Label this as 'Français' (since filename is fr_train)
    df_clean_parquet['language'] = 'Français'

    # 4. Drop empty rows
    df_clean_parquet.dropna(subset=['text'], inplace=True)

    # 5. Append the DATAFRAME to the list
    dataframes_list.append(df_clean_parquet)
    print(f"French Parquet loaded. Extracted {len(df_clean_parquet)} sentences.")

except Exception as e:
    print(f"Error loading Parquet: {e}")

french_data = []
try:
    with open('../data/raw/fr.txt', 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 50_000:
                break
            if line.strip():
                french_data.append({'text': line.strip(), 'language': 'Français'})
    
    df_txt = pd.DataFrame(french_data)
    dataframes_list.append(df_txt)
    print(f"French Text loaded. Extracted {len(df_txt)} sentences.")
    print("Sample French Data:\n", df_txt.tail())
except Exception as e:
    print(f"Error loading TXT: {e}")

Parquet Columns: Index(['sentence1', 'sentence2', 'score', 'dataset'], dtype='object')
French Parquet loaded. Extracted 24454 sentences.
French Text loaded. Extracted 50000 sentences.
Sample French Data:
                                                     text  language
49995     Comment vous écrivez sueur en langue chaouie ?  Français
49996     Comment il doit écrire mot en langue hybride ?  Français
49997  Je me suis perdu parce que je n'avais pas de c...  Français
49998                 Elle n'a pas déménagé à Guerrouma.  Français
49999                             Tom est prêt pour toi.  Français


## 3-Handle english dataframes:

In [3]:
# --- Load English Data ---
english_data = []
try:
    with open('../data/raw/en.txt', 'r', encoding='utf-8') as f:
        for i,line in enumerate(f):
            if i >= 50_000:
                break
            if line.strip():
                english_data.append({'text': line.strip(), 'language': 'English'})
    
    df_txt = pd.DataFrame(english_data)
    dataframes_list.append(df_txt)
    print(f"English Text loaded. Extracted {len(df_txt)} sentences.")
    print("Sample English Data:\n", df_txt.tail())
except Exception as e:
    print(f"Error loading TXT: {e}")

jsonl_data = []

try:
    with open('../data/raw/train_en.jsonl', 'r', encoding='utf-8') as f:
        for i,line in f:
            if i >= 50_000:
                break
            if line.strip(): # Check if line is not empty
                # 1. Parse the JSON line into a Python dictionary
                json_obj = json.loads(line)
                
                # 2. Extract the text. 
                if 'sentence' in json_obj:
                    text_value = json_obj['sentence']
                    
                    # 3. Append to your list
                    jsonl_data.append({'text': text_value, 'language': 'English'})
    
    # 4. Convert to DataFrame and append to your main list
    df_jsonl = pd.DataFrame(jsonl_data)
    dataframes_list.append(df_jsonl)
    
    print(f"JSONL loaded. Extracted {len(df_jsonl)} sentences.")
    print("Sample JSONL Data:\n", df_jsonl.tail())

except Exception as e:
    print(f"Error loading JSONL: {e}")

English Text loaded. Extracted 50000 sentences.
Sample English Data:
                                                     text language
49995                          Yanni had a Berber class.  English
49996  Skura never encouraged or acknowledged Yanni's...  English
49997          I posted this picture on my social media.  English
49998                          "Can I have your number?"  English
49999                                             "Why?"  English
Error loading JSONL: too many values to unpack (expected 2)


## 4- Handle Darija dataframes: 

In [4]:
# List to hold all Darija data chunks
darija_chunks = []

# ---------------------------------------------------------
# 1. Load 'darija_sentences.csv'
# ---------------------------------------------------------
try:
    # Assuming this file has no header and is just lines of text
    # header=None ensures we don't lose the first sentence
    # names=['text'] names the column immediately
    df1 = pd.read_csv('../data/raw/darija_sentences.csv', nrows=50000, header=None, names=['text'], encoding='utf-8')
    df1['language'] = 'Darija'
    darija_chunks.append(df1)
    print(f"File 1 loaded: {len(df1)} rows.")
except Exception as e:
    print(f"Error loading File 1: {e}")

# ---------------------------------------------------------
# 2. Load 'sentences_darija2.csv' (FIXED LOGIC)
# ---------------------------------------------------------
try:
    df2 = pd.read_csv('../data/raw/sentences_darija2.csv', nrows=50000)
    
    # STRATEGY: We don't know the column name, so we grab the first one.
    first_col = df2.columns[0] 
    df2 = df2.rename(columns={first_col: 'text'})
    
    # Clean and Label
    df2['language'] = 'Darija'
    df2 = df2[['text', 'language']] # Keep only relevant columns
    
    darija_chunks.append(df2)
    print(f"File 2 loaded: {len(df2)} rows.")
except Exception as e:
    print(f"Error loading File 2: {e}")

# ---------------------------------------------------------
# 3. Load 'train_darija.csv'
# ---------------------------------------------------------
try:
    df3 = pd.read_csv('../data/raw/train_darija.csv', nrows=50000)
    
    if "sentence" in df3.columns:
        df3 = df3.rename(columns={'sentence': 'text'})
        df3['language'] = 'Darija'
        df3 = df3[['text', 'language']]
        darija_chunks.append(df3)
        print(f"File 3 loaded: {len(df3)} rows.")
    else:
        print("File 3 skipped: Column 'sentence' not found.")
except Exception as e:
    print(f"Error loading File 3: {e}")

# ---------------------------------------------------------
# 4. Load Parquet
# ---------------------------------------------------------
try:
    df_parquet = pd.read_parquet('../data/raw/darija_sentences_3.parquet', engine='pyarrow')
    
    # Select and Rename
    if 'Arabizi' in df_parquet.columns:
        df_parquet = df_parquet[['Arabizi']].rename(columns={'Arabizi': 'text'})
        df_parquet['language'] = 'Darija'
        df_parquet = df_parquet.head(50000) # Limit to 50k
        
        darija_chunks.append(df_parquet)
        print(f"Parquet loaded: {len(df_parquet)} rows.")
    else:
        print("Parquet skipped: Column 'Arabizi' not found.")
except Exception as e:
    print(f"Error loading Parquet: {e}")

# ---------------------------------------------------------
# FINAL MERGE
# ---------------------------------------------------------
if darija_chunks:
    # Combine all the chunks into one Darija DataFrame
    df_final_darija = pd.concat(darija_chunks, ignore_index=True)
    
    # Drop any empty rows just in case
    df_final_darija.dropna(subset=['text'], inplace=True)
    
    # Append to your MAIN list (that holds English/French too)
    dataframes_list.append(df_final_darija)
    
    print("\n--------------------------------------")
    print(f"Total Darija Loaded: {len(df_final_darija)} sentences")
    print("Sample:\n", df_final_darija.tail())
else:
    print("No Darija data was loaded.")

File 1 loaded: 48840 rows.
File 2 loaded: 10020 rows.
File 3 loaded: 50000 rows.
Parquet loaded: 850 rows.

--------------------------------------
Total Darija Loaded: 109710 sentences
Sample:
                                                      text language
109705  hir thenna, 3amrak t9dar t9na3ni anna Raja 7se...   Darija
109706  hahaha rak ma3arf walo flkora, tatkon l3bti 9e...   Darija
109707  mafkertich tdir chi mchro3 asahbi, malk m3gaz,...   Darija
109708  mcha l sou9 it9eda, fach irje3 ghadi ngolha li...   Darija
109709  nti hi goliha liha o machi choghlek f ach ghad...   Darija


In [5]:
# --- FINAL MERGE ---
if dataframes_list:
    df = pd.concat(dataframes_list, ignore_index=True)
    print("\nFinal Dataset Shape:", df.shape)
    print(df.tail())
    print(df["language"].unique())
else:
    print("No data loaded.")

# Francais : 600 000 sentences
# English : 1 850 000 sentences
# Darija : 1 300 000 sentences
# total=2 500 000 sentences



Final Dataset Shape: (234164, 2)
                                                     text language
234159  hir thenna, 3amrak t9dar t9na3ni anna Raja 7se...   Darija
234160  hahaha rak ma3arf walo flkora, tatkon l3bti 9e...   Darija
234161  mafkertich tdir chi mchro3 asahbi, malk m3gaz,...   Darija
234162  mcha l sou9 it9eda, fach irje3 ghadi ngolha li...   Darija
234163  nti hi goliha liha o machi choghlek f ach ghad...   Darija
['Français' 'English' 'Darija']


## Convert dataset to csv file:

In [7]:
df.to_csv('../data/processed/combined_dataset.csv', index=False)