In [1]:
import pdfplumber
import pandas as pd
import arabic_reshaper
from bidi.algorithm import get_display
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os
# Load variables from a local .env file (python-dotenv) so the API key
# does not have to be hardcoded in the notebook.
load_dotenv()

# Gemini API key; this is None when GOOGLE_API_KEY is not set.
key = os.environ.get("GOOGLE_API_KEY")

In [2]:
# Gemini client shared by the helper functions below; authenticated with the
# key read from the environment in the first cell.
client = genai.Client(api_key=key)
# System prompt used by generate_titles(): asks the model to put the words of
# each scrambled column title back in natural order and to return the result
# as one CSV line with the same number of columns as the input.
# NOTE(review): the string is sent to the model verbatim, so any edit to it
# (including whitespace) can change the generated header.
sys_instruct = """
You are an intelligent assistant designed to organize CSV column titles efficiently in both English and French. Your tasks are as follows:

1. **Reorder the Column Titles:**
   - Given a single CSV line containing column titles, identify and arrange them in their correct order. Handle both English and French column titles effectively.

2. **Return an Ordered CSV Line:**
   - Output the corrected column titles in CSV format as a single text line.

### Example Input (French)
```
Name,Code,1(Mécanique Physique point) du,Moyenne UE,Crédit UE,Algèbre 1,Analyse 1,Algorithmique données structure 1 de et,Moyenne UE
```

### Example Output
```
Name,Code,Mécanique du point Physique 1,Moyenne UE 1,Crédit UE,Algèbre 1,Analyse 1,Algorithmique et structure de données 1,Moyenne UE 2
```

### Guidelines
- Reorder column titles to ensure a natural and correct flow of text in both English and French.
- Maintain logical grouping when ordering the columns.
- **Preserve all columns; do not remove, merge, or omit any columns, even if they appear redundant.**
- **Preserve all words; do not reduce or modify the content within column titles.**
- **Ensure the total number of columns in the output matches the number of columns in the input.**
- **If identical column titles are repeated, add sequential numbering (e.g., `Moyenne UE 1`, `Moyenne UE 2`) — ensure no duplicate titles are removed.**
- **Remove any numbers at the start of a column title and reposition them appropriately.**
- **Remove all newline characters within column names to ensure each column appears as a continuous text string.**
- Pay close attention to French grammar rules when reordering text to ensure proper structure.

### Important Notes
- Ensure the output follows CSV format as a single text line.
- Consistency and clarity are key in the naming convention.
- Do not ask for clarification; complete the task directly based on the provided instructions.


"""

In [3]:
def fix_text_order(text):
    """Collapse a multi-line string into a single line.

    Leading and trailing whitespace is trimmed, then every remaining
    newline is replaced by a single space.

    Args:
        text: The string to flatten (e.g. a table cell that wraps over
            several lines).

    Returns:
        The flattened, trimmed string.
    """
    return text.strip().replace("\n", " ").strip()

def generate_titles(titles):
    """Ask the Gemini model to reorder a scrambled CSV header line.

    The request is sent to the "gemini-2.0-flash" model through the
    module-level ``client``, with ``sys_instruct`` as the system instruction.

    Args:
        titles: The raw header line, passed to the model as the only content
            item.

    Returns:
        The ``text`` attribute of the model response.
    """
    request_config = types.GenerateContentConfig(system_instruction=sys_instruct)
    llm_response = client.models.generate_content(
        model="gemini-2.0-flash",
        config=request_config,
        contents=[titles],
    )
    return llm_response.text


def reshape_arabic(text):
    """Reshape text with arabic_reshaper and reorder it with get_display.

    This is best-effort: falsy input (None, empty string) is returned
    untouched, and if reshaping raises any exception the original value is
    returned instead of propagating the error.

    Args:
        text: A cell value; may be None or empty.

    Returns:
        The output of ``get_display`` applied to the reshaped text, or
        ``text`` itself when it is falsy or processing fails.
    """
    try:
        # The truthiness test stays inside the try so that any failure,
        # wherever it happens, falls back to the unmodified input.
        if not text:
            return text
        reshaped = arabic_reshaper.reshape(text)
        return get_display(reshaped)
    except Exception:
        return text
    

def remove_newlines(text_list):
    """Delete every newline character from each string of a list, in place.

    Args:
        text_list: A list of strings; it is modified element by element.

    Returns:
        The same list object that was passed in, after modification.
    """
    for position, entry in enumerate(text_list):
        text_list[position] = entry.replace('\n', '')
    return text_list


In [5]:
# Every extracted row, in page order. The first entry is the header row
# produced by the LLM; all following entries are data rows.
all_data = []

# Number of pages that have yielded a table so far. 0 means the header row
# has not been processed yet.
i = 0

file_name = "Deliberation_2024_2025_L2_S3"

with pdfplumber.open(f"../data/{file_name}.pdf") as pdf:
    for page_num, page in enumerate(pdf.pages, start=1):
        table = page.extract_table()
        if table:
            if i == 0:
                # First table: drop the leading row and use the next one as
                # the header. NOTE(review): presumably the dropped row is a
                # title/banner line of the PDF - confirm against the document.
                table = table[1:]
                table[0][0] = "Name"
                table[0][1] = "Code"
                expected_columns = len(table[0])
                # Empty header cells can come back as None (the data rows
                # below already guard against that); str.join() would raise
                # TypeError on them, so map them to empty strings first.
                transformed_line = fix_text_order(
                    ",".join(cell or "" for cell in table[0])
                )
                llm_process_line = generate_titles(transformed_line).split(",")
                table[0] = remove_newlines(llm_process_line)
                # The model is asked to keep the column count, but nothing
                # enforces it. Surface a mismatch instead of silently writing
                # a header that does not line up with the data columns.
                if len(table[0]) != expected_columns:
                    print(
                        f"⚠️ Header has {len(table[0])} columns after LLM "
                        f"processing, expected {expected_columns}; "
                        "check the first CSV row."
                    )
            else:
                # Later tables: drop the first two rows. NOTE(review):
                # presumably the repeated banner and header lines - confirm.
                table = table[2:]
            i += 1
            reshaped_table = [[reshape_arabic(cell) for cell in row] for row in table]
            # Flatten multi-line cells and turn empty/None cells into ''.
            corrected_table = [
                [fix_text_order(cell) if cell else '' for cell in row]
                for row in reshaped_table
            ]
            all_data.extend(corrected_table)


df = pd.DataFrame(all_data)
# header=False because the header is already the first data row of `df`;
# quoting=1 is csv.QUOTE_ALL, i.e. every field is quoted.
df.to_csv(f"../data/{file_name}.csv", index=False, quoting=1, sep=",", header=False)

print(f"✅ Extracted {len(df)} rows successfully!")

✅ Extracted 376 rows successfully!


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,Name,Code,Systèmes d’information,Théorie des graphes,Algorithmique et Structures de Données (ASD),Ordinateurs Architecture des (AO),Mathématique Logique,Numériques Méthodes,Langue Étrangère 2,Moyenne UE 1,Crédit UE 1,Moyenne UE 2,Crédit UE 2,Moyenne UE 3,Crédit UE 3,Semestre Crédits du,Moyenne Semestre du,,
1,1 - ABDAOUI AYA,212136023497,13.3,6.2,10.46,09,6.55,8.42,7.49,00,10.1,8.1,9.1,04,10.0,10.0,02,15,8.98
2,2 - ABDESMED CHOUROUK,232336177720,13.9,8.75,11.84,09,4.38,8.5,6.44,00,11.95,11.4,11.68,08,14.0,14.0,02,19,9.91
3,3 - ABDI SAMY ZAKARIA,222236128502,0.0,0.0,0.0,00,0.0,0.0,0.0,00,0.0,0.0,0.0,00,0.0,0.0,00,00,0.0
4,4 - ABDOUN CELIA NOOR,232336124105,12.2,4.07,8.95,05,3.34,7.5,5.42,00,6.6,4.2,5.4,00,16.0,16.0,02,07,7.18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371,371 - ZEROUAL MOUAD,232336152706,1.65,3.48,2.38,00,4.8,2.4,3.6,00,4.71,3.1,3.91,00,12.0,12.0,02,02,3.82
372,372 - ZEROUGA ARIDJ,232336146815,12.85,7.8,10.83,09,5.9,6.05,5.98,00,12.52,10.95,11.74,08,17.0,17.0,02,19,9.62
373,373 - ZIATE ABIR,212136023755,11.25,0.0,6.75,05,0.0,1.95,0.98,00,2.57,4.2,3.39,00,16.5,16.5,02,07,4.36
374,374 - ZIRAOUI SARA INES,232336130612,0.0,0.0,0.0,00,0.0,0.0,0.0,00,0.23,0.0,0.12,00,0.0,0.0,00,00,0.03
