In [131]:
import pdfplumber
import pandas as pd
import arabic_reshaper
from bidi.algorithm import get_display
from google import genai
from google.genai import types
from dotenv import load_dotenv
import os
# Load variables from a local .env file (if one exists) into the environment
# before the key is looked up.
load_dotenv()

# API key passed to the Gemini client; None when GOOGLE_API_KEY is not set.
key = os.environ.get("GOOGLE_API_KEY")

In [132]:
# Gemini API client, authenticated with the key read from the environment.
client = genai.Client(api_key=key)

# System prompt sent as `system_instruction` by generate_titles(). It asks the
# model to restore the word order inside each column title of a single CSV
# header line, number repeated titles, and reply with one CSV line that has
# the same number of columns as the input.
sys_instruct = """
You are an intelligent assistant designed to organize CSV column titles efficiently in both English and French. Your tasks are as follows:

1. **Reorder the Column Titles:**
   - Given a single CSV line containing column titles, identify and arrange them in their correct order. Handle both English and French column titles effectively.

2. **Return an Ordered CSV Line:**
   - Output the corrected column titles in CSV format as a single text line.

### Example Input (French)
```
Name,Code,1(Mécanique Physique point) du,Moyenne UE,Crédit UE,Algèbre 1,Analyse 1,Algorithmique données structure 1 de et,Moyenne UE
```

### Example Output
```
Name,Code,Mécanique du point Physique 1,Moyenne UE 1,Crédit UE,Algèbre 1,Analyse 1,Algorithmique et structure de données 1,Moyenne UE 2
```

### Guidelines
- Reorder column titles to ensure a natural and correct flow of text in both English and French.
- Maintain logical grouping when ordering the columns.
- **Preserve all columns; do not remove, merge, or omit any columns, even if they appear redundant.**
- **Preserve all words; do not reduce or modify the content within column titles.**
- **Ensure the total number of columns in the output matches the number of columns in the input.**
- **If identical column titles are repeated, add sequential numbering (e.g., `Moyenne UE 1`, `Moyenne UE 2`) — ensure no duplicate titles are removed.**
- **Remove any numbers at the start of a column title and reposition them appropriately.**
- **Remove all newline characters within column names to ensure each column appears as a continuous text string.**
- Pay close attention to French grammar rules when reordering text to ensure proper structure.

### Important Notes
- Ensure the output follows CSV format as a single text line.
- Consistency and clarity are key in the naming convention.
- Do not ask for clarification; complete the task directly based on the provided instructions.


"""

In [133]:
def fix_text_order(text):
    """Flatten a multi-line string into a single line.

    Surrounding whitespace is stripped first, then every remaining newline is
    replaced by a single space. Other whitespace inside the text is kept as is.

    Args:
        text: The string to flatten.

    Returns:
        The flattened, stripped string.
    """
    # Once the ends are stripped, swapping each "\n" for " " is equivalent to
    # splitting on newlines and re-joining the pieces with spaces.
    return text.strip().replace("\n", " ")

def generate_titles(titles):
    """Ask the Gemini model to clean up a CSV line of column titles.

    Args:
        titles: A single string holding the comma-separated column titles.

    Returns:
        The `text` attribute of the model response, produced under the
        `sys_instruct` system prompt.
    """
    request_config = types.GenerateContentConfig(system_instruction=sys_instruct)
    reply = client.models.generate_content(
        model="gemini-2.0-flash",
        config=request_config,
        contents=[titles],
    )
    return reply.text


def reshape_arabic(text):
    """Best-effort reshaping and bidi reordering of a cell's text.

    Args:
        text: The cell content; falsy values (None, "") are returned unchanged.

    Returns:
        `get_display(arabic_reshaper.reshape(text))` for non-empty input, or
        the original `text` when it is falsy or when the conversion raises.
    """
    try:
        if not text:
            return text
        reshaped = arabic_reshaper.reshape(text)
        return get_display(reshaped)
    except Exception:
        # Deliberate fallback: a cell that cannot be reshaped is kept as is.
        return text
    

def remove_newlines(text_list):
    """Strip every newline character from each string of a list, in place.

    Args:
        text_list: A list of strings; it is modified in place.

    Returns:
        The same list object, with "\\n" removed from every element.
    """
    for position, item in enumerate(text_list):
        text_list[position] = item.replace('\n', '')
    return text_list


In [134]:
# Rows collected from every page; the first row is the cleaned-up header.
all_data = []

# Number of tables processed so far; 0 means the header has not been seen yet.
tables_seen = 0

file_name = "Deliberation_2024_2025_L3_S5"

with pdfplumber.open(f"../data/{file_name}.pdf") as pdf:
    for page in pdf.pages:
        table = page.extract_table()
        if not table:
            # No table detected on this page.
            continue

        if tables_seen == 0:
            # First table: drop its leading row; the next row is the header.
            table = table[1:]
            table[0][0] = "Name"
            table[0][1] = "Code"
            # Empty cells come back from pdfplumber as None, and str.join()
            # raises TypeError on None, so normalise them to "" first.
            raw_titles = [cell or "" for cell in table[0]]
            transformed_line = fix_text_order(",".join(raw_titles))
            llm_titles = remove_newlines(generate_titles(transformed_line).split(","))
            if len(llm_titles) == len(raw_titles):
                table[0] = llm_titles
            else:
                # A header of the wrong width would silently misalign the
                # header against the data columns in the CSV, so keep the
                # extracted titles instead of the model's reply.
                print(
                    f"Warning: LLM returned {len(llm_titles)} column titles, "
                    f"expected {len(raw_titles)}; keeping the extracted titles."
                )
                table[0] = raw_titles
        else:
            # Later tables: skip their two leading rows.
            table = table[2:]
        tables_seen += 1

        reshaped_table = [[reshape_arabic(cell) for cell in row] for row in table]
        corrected_table = [
            [fix_text_order(cell) if cell else '' for cell in row]
            for row in reshaped_table
        ]
        all_data.extend(corrected_table)


df = pd.DataFrame(all_data)
# quoting=1 is csv.QUOTE_ALL. header=False because the header row is already
# the first row of the data.
df.to_csv(f"../data/{file_name}.csv", index=False, quoting=1, sep=",", header=False)

print(f"✅ Extracted {len(df)} rows successfully!")

✅ Extracted 218 rows successfully!
