In [114]:
import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype, is_object_dtype
import os
from dotenv import load_dotenv
from supabase import create_client, Client
import psycopg2
from psycopg2 import sql
import unicodedata
from google import genai
from google.genai import types

In [115]:
load_dotenv()

file_name = "Deliberation_2024_2025_L3_S5"
url: str = os.getenv("SUPABASE_URL")
key: str = os.getenv("SUPABASE_KEY")
key_llm = os.getenv("GOOGLE_API_KEY")
db_url: str = os.getenv("DATABASE_URL")
client: Client = create_client(url, key)
client_llm = genai.Client(api_key=key_llm)

In [116]:
data = pd.read_csv(f"../data/{file_name}_postprocess.csv")

def replace_french_e(text):
    normalized_text = unicodedata.normalize('NFD', text)
    transformed_text = ''.join(char for char in normalized_text if not unicodedata.combining(char))
    return transformed_text

def remove_newlines(text_list):
    for i in range(len(text_list)):
        text_list[i] = text_list[i].replace('\n', '')
    return text_list

In [117]:
sys_instruct = """
You are a text formatting assistant specializing in data preparation for databases. When given an array of elements, follow these precise instructions:

- Accept any array of string elements.
- Convert all characters to lowercase.
- Replace all spaces and special characters (e.g., apostrophes) with underscores (`_`).
- **Ensure the final output is a single, strictly comma-separated string with no spaces between elements and no trailing comma.**

**Example Input:**
Name,Code,Compilation,Système d'exploitation 2,Moyenne UE 1,Crédit UE 1,Génie Logiciel 2,Interface Machine Homme,Moyenne UE 2,Crédit UE 2,Probabilités et Statistiques,Programmation Linéaire,Moyenne UE 3,Crédit UE 3,Economie et veille stratégique numérique,Moyenne UE 4,Crédit UE 4,Crédits du Semestre,Moyenne du Semestre

**Expected Output:**
`name,code,compilation,systeme_d_exploitation_2,moyenne_ue_1,credit_ue_1,genie_logiciel_2,interface_machine_homme,moyenne_ue_2,credit_ue_2,probabilites_et_statistiques,programmation_lineaire,moyenne_ue_3,credit_ue_3,economie_et_veille_strategique_numerique,moyenne_ue_4,credit_ue_4,credits_du_semestre,moyenne_du_semestre`

Respond only with the formatted output unless otherwise instructed.
"""

def generate_db_names(names):
    response = client_llm.models.generate_content(
    model="gemini-2.0-flash",
    config=types.GenerateContentConfig(
        system_instruction=sys_instruct),
    contents=[names],
    )
    return response.text

In [118]:
# creating mappings & db column names
db_cols = generate_db_names(",".join(data.columns.to_list())).split(",")
db_cols = remove_newlines(db_cols)
cols = data.columns.to_list()
mappings = dict(zip(cols, db_cols))
print(mappings)

{'Name': 'name', 'Code': 'code', 'Compilation': 'compilation', "Système d'exploitation 2": 'systeme_d_exploitation_2', 'Moyenne UE 1': 'moyenne_ue_1', 'Crédit UE 1': 'credit_ue_1', 'Génie Logiciel 2': 'genie_logiciel_2', 'Interface Machine Homme': 'interface_machine_homme', 'Moyenne UE 2': 'moyenne_ue_2', 'Crédit UE 2': 'credit_ue_2', 'Probabilités et Statistiques': 'probabilites_et_statistiques', 'Programmation Linéaire': 'programmation_lineaire', 'Moyenne UE 3': 'moyenne_ue_3', 'Crédit UE 3': 'credit_ue_3', 'Economie et veille numérique stratégique': 'economie_et_veille_numerique_strategique', 'Moyenne UE 4': 'moyenne_ue_4', 'Crédit UE 4': 'credit_ue_4', 'Crédits du Semestre': 'credits_du_semestre', 'Moyenne du Semestre': 'moyenne_du_semestre'}


In [119]:
schema = []
table_cols = data.columns.to_list()

for col in table_cols:
    is_int = is_integer_dtype(data[col])
    is_float = is_float_dtype(data[col])
    is_object = is_object_dtype(data[col])

    if is_float:
        schema.append([mappings[col], "float8"])
    if is_int:
        schema.append([mappings[col], "int8"])
    if is_object:
        schema.append([mappings[col], "text"])


dict_schema = dict(schema)
# table name 
table_name = f"analysis_{file_name}"
columns = ", ".join([f"{col} {dtype}" for col, dtype in dict_schema.items()])

# table creation query
create_table_query = f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
    {columns}
)
"""

# transforming col names
data.columns = [mappings[col] for col in data.columns]

print(create_table_query)
print(dict_schema)


CREATE TABLE IF NOT EXISTS analysis_Deliberation_2024_2025_L3_S5 (
    id UUID DEFAULT gen_random_uuid() PRIMARY KEY,
    name text, code text, compilation float8, systeme_d_exploitation_2 float8, moyenne_ue_1 float8, credit_ue_1 int8, genie_logiciel_2 float8, interface_machine_homme float8, moyenne_ue_2 float8, credit_ue_2 int8, probabilites_et_statistiques float8, programmation_lineaire float8, moyenne_ue_3 float8, credit_ue_3 int8, economie_et_veille_numerique_strategique float8, moyenne_ue_4 float8, credit_ue_4 int8, credits_du_semestre int8, moyenne_du_semestre float8
)

{'name': 'text', 'code': 'text', 'compilation': 'float8', 'systeme_d_exploitation_2': 'float8', 'moyenne_ue_1': 'float8', 'credit_ue_1': 'int8', 'genie_logiciel_2': 'float8', 'interface_machine_homme': 'float8', 'moyenne_ue_2': 'float8', 'credit_ue_2': 'int8', 'probabilites_et_statistiques': 'float8', 'programmation_lineaire': 'float8', 'moyenne_ue_3': 'float8', 'credit_ue_3': 'int8', 'economie_et_veille_numeriqu

In [120]:
conn_details = {
    "dbname": os.getenv("DB_NAME"),
    "user": os.getenv("DB_USER"),
    "password": os.getenv("DB_PASSWORD"),
    "host": os.getenv("DB_HOST"),
    "port": os.getenv("DB_PORT"),
    "sslmode": os.getenv("DB_SSLMODE")
}



try:
    conn = psycopg2.connect(**conn_details)
    print("Connection successful!")

    cursor = conn.cursor()
    cursor.execute(
        sql.SQL("""
            SELECT EXISTS (
                SELECT 1 FROM information_schema.tables
                WHERE table_name = %s
            );
        """),
        [table_name]
    )

    table_exists = cursor.fetchone()[0]

    if table_exists:
        print(f"Table '{table_name}' already exists.")
    else:
        cursor.execute(create_table_query)
        conn.commit()
        print(f"Table '{table_name}' created successfully.")

    cursor.close()
    conn.close()

except Exception as e:
    print(f"Failed to connect or execute query: {e}")


Connection successful!
Table 'analysis_Deliberation_2024_2025_L3_S5' created successfully.


In [123]:
# inserting data
i = 0
records = data.to_dict(orient="records")
response = client.table(table_name.lower()).insert(records).execute()
print("Records inserted successfully")

Records inserted successfully
