In [1]:
import csv
import unicodedata
import warnings
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

# ========================================
# LOADING THE SOURCE DATA
# ========================================
print("üìã Chargement des donn√©es sources...")





def detect_separator(file_path, encoding="utf-8-sig", delimiters=",;\t|"):
    """Detect the field separator of a CSV file.

    The first 2048 characters of the file are handed to ``csv.Sniffer``,
    which picks the delimiter among ``delimiters``.

    Args:
        file_path: Path of the CSV file to inspect.
        encoding: Text encoding used to open the file.
        delimiters: Candidate separator characters. Restricting the
            candidates keeps the sniffer from electing an ordinary
            character (a letter, a space, ...) as the separator. Pass
            ``None`` to let the sniffer consider every character.

    Returns:
        The detected delimiter (a one-character string).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        csv.Error: If no delimiter could be determined; the message names
            the offending file.
    """
    sample_size = 2048
    with open(file_path, 'r', encoding=encoding) as f:
        sample = f.read(sample_size)

    # A sample that fills the whole buffer most likely ends in the middle of
    # a row. That partial row has fewer separators than the others and can
    # make the sniffer give up on files with long lines, so drop it.
    if len(sample) == sample_size and '\n' in sample:
        sample = sample[:sample.rfind('\n') + 1]

    try:
        return csv.Sniffer().sniff(sample, delimiters=delimiters).delimiter
    except csv.Error as exc:
        raise csv.Error(
            f"Could not determine the separator of {file_path!r}: {exc}"
        ) from exc

# ======================
# Load the source data, sniffing each file's separator first
# ======================
print("üìã D√©tection des s√©parateurs et chargement...")


def _charger_csv(chemin, nom_fichier, dtype):
    """Sniff the separator of one CSV file, report it, then load the file.

    Args:
        chemin: Path of the CSV file.
        nom_fichier: File name shown in the progress message.
        dtype: Column -> dtype mapping forwarded to ``pd.read_csv``
            (identifier columns are read as ``str``).

    Returns:
        A ``(DataFrame, separator)`` tuple.
    """
    separateur = detect_separator(chemin)
    print(f"   ‚Ü™ S√©parateur {nom_fichier} d√©tect√© : '{separateur}'")
    table = pd.read_csv(
        chemin,
        sep=separateur,
        encoding="utf-8-sig",
        dtype=dtype,
    )
    return table, separateur


try:
    # clients.csv
    clients_path = r"C:\Users\MSI\Desktop\Attijari_bank\stage\clients.csv"
    clients_df, sep_clients = _charger_csv(
        clients_path, "clients.csv", {"id_client": str}
    )

    # comptes.csv
    comptes_path = r"C:\Users\MSI\Desktop\Attijari_bank\stage\comptes.csv"
    comptes_df, sep_comptes = _charger_csv(
        comptes_path, "comptes.csv", {"id_compte": str, "id_client": str}
    )

    # operations.csv
    operations_path = r"C:\Users\MSI\Desktop\Attijari_bank\stage\operations.csv"
    operations_df, sep_operations = _charger_csv(
        operations_path, "operations.csv", {"id_compte": str}
    )

    print("‚úÖ Donn√©es sources charg√©es avec succ√®s")
    print(f"   - Clients: {len(clients_df):,} lignes")
    print(f"   - Comptes: {len(comptes_df):,} lignes")
    print(f"   - Op√©rations: {len(operations_df):,} lignes")

except FileNotFoundError as e:
    print(f"‚ùå Erreur de chargement: {e}")
    # Re-raise rather than calling exit(1): the run still stops here, but the
    # interactive session (and the frames already loaded) is not torn down
    # and the traceback stays visible in the cell output.
    raise

# ========================================
# DATE CONVERSION
# ========================================
print("\nüóìÔ∏è Conversion des dates...")

# Account opening dates use an explicit day/month/year format; anything that
# does not parse is coerced to NaT.
comptes_df['date_ouverture'] = pd.to_datetime(comptes_df['date_ouverture'], format='%d/%m/%Y', errors='coerce')

# Operation dates are parsed without an explicit format.
# NOTE(review): confirm the source format of date_operation.
operations_df['date_operation'] = pd.to_datetime(
    operations_df['date_operation'],
    errors='coerce',
)

# Count the rows left without a usable date, then discard them.
dates_invalides_comptes = comptes_df['date_ouverture'].isna().sum()
dates_invalides_ops = operations_df['date_operation'].isna().sum()

if dates_invalides_comptes > 0:
    print(f"‚ö†Ô∏è  {dates_invalides_comptes} dates d'ouverture invalides supprim√©es")
    comptes_df = comptes_df[comptes_df['date_ouverture'].notna()]

if dates_invalides_ops > 0:
    print(f"‚ö†Ô∏è  {dates_invalides_ops} dates d'op√©ration invalides supprim√©es")
    operations_df = operations_df[operations_df['date_operation'].notna()]

# ========================================
# 1. CLIENT DIMENSION (DIM_CLIENTS)
# ========================================
print("\nüë• Cr√©ation de dim_clients...")

# One row per client: keep the first occurrence of every id_client.
dim_clients = clients_df.drop_duplicates(subset=['id_client']).copy()

# Replace missing descriptive attributes with an explicit placeholder.
valeurs_par_defaut = {
    'sexe': 'Non sp√©cifi√©',
    'ville': 'Non sp√©cifi√©e',
    'emploi': 'Non sp√©cifi√©',
    'statut_marital': 'Non sp√©cifi√©',
    'niveau_education': 'Non sp√©cifi√©',
}
for colonne, valeur in valeurs_par_defaut.items():
    dim_clients[colonne] = dim_clients[colonne].fillna(valeur)

# Age bands. The last band is open-ended so that ages above 100 are labelled
# '56+' instead of falling outside every bin (which would yield NaN).
dim_clients['tranche_age'] = pd.cut(
    dim_clients['age'],
    bins=[0, 25, 35, 45, 55, float('inf')],
    labels=['18-25', '26-35', '36-45', '46-55', '56+']
)

# Salary bands. pd.cut intervals are left-open by default, so
# include_lowest=True is required for a salary of exactly 0 to land in
# '0-1000' rather than NaN.
dim_clients['tranche_salaire'] = pd.cut(
    dim_clients['salaire'],
    bins=[0, 1000, 2500, 5000, 10000, float('inf')],
    labels=['0-1000', '1001-2500', '2501-5000', '5001-10000', '10000+'],
    include_lowest=True
)

# Final column order of the dimension.
dim_clients = dim_clients[[
    'id_client', 'type_client', 'sexe', 'age', 'tranche_age',
    'date_naissance', 'salaire', 'tranche_salaire', 'emploi',
    'statut_marital', 'niveau_education', 'ville'
]]

doublons_clients = len(clients_df) - len(dim_clients)
print(f"   ‚úÖ {len(dim_clients):,} clients uniques ({doublons_clients} doublons supprim√©s)")

# ========================================
# 2. ACCOUNT DIMENSION (DIM_COMPTES)
# ========================================
print("\nüè¶ Cr√©ation de dim_comptes...")

# One row per account: keep the first occurrence of every id_compte.
dim_comptes = comptes_df.drop_duplicates(subset=['id_compte']).copy()

# Any column not listed in the final selection below (e.g. an
# id_date_ouverture column, if the source has one) is left out of the
# dimension; no explicit drop is needed.

# Replace missing descriptive attributes with an explicit placeholder.
dim_comptes['type_carte'] = dim_comptes['type_carte'].fillna('Aucune carte')
dim_comptes['agence'] = dim_comptes['agence'].fillna('Agence non sp√©cifi√©e')

# Account age in whole days, measured against the time of the run.
maintenant = datetime.now()
dim_comptes['anciennete_compte'] = (maintenant - dim_comptes['date_ouverture']).dt.days

# pd.cut intervals are left-open by default: include_lowest=True keeps an
# account opened today (0 days) in '< 1 an' instead of NaN. Negative ages
# (opening date in the future) still fall outside every bin.
dim_comptes['tranche_anciennete'] = pd.cut(
    dim_comptes['anciennete_compte'],
    bins=[0, 365, 1095, 2190, float('inf')],
    labels=['< 1 an', '1-3 ans', '3-6 ans', '6+ ans'],
    include_lowest=True
)

# Same reasoning for balances: an initial balance of exactly 0 belongs to
# '0-500'. Negative balances remain unbinned (NaN).
dim_comptes['tranche_solde'] = pd.cut(
    dim_comptes['solde_initial'],
    bins=[0, 500, 2000, 10000, 50000, float('inf')],
    labels=['0-500', '501-2000', '2001-10000', '10001-50000', '50000+'],
    include_lowest=True
)

# Final column order of the dimension (type_client is not part of it).
dim_comptes = dim_comptes[[
    'id_compte', 'id_client', 'type_compte', 'type_carte',
    'date_ouverture', 'solde_initial', 'tranche_solde',
    'etat_compte', 'eligible_chequier', 'deja_cheque', 'demande_cheque',
    'agence', 'anciennete_compte', 'tranche_anciennete'
]]

doublons_comptes = len(comptes_df) - len(dim_comptes)
print(f"   ‚úÖ {len(dim_comptes):,} comptes uniques ({doublons_comptes} doublons supprim√©s)")

# ========================================
# 3. TIME DIMENSION (DIM_DATE)
# ========================================
print("\nüìÖ Cr√©ation de dim_date (2022-2024)...")

# One row per calendar day over the covered period.
start_date, end_date = datetime(2022, 1, 1), datetime(2024, 12, 31)
date_range = pd.date_range(start_date, end_date, freq='D')
dim_date = pd.DataFrame({'date_calendrier': date_range})

# Calendar attributes, all derived from the date column.
_cal = dim_date['date_calendrier'].dt
dim_date['id_date'] = _cal.strftime('%Y%m%d')
dim_date['jour'] = _cal.day
dim_date['mois'] = _cal.month
dim_date['annee'] = _cal.year
dim_date['trimestre'] = _cal.quarter
dim_date['semestre'] = np.where(dim_date['mois'] > 6, 2, 1)
dim_date['jour_semaine'] = _cal.dayofweek + 1  # 1 = Monday ... 7 = Sunday
dim_date['numero_semaine'] = _cal.isocalendar().week
dim_date['jour_annee'] = _cal.dayofyear

# French day and month names, translated from the English names pandas returns.
_jours_fr = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ['Lundi', 'Mardi', 'Mercredi', 'Jeudi', 'Vendredi', 'Samedi', 'Dimanche'],
))
_mois_fr = dict(zip(
    ['January', 'February', 'March', 'April', 'May', 'June',
     'July', 'August', 'September', 'October', 'November', 'December'],
    ['Janvier', 'F√©vrier', 'Mars', 'Avril', 'Mai', 'Juin',
     'Juillet', 'Ao√ªt', 'Septembre', 'Octobre', 'Novembre', 'D√©cembre'],
))
dim_date['nom_jour'] = _cal.day_name(locale='C').map(_jours_fr)
dim_date['nom_mois'] = _cal.month_name(locale='C').map(_mois_fr)

# Boolean flags: weekend, first five days of the month, day 25 onwards.
dim_date['est_weekend'] = dim_date['jour_semaine'].ge(6)
dim_date['est_debut_mois'] = dim_date['jour'].le(5)
dim_date['est_fin_mois'] = dim_date['jour'].ge(25)

# Human-readable period labels.
_annee_txt = dim_date['annee'].astype(str)
dim_date['nom_trimestre'] = 'T' + dim_date['trimestre'].astype(str) + ' ' + _annee_txt
dim_date['nom_semestre'] = 'S' + dim_date['semestre'].astype(str) + ' ' + _annee_txt
dim_date['mois_annee'] = dim_date['nom_mois'] + ' ' + _annee_txt

print(f"   ‚úÖ {len(dim_date):,} dates cr√©√©es (du {start_date:%d/%m/%Y} au {end_date:%d/%m/%Y})")

# ========================================
# 4. OPERATION TYPE DIMENSION (DIM_TYPE_OPERATION)
# ========================================
print("\nüí≥ Cr√©ation de dim_type_operation...")

# Distinct, non-missing operation types in sorted order.
types_operations_uniques = operations_df['type_operation'].dropna().unique().tolist()
types_operations_uniques.sort()

# Surrogate keys are assigned from 1 in that sorted order.
nb_types_operations = len(types_operations_uniques)
dim_type_operation = pd.DataFrame({
    'id_type_operation': range(1, nb_types_operations + 1),
    'type_operation': types_operations_uniques,
})

# Ajout de colonnes descriptives
def _cle_recherche(type_op):
    """Return ``type_op`` lower-cased with diacritics removed.

    The keyword lists below are written without accents, so the label has to
    be folded the same way: otherwise 'Dépôt' would never match 'depot' and
    would silently end up in the default category.
    """
    decompose = unicodedata.normalize('NFKD', type_op.lower())
    return ''.join(c for c in decompose if not unicodedata.combining(c))


def _premiere_categorie(cle, regles, defaut):
    """Return the label of the first rule with a keyword found in ``cle``.

    ``regles`` is an ordered sequence of ``(keywords, label)`` pairs; the
    order matters because the first matching rule wins.
    """
    for mots, categorie in regles:
        if any(mot in cle for mot in mots):
            return categorie
    return defaut


# Direction rules: debit keywords are tested before credit keywords.
_REGLES_SENS = (
    (('retrait', 'achat', 'virement sortant', 'prelevement', 'frais',
      'commission', 'paiement', 'transfert international sortant',
      'emission'), 'DEBIT'),
    (('depot', 'virement entrant', 'salaire', 'remboursement',
      'transfert international entrant', 'interets', 'prime', 'allocation',
      'pension', 'indemnite', 'cashback', 'bonus', 'encaissement'), 'CREDIT'),
)

# Channel rules, in priority order.
_REGLES_CANAL = (
    (('dab',), 'DAB'),
    (('tpe', 'carte'), 'TPE'),
    (('ligne',), 'Internet'),
    (('agence',), 'Agence'),
    (('cheque',), 'Ch√®que'),
    (('automatique',), 'Automatique'),
)

# Nature rules, in priority order.
_REGLES_NATURE = (
    (('salaire', 'prime', 'allocation', 'pension', 'indemnite'), 'Revenus'),
    (('supermarche', 'restaurant', 'carburant', 'pharmacie'), 'D√©penses courantes'),
    (('frais', 'commission'), 'Frais bancaires'),
    (('virement', 'transfert'), 'Virements'),
    (('depot', 'retrait'), 'Esp√®ces'),
    (('cheque',), 'Ch√®ques'),
)


def categoriser_operation(type_op):
    """Classify an operation type label as 'DEBIT', 'CREDIT' or 'AUTRE'.

    Matching is a case- and accent-insensitive substring search.

    Args:
        type_op: Operation type label (a string).

    Returns:
        'DEBIT' if a debit keyword is found, else 'CREDIT' if a credit
        keyword is found, else 'AUTRE'.
    """
    return _premiere_categorie(_cle_recherche(type_op), _REGLES_SENS, 'AUTRE')


def categoriser_canal(type_op):
    """Derive the channel of an operation from its type label.

    Matching is a case- and accent-insensitive substring search.

    Args:
        type_op: Operation type label (a string).

    Returns:
        The label of the first matching channel rule, or 'Autre' when no
        rule matches.
    """
    return _premiere_categorie(_cle_recherche(type_op), _REGLES_CANAL, 'Autre')


def categoriser_nature(type_op):
    """Derive the business nature of an operation from its type label.

    Matching is a case- and accent-insensitive substring search.

    Args:
        type_op: Operation type label (a string).

    Returns:
        The label of the first matching nature rule, or 'Autres' when no
        rule matches.
    """
    return _premiere_categorie(_cle_recherche(type_op), _REGLES_NATURE, 'Autres')

# Derive the three descriptive columns from the operation type label.
for colonne, categoriser in (
    ('sens_operation', categoriser_operation),
    ('canal_operation', categoriser_canal),
    ('nature_operation', categoriser_nature),
):
    dim_type_operation[colonne] = dim_type_operation['type_operation'].apply(categoriser)

print(f"   ‚úÖ {len(dim_type_operation):,} types d'op√©ration cr√©√©s")

# ========================================
# 5. FACT TABLE (FACT_OPERATIONS)
# ========================================
print("\nüíº Cr√©ation de fact_operations...")

# Start from a copy of the cleaned operations.
fact_operations = operations_df.copy()

# Date key: the operation date rendered as YYYYMMDD.
fact_operations['id_date'] = fact_operations['date_operation'].dt.strftime('%Y%m%d')

# Bring in the surrogate key of the operation type (left join on the label).
cles_type_operation = dim_type_operation[['type_operation', 'id_type_operation']]
fact_operations = pd.merge(
    fact_operations,
    cles_type_operation,
    how='left',
    on='type_operation',
)

# Derived measures: absolute amount and a flag for amounts above 1000.
fact_operations['montant_absolu'] = fact_operations['montant_total'].abs()
fact_operations['est_gros_montant'] = fact_operations['montant_absolu'].gt(1000)

# Keep only the columns of the fact table, in their final order.
colonnes_faits = [
    'id_operation', 'id_compte', 'id_date', 'id_type_operation',
    'montant_total', 'montant_absolu', 'lieu_operation', 'montant_par_cheque',
    'nombre_cheques', 'sens_operation', 'solde_avant', 'solde_apres', 'est_gros_montant',
]
fact_operations = fact_operations[colonnes_faits]

print(f"   ‚úÖ {len(fact_operations):,} op√©rations dans la table de faits")

# ========================================
# INTEGRITY CHECKS
# ========================================
print("\nüîç V√©rifications d'int√©grit√© des cl√©s √©trang√®res...")

# Each check compares the distinct key values of two tables. The keys are
# de-duplicated with the vectorised Series.unique() before the Python sets
# are built, so the sets are created from the distinct values only instead
# of iterating every row of the fact table.

# id_compte: accounts referenced by the facts but absent from dim_comptes.
comptes_manquants = set(fact_operations['id_compte'].unique()) - set(dim_comptes['id_compte'].unique())
print(f"   - Comptes manquants dans dim_comptes: {len(comptes_manquants)}")

# id_client: clients referenced by dim_comptes but absent from dim_clients.
clients_manquants = set(dim_comptes['id_client'].unique()) - set(dim_clients['id_client'].unique())
print(f"   - Clients manquants dans dim_clients: {len(clients_manquants)}")

# id_date: operation dates that fall outside the calendar held in dim_date.
dates_manquantes = set(fact_operations['id_date'].unique()) - set(dim_date['id_date'].unique())
print(f"   - Dates manquantes dans dim_date: {len(dates_manquantes)}")

# id_type_operation: facts whose type label found no match in the left join.
ops_sans_type = fact_operations['id_type_operation'].isna().sum()
print(f"   - Op√©rations sans type: {ops_sans_type}")

# ========================================
# DESCRIPTIVE STATISTICS
# ========================================
print("\nüìä Statistiques descriptives...")

# Row count of every dimension.
print("\nDIMENSIONS:")
for nom_table, table in (
    ('dim_clients', dim_clients),
    ('dim_comptes', dim_comptes),
    ('dim_date', dim_date),
    ('dim_type_operation', dim_type_operation),
):
    print(f"   - {nom_table}: {len(table):,} lignes")

# Row count of the fact table.
print("\nFAITS:")
print(f"   - fact_operations: {len(fact_operations):,} lignes")

# Share of each client type, as a percentage of all rows of dim_clients.
print("\nR√âPARTITION PAR TYPE CLIENT:")
repartition_clients = dim_clients['type_client'].value_counts()
for type_client, count in repartition_clients.items():
    pourcentage = count / len(dim_clients) * 100
    print(f"   - {type_client}: {count:,} ({pourcentage:.1f}%)")

# ========================================
# FILE EXPORT
# ========================================
print("\nüíæ Export des fichiers CSV...")

# Output folder, stated once instead of being repeated for every file.
dossier_export = r"C:\Users\MSI\Desktop\Attijari_bank\stage"

# Tables to write, keyed by file stem: the four dimensions, then the facts.
tables_a_exporter = {
    'dim_clients': dim_clients,
    'dim_comptes': dim_comptes,
    'dim_date': dim_date,
    'dim_type_operation': dim_type_operation,
    'fact_operations': fact_operations,
}

try:
    for nom_table, table in tables_a_exporter.items():
        chemin_export = dossier_export + "\\" + nom_table + ".csv"
        table.to_csv(chemin_export, index=False, encoding='utf-8-sig')
except Exception as e:
    # The export stays best-effort (the error is reported, not raised), but
    # none of the success messages below are printed after a failure.
    print(f"‚ùå Erreur lors de l'export: {e}")
else:
    print("‚úÖ Tous les fichiers ont √©t√© export√©s avec succ√®s!")
    print("\nFichiers g√©n√©r√©s:")
    for nom_table in tables_a_exporter:
        print(f"   - {nom_table}.csv")

    print("\nüéâ Extraction termin√©e avec succ√®s!")
    print("üîß Le datawarehouse est pr√™t pour l'analyse et la cr√©ation de tableaux de bord.")

üìã Chargement des donn√©es sources...
üìã D√©tection des s√©parateurs et chargement...
   ‚Ü™ S√©parateur clients.csv d√©tect√© : ','
   ‚Ü™ S√©parateur comptes.csv d√©tect√© : ';'
   ‚Ü™ S√©parateur operations.csv d√©tect√© : ','
‚úÖ Donn√©es sources charg√©es avec succ√®s
   - Clients: 180,000 lignes
   - Comptes: 359,723 lignes
   - Op√©rations: 67,411,391 lignes

üóìÔ∏è Conversion des dates...

üë• Cr√©ation de dim_clients...
   ‚úÖ 180,000 clients uniques (0 doublons supprim√©s)

üè¶ Cr√©ation de dim_comptes...
   ‚úÖ 359,723 comptes uniques (0 doublons supprim√©s)

üìÖ Cr√©ation de dim_date (2022-2024)...
   ‚úÖ 1,096 dates cr√©√©es (du 01/01/2022 au 31/12/2024)

üí≥ Cr√©ation de dim_type_operation...
   ‚úÖ 33 types d'op√©ration cr√©√©s

üíº Cr√©ation de fact_operations...
   ‚úÖ 67,411,391 op√©rations dans la table de faits

üîç V√©rifications d'int√©grit√© des cl√©s √©trang√®res...
   - Comptes manquants dans dim_comptes: 0
   - Clients manquants dans dim_clients: 0


In [2]:
# Preview the first rows of the fact table.
fact_operations.head()

Unnamed: 0,id_operation,id_compte,id_date,id_type_operation,montant_total,montant_absolu,lieu_operation,montant_par_cheque,nombre_cheques,sens_operation,solde_avant,solde_apres,est_gros_montant
0,OP0000001,9070725833,20220110,9,195.03,195.03,Sidi Bouzid,195.03,1,CREDIT,8706.1,8901.13,False
1,OP0000002,9070725833,20220116,2,539.34,539.34,Sidi Bouzid,0.0,0,DEBIT,8901.13,8361.79,False
2,OP0000003,9070725833,20220124,9,876.76,876.76,Sidi Bouzid,438.38,2,CREDIT,8361.79,9238.55,False
3,OP0000004,9070725833,20220125,11,750.24,750.24,Sidi Bouzid,375.12,2,DEBIT,9238.55,8488.31,False
4,OP0000005,9070725833,20220209,12,470.8,470.8,Sidi Bouzid,94.16,5,CREDIT,8488.31,8959.11,False


In [5]:
dim_clients['id_client'].dtype  # object dtype ('O'): the ids were read with dtype=str


dtype('O')

In [7]:
# Client ids referenced by dim_comptes that have no row in dim_clients.
missing_clients = set(dim_comptes['id_client']) - set(dim_clients['id_client'])
print(list(missing_clients)[:10])  # Show at most ten of them (a set has no defined order)


[]
