In [11]:
import random
from datetime import date
import pandas as pd

# === Param√®tres ===
n = 180000
pourcentage_pro = 0.2
pourcentage_particulier = 1 - pourcentage_pro  # nombre de clients

emplois_possibles = [
    {"emploi": "Ingenieur", "salaire_range": (2500, 7000)},
    {"emploi": "Enseignant", "salaire_range": (1200, 2500)},
    {"emploi": "Comptable", "salaire_range": (1800, 3000)},
    {"emploi": "Etudiant", "salaire_range": (300, 1000)},
    {"emploi": "Directeur", "salaire_range": (5000, 10000)},
    {"emploi": "Medecin", "salaire_range": (3000, 9000)},
    {"emploi": "Technicien", "salaire_range": (1000, 2500)},
    {"emploi": "Developpeur", "salaire_range": (2000, 6000)},
    {"emploi": "Vendeur", "salaire_range": (800, 1500)},
    {"emploi": "Agriculture", "salaire_range": (2000, 5000)},
    {"emploi": "Artisanat", "salaire_range": (2000, 5000)},
    {"emploi": "Serveur", "salaire_range": (600, 1200)},
    {"emploi": "Chauffeur", "salaire_range": (900, 1600)},
    {"emploi": "Consultant", "salaire_range": (3000, 8000)},
    {"emploi": "Journaliste", "salaire_range": (1500, 3500)},
    {"emploi": "Pharmacien", "salaire_range": (2800, 7000)},
    {"emploi": "Architecte", "salaire_range": (2200, 6000)},
    {"emploi": "Infirmier", "salaire_range": (1200, 2800)},
    {"emploi": "Agent de securite", "salaire_range": (700, 1500)},
    {"emploi": "Secretaire", "salaire_range": (900, 2000)},
    {"emploi": "Technico-commercial", "salaire_range": (1800, 4000)},
    {"emploi": "Analyste financier", "salaire_range": (3000, 7000)},
    {"emploi": "Chef de projet", "salaire_range": (3500, 8000)},
    {"emploi": "Ouvrier specialise", "salaire_range": (900, 1800)},
    {"emploi": "Psychologue", "salaire_range": (2500, 6000)},
    {"emploi": "Chomeur", "salaire_range": (0, 0)},
    {"emploi": "Industrie", "salaire_range": (4000, 9000)},
    {"emploi": "Expert-comptable", "salaire_range": (6000, 12000)},
    {"emploi": "Avocat d'affaires", "salaire_range": (8000, 15000)},
    {"emploi": "Chef d'entreprise", "salaire_range": (8000, 15000)},
    {"emploi": "Trader", "salaire_range": (1000, 20000)},
    {"emploi": "PDG", "salaire_range": (10000, 25000)},
    {"emploi": "Consultant international", "salaire_range": (12000, 30000)},
]

secteurs_pro = [
    "Chef d'entreprise", "Trader", "Consultant international", "Expert-comptable", "Avocat d'affaires", "PDG",
    "Chef de projet", "Industrie"
]

villes = [
    "Tunis", "La Manouba", "Zaghouan", "La Marsa", "Ben Arous", "El Mourouj", "Ariana",
    "Sfax", "Hammamet", "Nabeul", "Sousse", "Monastir", "Bizerte", "Beja", "Le Kef", "Gabes",
    "Kairouan", "Kasserine", "Ben Gardane", "Zarzis", "Sidi Bouzid", "Jendouba", "Tozeur", "Kebili", "Mednine", "Djerba"
]

sexes = ["M", "F"]

niveau_education_par_emploi = {
    "Ingenieur": ["Master", "Doctorat"],
    "Enseignant": ["Licence", "Master", "Doctorat"],
    "Comptable": ["Licence", "Master"],
    "Etudiant": ["Baccalaureat", "Licence"],
    "Directeur": ["Master", "Doctorat"],
    "Medecin": ["Doctorat"],
    "Technicien": ["BTS", "Licence"],
    "Developpeur": ["Licence", "Master"],
    "Vendeur": ["Baccalaureat"],
    "Serveur": ["Baccalaureat"],
    "Chauffeur": ["Baccalaureat"],
    "Consultant": ["Master", "Doctorat"],
    "Journaliste": ["Licence", "Master"],
    "Pharmacien": ["Doctorat"],
    "Architecte": ["Master", "Doctorat"],
    "Infirmier": ["Licence", "Master"],
    "Agent de securite": ["Baccalaureat"],
    "Secretaire": ["Baccalaureat"],
    "Technico-commercial": ["Licence", "Master"],
    "Analyste financier": ["Master", "Doctorat"],
    "Chef de projet": ["Master", "Doctorat"],
    "Ouvrier specialise": ["Baccalaureat", "BTS"],
    "Psychologue": ["Master", "Doctorat"],
    "Chomeur": ["Sans diplome"],
    "Agriculture": ["Sans diplome"],
    "Artisanat": ["Sans diplome"],
    "Industrie": ["Licence", "Master"],
    "Expert-comptable": ["Master", "Doctorat"],
    "Avocat d'affaires": ["Doctorat"],
    "Chef d'entreprise": ["Master", "Doctorat"],
    "Trader": ["Master", "Doctorat"],
    "PDG": ["Doctorat"],
    "Consultant international": ["Doctorat", "Master"]
}

def generer_date_naissance(age):
    today = date.today()
    annee = today.year - age
    mois = random.randint(1, 12)
    jour = random.randint(1, 28)  # simplification pour √©viter les dates invalides
    return f"{annee}-{mois:02d}-{jour:02d}"

def salaire_avec_age(age, salaire_min, salaire_max):
    age_normalise = (age - 18) / (65 - 18)
    salaire = salaire_min + age_normalise * (salaire_max - salaire_min)
    variation = salaire * random.uniform(-0.05, 0.05)
    return round(max(salaire_min, min(salaire_max, salaire + variation)))

def generate_client_id():
    return f"{random.randint(1000000, 9999999)}"

def generate_all_client_ids(n):
    ids = set()
    while len(ids) < n:
        ids.add(generate_client_id())
    return list(ids)

def choisir_niveau_education(emploi, age):
    niveaux_possibles = niveau_education_par_emploi.get(emploi, ["Baccalaureat", "Licence", "Master", "Doctorat"])
    niveaux_valides = []
    for niveau in niveaux_possibles:
        if niveau == "Baccalaureat" and age >= 18:
            niveaux_valides.append(niveau)
        elif niveau == "BTS" and 18 <= age <= 22:
            niveaux_valides.append(niveau)
        elif niveau == "Licence" and 18 <= age <= 24:
            niveaux_valides.append(niveau)
        elif niveau == "Master" and 22 <= age <= 28:
            niveaux_valides.append(niveau)
        elif niveau == "Doctorat" and age >= 26:
            niveaux_valides.append(niveau)
        elif niveau == "Sans diplome":
            niveaux_valides.append(niveau)
    if not niveaux_valides:
        niveaux_valides = niveaux_possibles
    return random.choice(niveaux_valides)

def categoriser_client(salaire, emploi):
    if emploi == "Etudiant":
        return "Etudiant"
    if salaire == 0:
        return "Sans revenu"
    if salaire < 1500:
        return "Particulier"
    if salaire < 10000:
        return "Professionnel"
    if salaire < 20000:
        return "Entreprise"
    return "VIP"

def generer_clients(n):
    ids = generate_all_client_ids(n)
    clients = []
    for i in range(n):
        sexe = random.choice(sexes)
        emploi_data = random.choice(emplois_possibles)
        emploi = emploi_data["emploi"]
        salaire_min, salaire_max = emploi_data["salaire_range"]
        age = random.randint(18, 65)
        salaire = salaire_avec_age(age, salaire_min, salaire_max)
        type_client = categoriser_client(salaire, emploi)
        niveau = choisir_niveau_education(emploi, age)

        if emploi == "Etudiant" or (18 <= age <= 22):
            statut_marital = random.choices(
                ["Celibataire", "Marie", "Divorce", "Veuf"],
                weights=[80, 10, 5, 5],
                k=1
            )[0]
        else:
            if sexe == "M":
                statut_possibles = ["Marie"] * 70 + ["Celibataire"] * 15 + ["Divorce"] * 10 + ["Veuf"] * 5
            else:
                statut_possibles = ["Marie"] * 60 + ["Celibataire"] * 20 + ["Divorce"] * 15 + ["Veuf"] * 5
            statut_marital = random.choice(statut_possibles)

        client = {
            "id_client": ids[i],
            "type_client": type_client,
            "sexe": sexe,
            "age": age,
            "date_naissance": generer_date_naissance(age),
            "salaire": salaire,
            "emploi": emploi,
            "statut_marital": statut_marital,
            "niveau_education": niveau,
            "ville": random.choice(villes)
        }

        clients.append(client)
        if (i + 1) % 10000 == 0:
            print(f"{i + 1} clients g√©n√©r√©s...")

    return pd.DataFrame(clients)

# === Ex√©cution ===
print("‚è≥ G√©n√©ration des clients en cours...")
df_clients = generer_clients(n)

# Afficher les 5 premi√®res lignes
print("\nüìã Aper√ßu des donn√©es g√©n√©r√©es :")
print(df_clients.head())

# === Export CSV ===
chemin_fichier = r"C:\Users\MSI\Desktop\Attijari_bank\stage\clients.csv"
df_clients.to_csv(chemin_fichier, index=False)
print(f"‚úÖ Fichier CSV g√©n√©r√© avec {n} clients ici : {chemin_fichier}")


‚è≥ G√©n√©ration des clients en cours...
10000 clients g√©n√©r√©s...
20000 clients g√©n√©r√©s...
30000 clients g√©n√©r√©s...
40000 clients g√©n√©r√©s...
50000 clients g√©n√©r√©s...
60000 clients g√©n√©r√©s...
70000 clients g√©n√©r√©s...
80000 clients g√©n√©r√©s...
90000 clients g√©n√©r√©s...
100000 clients g√©n√©r√©s...
110000 clients g√©n√©r√©s...
120000 clients g√©n√©r√©s...
130000 clients g√©n√©r√©s...
140000 clients g√©n√©r√©s...
150000 clients g√©n√©r√©s...
160000 clients g√©n√©r√©s...
170000 clients g√©n√©r√©s...
180000 clients g√©n√©r√©s...

üìã Aper√ßu des donn√©es g√©n√©r√©es :
  id_client    type_client sexe  age date_naissance  salaire  \
0   1702993  Professionnel    M   60     1965-05-10     7264   
1   4762995  Professionnel    M   20     2005-12-24     8190   
2   4234274    Particulier    M   19     2006-03-10      704   
3   4892162  Professionnel    F   29     1996-01-11     5201   
4   4747424    Particulier    M   21     2004-12-07      949   

               emplo

In [13]:
liste_ids_clients = df_clients['id_client'].astype(str).tolist()

In [15]:
# --- Affichage de contr√¥le ---
print(f"‚úÖ {len(liste_ids_clients)} ID clients extraits.")
print(liste_ids_clients[:10])  # Affich

‚úÖ 180000 ID clients extraits.
['1702993', '4762995', '4234274', '4892162', '4747424', '9049237', '9393394', '5467470', '7793844', '6014713']


In [19]:
df_clients.head()

Unnamed: 0,id_client,type_client,sexe,age,date_naissance,salaire,emploi,statut_marital,niveau_education,ville
0,1702993,Professionnel,M,60,1965-05-10,7264,Consultant,Divorce,Doctorat,Sidi Bouzid
1,4762995,Professionnel,M,20,2005-12-24,8190,Chef d'entreprise,Celibataire,Doctorat,Mednine
2,4234274,Particulier,M,19,2006-03-10,704,Agent de securite,Celibataire,Baccalaureat,Sidi Bouzid
3,4892162,Professionnel,F,29,1996-01-11,5201,Trader,Marie,Doctorat,La Marsa
4,4747424,Particulier,M,21,2004-12-07,949,Ouvrier specialise,Celibataire,Baccalaureat,La Manouba
