In [6]:
# ! pip install faker

In [2]:
import pandas as pd
import numpy as np
import random
from faker import Faker
from datetime import datetime, timedelta

In [3]:
fake = Faker()
start_date = datetime.strptime("2005-01-01", "%Y-%m-%d")
end_date = datetime.strptime("2023-12-31", "%Y-%m-%d")

In [4]:
def dates_entre_deux(start_date, end_date):
  """
  Renvoie une liste de dates entre deux dates données.

  Args:
    start_date: La date de début de l'intervalle.
    end_date: La date de fin de l'intervalle.

  Returns:
    Une liste de dates datetime.
  """
  dates = []
  while start_date <= end_date:
    dates.append(start_date)
    start_date += timedelta(days=1)
  return dates

In [5]:
def generate_random_id(length=10):
    chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    used_ids = set()

    while True:
        id = ''.join(random.choice(chars) for _ in range(length))
        if id not in used_ids:
            used_ids.add(id)
            return id

In [6]:
def get_quarter(date):
  month = date.month
  return (month - 1) // 3 + 1


# Generation of customer datas

In [7]:
from random import randint, uniform
from faker.providers import BaseProvider

class CameroonPhoneProvider(BaseProvider):
    def phone_number(self):
        """Generates a fake Cameroon phone number in a random format."""
        formats = [
            "+237 65ABCDEFG",
            "+237 67ABCDEFG",
            "+237 68ABCDEFG",
            "+237 69ABCDEFG",
        ]
        format_str = formats[randint(0, len(formats) - 1)]
        phone_number = format_str.replace("A", str(randint(0, 9)))
        phone_number = phone_number.replace("B", str(randint(0, 9)))
        phone_number = phone_number.replace("C", str(randint(0, 9)))
        phone_number = phone_number.replace("D", str(randint(0, 9)))
        phone_number = phone_number.replace("E", str(randint(0, 9)))
        phone_number = phone_number.replace("F", str(randint(0, 9)))
        phone_number = phone_number.replace("G", str(randint(0, 9)))
        return phone_number

fake = Faker()
fake.add_provider(CameroonPhoneProvider)

cam_phone_number = fake.phone_number()
print(cam_phone_number)

+237 651107446


In [37]:
customers_list = []
for i in range(10000):
    first_name = fake.first_name()
    last_name = fake.last_name()
    domain = fake.domain_name()
    email = f"{first_name}.{last_name}@{domain}"
    phone_number = cam_phone_number
    registration_date = fake.date_between(start_date, end_date)
    date_of_birth = fake.date_of_birth(minimum_age=30, maximum_age=75)
    address = fake.address()
    city = fake.city()
    region = fake.state()

    date_dict = {
        'customer_id' :generate_random_id(), 
        'first_name': first_name, 
        'last_name': last_name,
        'email': email,
        'phone_number': phone_number,
        'registration_date': registration_date,
        'date_of_birth' : date_of_birth,
        'address': address,
        'city': city,
        'region' : region
        }
    customers_list.append(date_dict)
customer_df = pd.DataFrame(customers_list)
print(customer_df.shape)
customer_df.head()

(10000, 10)


Unnamed: 0,customer_id,first_name,last_name,email,phone_number,registration_date,date_of_birth,address,city,region
0,QJTJGbDHim,Jamie,Greene,Jamie.Greene@martinez.com,+237 651107446,2013-08-18,1985-04-04,"98881 Wagner Lodge Apt. 771\nNinahaven, VT 28747",New Elizabeth,Maine
1,UX9hc3qgGK,Christopher,Johnson,Christopher.Johnson@mccormick-evans.com,+237 651107446,2009-02-23,1980-08-16,"18767 Rogers Port\nJohnsonville, OH 47274",Ryanview,Texas
2,d9E7yLjEUN,Connie,Rose,Connie.Rose@gonzalez-baxter.com,+237 651107446,2009-01-05,1984-10-01,"9490 Cortez Parks Suite 827\nJessicastad, NV 8...",Russellshire,New Jersey
3,Rb44AZqdoL,Casey,Alvarez,Casey.Alvarez@rice.com,+237 651107446,2020-08-26,1979-08-06,Unit 3205 Box 2834\nDPO AE 20455,Smithstad,Arkansas
4,Lc0ff7cwUI,Joseph,Castillo,Joseph.Castillo@barber.info,+237 651107446,2008-02-09,1990-01-05,"961 Jennifer Underpass Suite 999\nSamuelbury, ...",Ayersstad,Indiana


In [39]:
customer_df.to_csv('datas/customers.csv')

# Generation of transactions datas

In [45]:
transaction_list=[]
customer_id_list = customer_df["customer_id"]

for i in range(1000000):
    transaction_id = generate_random_id()
    customer_id = random.choice(customer_id_list)
    transaction_type = random.choice(["retrait", "depôt", "transfert"])
    amount = uniform(100, 10000000)
    transaction_date = fake.date_between(start_date, end_date)
    location = fake.address()
    device_id = generate_random_id()
    status = random.choice(["échouée", "réussie"])
    is_fraud = random.choice([0,1])

    transaction_dict = {
        "transaction_id": transaction_id,
        "customer_id": customer_id,
        "transaction_type": transaction_type,
        "amount": amount,
        "transaction_date": transaction_date,
        "location": location,
        "device_id": device_id,
        "status": status,
        "is_fraud": is_fraud
    }

    transaction_list.append(transaction_dict)

transaction_df = pd.DataFrame(transaction_list)
print(transaction_df.shape)
transaction_df.head()

(1000000, 9)


Unnamed: 0,transaction_id,customer_id,transaction_type,amount,transaction_date,location,device_id,status,is_fraud
0,oxav9bIQh4,nTXcGeJqAU,depôt,7744813.0,2022-01-30,"184 Howard Grove Apt. 024\nWest Adrianview, IN...",1ujyu4xnjO,échouée,1
1,jAT128rSJg,f987txOSOb,transfert,1107648.0,2015-04-20,"09735 Lucas Highway Suite 398\nNorth David, MP...",kiKLEpSNqY,réussie,0
2,8mYo7VZkH9,izKip3Gtxj,retrait,3610985.0,2011-08-22,"549 Lang Squares Suite 656\nNew Jennifer, CA 9...",5jUyw2OfDE,échouée,0
3,VVI7rSeQfr,JJu48ugNT1,transfert,1183076.0,2014-04-01,"51639 Miller Plains\nWest Sara, ID 68248",Zr4wC46rq5,échouée,0
4,Bx7ajoEIPM,Ycxf7FDuQ4,depôt,9359868.0,2020-07-29,"2351 James Shore\nNorth Michelleport, WI 17542",U9Sxf3cxRn,échouée,1


In [51]:
transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   transaction_id    1000000 non-null  object 
 1   customer_id       1000000 non-null  object 
 2   transaction_type  1000000 non-null  object 
 3   amount            1000000 non-null  float64
 4   transaction_date  1000000 non-null  object 
 5   location          1000000 non-null  object 
 6   device_id         1000000 non-null  object 
 7   status            1000000 non-null  object 
 8   is_fraud          1000000 non-null  int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 68.7+ MB


In [46]:
transaction_df.to_csv('datas/transactions.csv')

# Generate transaction history

In [47]:
transaction_history_list = []

for i, transaction in transaction_df.iterrows():
    history_id = generate_random_id()
    customer_id = transaction.customer_id
    transaction_id = transaction.transaction_id
    previous_balance = transaction.amount + uniform(0, 10000000)
    new_balance = previous_balance - transaction.amount
    transaction_date = transaction.transaction_date

    transaction_history_dict = {
        "history_id" : history_id,
        "customer_id" : customer_id,
        "transaction_id" : transaction_id,
        "previous_balance" : previous_balance,
        "new_balance" : new_balance,
        "transaction_date": transaction_date
    }
    transaction_history_list.append(transaction_history_dict)

transaction_history_df = pd.DataFrame(transaction_history_list)
print(transaction_history_df.shape)
transaction_history_df.head()

(1000000, 6)


Unnamed: 0,history_id,customer_id,transaction_id,previous_balance,new_balance,transaction_date
0,TojJQiFoTe,nTXcGeJqAU,oxav9bIQh4,14610270.0,6865461.0,2022-01-30
1,ZAYIf3hsJM,f987txOSOb,jAT128rSJg,2788819.0,1681171.0,2015-04-20
2,6iIk8wSjeS,izKip3Gtxj,8mYo7VZkH9,9837023.0,6226037.0,2011-08-22
3,Kc1VayUUhb,JJu48ugNT1,VVI7rSeQfr,7938878.0,6755802.0,2014-04-01
4,a5CdS2J5I2,Ycxf7FDuQ4,Bx7ajoEIPM,10247950.0,888080.9,2020-07-29


In [48]:
transaction_history_df.to_csv('datas/transaction_history.csv')

In [50]:
transaction_history_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   history_id        1000000 non-null  object 
 1   customer_id       1000000 non-null  object 
 2   transaction_id    1000000 non-null  object 
 3   previous_balance  1000000 non-null  float64
 4   new_balance       1000000 non-null  float64
 5   transaction_date  1000000 non-null  object 
dtypes: float64(2), object(4)
memory usage: 45.8+ MB
