In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [3]:
import pandas as pd
import random
from faker import Faker

# Shared Faker instance; the transaction and recommendation generators below
# call it for UUIDs (`fake.uuid4()`) and dates (`fake.date_between(...)`).
fake = Faker()

# Generate Loan Data
def generate_loan_data_with_faker(num_customers, seed=None):
    """Build a synthetic loan table with one row per customer.

    Despite the name, this generator only uses the standard ``random``
    module; no Faker provider is called here.

    Parameters
    ----------
    num_customers : int
        Number of rows to generate. ``0`` yields an empty frame that still
        has all eight columns.
    seed : int or None, optional
        When given, values are drawn from a private ``random.Random(seed)``
        so the same seed always produces the same frame and the global
        ``random`` state is left untouched. When ``None`` (default) the
        module-level ``random`` generator is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:

        * ``customer_id``      -- ``"C1"`` .. ``"C{num_customers}"``
        * ``age``              -- integer in [18, 70]
        * ``income``           -- integer in [15000, 200000]
        * ``credit_score``     -- integer in [300, 900]
        * ``loan_amount``      -- integer in [5000, 100000]
        * ``interest_rate``    -- uniform draw in [3.50, 6.25], rounded to 2 dp
        * ``loan_term``        -- one of 12, 24, 36, 48, 60
        * ``repayment_status`` -- 1 when ``credit_score < 700`` or
          ``income < 75000``, otherwise 0
    """
    # Fall back to the module itself (it exposes the same randint/uniform/
    # choice functions) so un-seeded calls keep drawing from the global state.
    rng = random.Random(seed) if seed is not None else random

    # Columns are drawn one full column at a time, in this order, so results
    # under a fixed global seed match the previous implementation.
    data = {
        "customer_id": [f"C{i+1}" for i in range(num_customers)],
        "age": [rng.randint(18, 70) for _ in range(num_customers)],
        "income": [rng.randint(15000, 200000) for _ in range(num_customers)],
        "credit_score": [rng.randint(300, 900) for _ in range(num_customers)],
        "loan_amount": [rng.randint(5000, 100000) for _ in range(num_customers)],
        "interest_rate": [round(rng.uniform(3.50, 6.25), 2) for _ in range(num_customers)],
        "loan_term": [rng.choice([12, 24, 36, 48, 60]) for _ in range(num_customers)],
    }
    df = pd.DataFrame(data)

    # Vectorised flag instead of a row-wise ``apply``: same rule, but it also
    # works on an empty frame (where ``apply(axis=1)`` returns a DataFrame and
    # the column assignment breaks) and avoids a Python call per row.
    # NOTE(review): the flag is set for low credit score / low income, which
    # presumably marks risky or defaulted loans -- confirm the intended meaning.
    flagged = (df["credit_score"] < 700) | (df["income"] < 75000)
    df["repayment_status"] = flagged.astype("int64")
    return df

# Generate Transaction Data
def generate_transaction_data_with_faker(num_transactions, num_customers):
    """Create a synthetic transaction table with ``num_transactions`` rows.

    Each row is assigned to a random customer id in ``C1`` ..
    ``C{num_customers}``. Amounts are uniform draws in [10, 5000] rounded to
    two decimals, the type is picked from a fixed list of five labels, and
    the id and date come from the module-level ``fake`` instance (a UUID4
    and a date within the last year up to today).

    Returns a ``pandas.DataFrame`` with columns ``customer_id``,
    ``transaction_id``, ``transaction_amount``, ``transaction_type`` and
    ``transaction_date``.
    """
    type_labels = ["deposit", "withdrawal", "Purchases", "Credit Transaction", "Cash Transaction"]
    rows = range(num_transactions)

    # Columns are drawn one at a time, in output order, so the sequence of
    # random draws matches the column layout.
    customer_ids = ["C" + str(random.randint(1, num_customers)) for _ in rows]
    transaction_ids = [fake.uuid4() for _ in rows]
    amounts = [round(random.uniform(10, 5000), 2) for _ in rows]
    types = [random.choice(type_labels) for _ in rows]
    dates = [fake.date_between(start_date="-1y", end_date="today") for _ in rows]

    return pd.DataFrame(
        {
            "customer_id": customer_ids,
            "transaction_id": transaction_ids,
            "transaction_amount": amounts,
            "transaction_type": types,
            "transaction_date": dates,
        }
    )



# Generate Recommendation Data
def generate_recommendation_data_with_faker(num_interactions, num_customers):
    """Create a synthetic customer/product interaction table.

    Every row gets a random customer id in ``C1`` .. ``C{num_customers}``,
    a UUID4 ``product_id`` and a date within the last year (both from the
    module-level ``fake`` instance), an interaction type chosen from
    ``viewed`` / ``clicked`` / ``purchased``, and an ``interaction_details``
    label picked at random from the option list belonging to that type.

    Returns a ``pandas.DataFrame`` with columns ``customer_id``,
    ``product_id``, ``interaction_type``, ``interaction_date`` and
    ``interaction_details``.
    """
    # Detail labels available for each interaction type. Dict insertion order
    # also defines the order of the interaction types themselves.
    options_by_type = {
        "viewed": [
            "Balance", "Transaction History", "Loan Balance", "EMI Schedule", "Fixed Deposit Details",
            "Portfolio", "Policy Details", "Bill Payment History", "Exchange Rates", "Spending Analysis",
        ],
        "clicked": [
            "Account Details", "Statement Download", "Apply for a Loan", "EMI Calculator", "Open Fixed Deposit",
            "Mutual Fund Details", "Renew Policy", "Pay Bills", "Transfer Funds", "Set Goals",
        ],
        "purchased": [
            "Savings Account", "Fixed Deposit", "Personal Loan", "Home Loan", "Credit Card",
            "Mutual Funds", "Life Insurance", "Health Insurance", "Forex Card", "Recurring Deposit",
        ],
    }
    interaction_types = list(options_by_type)
    rows = range(num_interactions)

    customer_ids = ["C" + str(random.randint(1, num_customers)) for _ in rows]
    # NOTE(review): a fresh UUID is drawn for every row, so product_id is
    # effectively unique per interaction -- confirm that is intended.
    product_ids = [fake.uuid4() for _ in rows]
    kinds = [random.choice(interaction_types) for _ in rows]
    dates = [fake.date_between(start_date="-1y", end_date="today") for _ in rows]

    # One detail label per row, drawn from the list matching that row's type.
    details = [random.choice(options_by_type[kind]) for kind in kinds]

    return pd.DataFrame(
        {
            "customer_id": customer_ids,
            "product_id": product_ids,
            "interaction_type": kinds,
            "interaction_date": dates,
            "interaction_details": details,
        }
    )


# Create datasets
SEED = 42  # fixed so Restart & Run All regenerates the same synthetic data
num_customers = 1000
num_transactions = 6000
num_interactions = 10000

# Seed both sources of randomness right before generation: the stdlib
# `random` module (ids, amounts, choices) and Faker's shared generator
# (UUIDs, dates). Dates are still relative to "today", so the date window
# moves with the day the notebook is run.
random.seed(SEED)
Faker.seed(SEED)

loan_data = generate_loan_data_with_faker(num_customers)
transaction_data = generate_transaction_data_with_faker(num_transactions, num_customers)
recommendation_data = generate_recommendation_data_with_faker(num_interactions, num_customers)


In [5]:
# Report (rows, columns) for each generated table, in generation order.
for frame in (loan_data, transaction_data, recommendation_data):
    print(frame.shape)

(1000, 8)
(6000, 5)
(10000, 5)


Loan Data

In [7]:
# Display the generated loan table; pandas elides the middle rows when rendering.
loan_data

Unnamed: 0,customer_id,age,income,credit_score,loan_amount,interest_rate,loan_term,repayment_status
0,C1,64,98519,713,76289,5.01,36,0
1,C2,55,93730,741,94477,6.00,60,0
2,C3,53,75185,714,59718,4.04,24,0
3,C4,25,138150,705,83274,5.63,48,0
4,C5,53,150328,664,95644,5.67,48,1
...,...,...,...,...,...,...,...,...
995,C996,47,58043,461,43639,3.80,48,1
996,C997,34,159429,404,45723,5.51,36,1
997,C998,24,32524,524,36557,4.56,24,1
998,C999,32,178555,768,52603,4.53,24,0


In [9]:
# Column dtypes and non-null counts for the loan table.
loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customer_id       1000 non-null   object 
 1   age               1000 non-null   int64  
 2   income            1000 non-null   int64  
 3   credit_score      1000 non-null   int64  
 4   loan_amount       1000 non-null   int64  
 5   interest_rate     1000 non-null   float64
 6   loan_term         1000 non-null   int64  
 7   repayment_status  1000 non-null   int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 62.6+ KB


In [11]:
# Summary statistics (count, mean, std, quartiles) for the numeric loan columns.
loan_data.describe()

Unnamed: 0,age,income,credit_score,loan_amount,interest_rate,loan_term,repayment_status
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,43.692,106980.339,602.085,51678.351,4.89809,36.972,0.762
std,15.607778,53226.062005,176.733213,28353.018994,0.797496,16.904348,0.426072
min,18.0,15127.0,300.0,5071.0,3.5,12.0,0.0
25%,30.0,60473.25,447.75,27282.5,4.2,24.0,1.0
50%,43.0,108973.5,596.5,49030.0,4.89,36.0,1.0
75%,58.0,151962.75,762.0,78385.75,5.5825,48.0,1.0
max,70.0,199850.0,900.0,99905.0,6.25,60.0,1.0
