In [2]:
#import libraries
import pandas as pd
import random #use to generate data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [17]:
# --- Constants and Configuration ---
CROP_VALUES = {
    'Soja': 3000, 'Milho': 2500, 'Arroz': 2000,
    'Feijão': 3500, 'Trigo': 1800, 'Aveia': 1500
}
JUNE_CROPS = ['Soja', 'Milho', 'Arroz', 'Feijão']
JANUARY_CROPS = ['Trigo', 'Aveia']
ALL_CROPS = JUNE_CROPS + JANUARY_CROPS
LOAN_TERMS = [90, 180, 360, 720]
MAX_LOANS_PER_YEAR = 3
MIN_LOAN_INTERVAL_DAYS = 90 # Minimum days between loans for the same client

# --- Helper Functions ---

def _calculate_loan_amount(hectar, culture):
    """Calculates loan amount based on hectares and crop type."""
    return round(hectar * CROP_VALUES.get(culture, 0), 2)

def _determine_rating(score):
    """Determines credit rating based on FICO score."""
    if score >= 750:
        return 5  # AAA
    elif score >= 700:
        return 4  # AA
    elif score >= 650:
        return 3  # A
    elif score >= 600:
        return 2  # B
    elif score >= 500:
        return 1  # C
    else:
        return 0  # D (High Risk)

def _simulate_score_change(current_score, had_previous_default):
    """Simulates score change based on previous default status."""
    if had_previous_default:
        # Score drops more significantly after a default
        score_change = random.randint(-120, -10) # Can drop by up to 120 points
    else:
        # Normal fluctuation, slight chance of increase/decrease
        score_change = random.randint(-20, 40) # Can increase slightly more than decrease

    new_score = max(300, min(850, current_score + score_change))
    return new_score

def _determine_default_status(score):
    """Determines default status based on score with some randomness."""
    default_probability = 0
    if score < 500:
        default_probability = 0.75
    elif score < 600:
        default_probability = 0.45
    elif score < 650:
        default_probability = 0.15
    else:
        default_probability = 0.03 # Very low chance for high scores

    return 1 if random.random() < default_probability else 0

# --- Main Data Generation Functions ---

def generate_first_transactions(num_unique_clients: int):
    """
    Generates the first transaction for a specified number of unique clients.
    Returns a list of dictionaries and a dictionary of client states.
    """
    first_transactions_data = []
    client_states = {} # To track score, last_default_status, last_loan_date, loans_this_year

    for i in range(num_unique_clients):
        client_id = f"CLIENT_{i:04d}"
        initial_score = random.randint(550, 800) # Initial random score for a new client

        # Randomize initial loan date over a few years
        loan_date = datetime(2020 + random.randint(0, 3), random.choice([1, 6]), random.randint(1, 28))

        income = round(random.uniform(5000, 500000), 2)
        culture = random.choice(ALL_CROPS)
        hectar = round(random.uniform(10, 500), 2)
        loan_amount = _calculate_loan_amount(hectar, culture)
        loan_amount_term = random.choice(LOAN_TERMS)
        rating = _determine_rating(initial_score)
        total_debt = round(random.uniform(loan_amount, min(loan_amount * 2, income * 3)), 2)
        default_status = _determine_default_status(initial_score)

        first_transactions_data.append({
            'id_client': client_id,
            'loan_date': loan_date.strftime('%Y-%m-%d'), # Store as string for CSV/DataFrame
            'income': income,
            'culture': culture,
            'hectar': hectar,
            'loan_amount': loan_amount,
            'loan_amount_term': loan_amount_term,
            'rating': rating,
            'score': initial_score,
            'total_debt': total_debt,
            'default_status': default_status,
            'transaction_type': 'first'
        })

        # Initialize client state
        client_states[client_id] = {
            'score': initial_score,
            'last_default_status': bool(default_status), # Convert 0/1 to False/True
            'last_loan_date': loan_date,
            'loans_this_year': {loan_date.year: 1} # Track loans per year
        }
    return first_transactions_data, client_states

def generate_historic_transactions(
    first_transactions_data: list,
    client_states: dict,
    num_total_rows: int
):
    """
    Generates additional historic transactions based on first transactions,
    applying score reduction logic and loan frequency limits.
    """
    all_transactions_data = list(first_transactions_data) # Start with first transactions
    
    # Create a list of client IDs that can have more transactions
    eligible_clients = list(client_states.keys())

    # Generate remaining rows
    for _ in range(num_total_rows - len(first_transactions_data)):
        if not eligible_clients:
            # If all clients have reached their max loans or dates are too far, break
            print("Warning: Not enough eligible clients to reach desired row count.")
            break

        client_id = random.choice(eligible_clients)
        client_state = client_states[client_id]

        # Determine next loan date, respecting 3 loans/year and minimum interval
        current_year = client_state['last_loan_date'].year
        current_month = client_state['last_loan_date'].month
        
        # Increment loan date by at least MIN_LOAN_INTERVAL_DAYS
        proposed_loan_date = client_state['last_loan_date'] + timedelta(days=random.randint(MIN_LOAN_INTERVAL_DAYS, 180)) # Random interval up to 6 months

        # Check if proposed date is in a new year or if current year loans exceed limit
        loans_in_current_year = client_state['loans_this_year'].get(proposed_loan_date.year, 0)

        if proposed_loan_date.year == current_year and loans_in_current_year >= MAX_LOANS_PER_YEAR:
            # If max loans for current year, move to next year
            proposed_loan_date = datetime(current_year + 1, random.choice([1, 6]), random.randint(1, 28))
            client_state['loans_this_year'][proposed_loan_date.year] = 1 # Reset count for new year
        elif proposed_loan_date.year > current_year:
            # New year, reset count
            client_state['loans_this_year'][proposed_loan_date.year] = 1
        else:
            # Same year, increment count
            client_state['loans_this_year'][proposed_loan_date.year] = loans_in_current_year + 1

        loan_date = proposed_loan_date

        # Simulate score based on previous default status
        current_score = client_state['score']
        new_score = _simulate_score_change(current_score, client_state['last_default_status'])
        
        # Update client's score for future transactions
        client_state['score'] = new_score

        # Generate other transaction details
        income = round(random.uniform(5000, 500000), 2) # Income can fluctuate
        culture = random.choice(ALL_CROPS)
        hectar = round(random.uniform(10, 500), 2)
        loan_amount = _calculate_loan_amount(hectar, culture)
        loan_amount_term = random.choice(LOAN_TERMS)
        rating = _determine_rating(new_score)
        total_debt = round(random.uniform(loan_amount, min(loan_amount * 2, income * 3)), 2)
        default_status = _determine_default_status(new_score)

        # Update client's last default status and loan date
        client_state['last_default_status'] = bool(default_status)
        client_state['last_loan_date'] = loan_date

        all_transactions_data.append({
            'id_client': client_id,
            'loan_date': loan_date.strftime('%Y-%m-%d'),
            'income': income,
            'culture': culture,
            'hectar': hectar,
            'loan_amount': loan_amount,
            'loan_amount_term': loan_amount_term,
            'rating': rating,
            'score': new_score,
            'total_debt': total_debt,
            'default_status': default_status,
            'transaction_type': 'historic'
        })
        
        # If a client has reached their max loans for the latest year, they might become less eligible
        # This simple check removes them from eligible_clients for a bit,
        # but the date advancement logic above already handles it sufficiently.
        # For a truly complex simulation, you might need a more sophisticated client selection.
        # For now, the date logic ensures compliance.

    return all_transactions_data

# --- Execution ---

def main(num_total_rows: int = 20000, num_unique_clients: int = 1000):
    """
    Main function to orchestrate data generation and save to CSVs.
    """
    print(f"Generating {num_unique_clients} first transactions...")
    first_transactions_list, client_states = generate_first_transactions(num_unique_clients)
    df_first_transactions = pd.DataFrame(first_transactions_list)

    print(f"Generating {num_total_rows - num_unique_clients} historic transactions...")
    all_transactions_list = generate_historic_transactions(
        first_transactions_list,
        client_states,
        num_total_rows
    )
    df_historic_transactions = pd.DataFrame(all_transactions_list)

    # Sort historic transactions by client and date to ensure chronological order
    df_historic_transactions['loan_date'] = pd.to_datetime(df_historic_transactions['loan_date'])
    df_historic_transactions = df_historic_transactions.sort_values(by=['id_client', 'loan_date']).reset_index(drop=True)

    # Save to CSV files
    # df_first_transactions.to_csv('first_transactions.csv', index=False)
    # df_historic_transactions.to_csv('historic_transactions.csv', index=False)

    print("\nData generation complete!")
    print(f"'{len(df_first_transactions)}' rows saved to 'first_transactions.csv'")
    print(f"'{len(df_historic_transactions)}' rows saved to 'historic_transactions.csv'")

    # Display some info
    print("\n--- First Transactions Sample ---")
    print(df_first_transactions.head())
    print("\n--- Historic Transactions Sample ---")
    print(df_historic_transactions.head())
    print("\n--- Historic Transactions Info ---")
    print(df_historic_transactions.info())
    print("\n--- Client Loan Counts in Historic Transactions ---")
    print(df_historic_transactions.groupby('id_client').size().describe())
    print("\n--- Score Distribution by Default Status (Historic) ---")
    print(df_historic_transactions.groupby('default_status')['score'].describe())

    return df_first_transactions, df_historic_transactions


In [18]:
# Generate 20,000 rows
# You can adjust the total number of rows and unique clients here
df_first, df_historic = main(num_total_rows=20000, num_unique_clients=8000)


Generating 8000 first transactions...
Generating 12000 historic transactions...

Data generation complete!
'8000' rows saved to 'first_transactions.csv'
'20000' rows saved to 'historic_transactions.csv'

--- First Transactions Sample ---
     id_client   loan_date     income culture  hectar  loan_amount  \
0  CLIENT_0000  2022-06-23  481673.33    Soja  273.42     820260.0   
1  CLIENT_0001  2021-06-22   78534.28   Arroz  457.94     915880.0   
2  CLIENT_0002  2021-01-09   88055.23   Trigo  158.56     285408.0   
3  CLIENT_0003  2022-01-12  107281.64   Arroz  249.49     498980.0   
4  CLIENT_0004  2023-06-16  304857.51   Aveia  104.85     157275.0   

   loan_amount_term  rating  score  total_debt  default_status  \
0               180       4    749  1161296.80               0   
1               720       1    596   349887.35               0   
2                90       1    596   268143.79               0   
3                90       1    596   475949.30               1   
4          

In [19]:
# df_credit.describe()
# df_credit.head()
df_historic.head()

Unnamed: 0,id_client,loan_date,income,culture,hectar,loan_amount,loan_amount_term,rating,score,total_debt,default_status,transaction_type
0,CLIENT_0000,2022-06-23,481673.33,Soja,273.42,820260.0,180,4,749,1161296.8,0,first
1,CLIENT_0000,2022-11-29,138412.89,Arroz,266.56,533120.0,720,4,735,530900.78,0,historic
2,CLIENT_0000,2023-03-08,203074.32,Milho,217.22,543050.0,360,4,716,590322.26,0,historic
3,CLIENT_0001,2021-06-22,78534.28,Arroz,457.94,915880.0,720,1,596,349887.35,0,first
4,CLIENT_0001,2021-12-11,398756.47,Feijão,324.67,1136345.0,720,1,580,1171832.56,1,historic


In [17]:
# df_credit_clear = df_credit.query("score != null or endividamento_curto != '' or endividamento_medio != '' or endividamento_longo != ''")
# clear dataset to remove null data
df_credit_clear = df_credit.dropna()

df_credit_clear.count()

id_client           20000
income              20000
culture             20000
hectar              20000
loan_amount         20000
loan_amount_term    20000
rating              20000
score               20000
total_debt          20000
default_status      20000
dtype: int64

In [9]:
df_credit_clear.sort_values(by="rating", ascending=True).head()
# df_credit_clear['amount'].describe()
df_credit_clear.columns

Index(['id_client', 'culture', 'hectar', 'parcelas', 'rating', 'score',
       'endividamento_curto', 'endividamento_medio', 'endividamento_longo',
       'amount', 'loan_date', 'inadimplencia'],
      dtype='object')

In [10]:
#Split dataset into features and target variable
dt_x = df_credit_clear.iloc[:,2:-1]
dt_y = df_credit_clear.iloc[:,-1]

In [11]:
#Split the dataset into training and testing sets
dt_x_train, dt_x_test, dt_y_train, dt_y_test = train_test_split(dt_x, dt_y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
# Fit the scaler on the training data
dt_x_train_scaled = pd.DataFrame( scaler.fit_transform(dt_x_train), columns=dt_x_train.columns) 
dt_x_test_scaled = pd.DataFrame( scaler.transform(dt_x_test), columns=dt_x_test.columns)

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [18]:

pca = PCA(n_components=8, random_state=8)
pca.fit(dt_x_train_scaled)
pca.explained_variance_ratio_

array([0.22814187, 0.12983052, 0.12828896, 0.12580436, 0.12264362,
       0.12213985, 0.1211973 , 0.02195353])

In [19]:
pca = PCA(n_components=3, random_state=10)
pca.fit(dt_x_train_scaled)

dt_x_train_pca = pca.transform(dt_x_train_scaled)
dt_x_test_pca = pca.transform(dt_x_test_scaled)

In [20]:
print(dt_x_train_pca.shape)
dt_x_test_pca.shape


(12372, 3)


(3094, 3)

In [21]:
max_depths_list = [5, None]
min_samples_split_list = [2,10]

parameter_grid = {
    'max_depth': max_depths_list,
    'min_samples_split': min_samples_split_list
}


estimator = DecisionTreeClassifier(random_state=10)
grid = GridSearchCV(estimator, param_grid=parameter_grid, scoring='accuracy' , cv=5)
grid.fit(dt_x_train_pca, dt_y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=10)
,param_grid,"{'max_depth': [5, None], 'min_samples_split': [2, 10]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [22]:
estimator = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=10)

essemble = BaggingClassifier(estimator=estimator, n_estimators=100, random_state=10)
essemble.fit(dt_x_train_pca, dt_y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=10)
,n_estimators,100
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,10

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
predictions = essemble.predict(dt_x_test_pca)
print(accuracy_score(dt_y_test, predictions))

0.8729799612152553
