In [5]:
#import libraries
import pandas as pd
import random #use to generate data
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

In [14]:
# --- Constants and Configuration ---
CROP_VALUES = {
    'Soja': 3000, 'Milho': 2500, 'Arroz': 2000,
    'Feijão': 3500, 'Trigo': 1800, 'Aveia': 1500
}
JUNE_CROPS = ['Soja', 'Milho', 'Arroz', 'Feijão']
JANUARY_CROPS = ['Trigo', 'Aveia']
ALL_CROPS = JUNE_CROPS + JANUARY_CROPS
LOAN_TERMS = [90, 180, 360] # Updated loan terms as per new requirements
MAX_LOANS_PER_YEAR = 3
MIN_LOAN_INTERVAL_DAYS = 30 # Minimum days between loans for the same client

# Hectar Constants
MIN_TOTAL_HECTAR_CLIENT = 500  # Minimum total hectares a client might own
MAX_TOTAL_HECTAR_CLIENT = 5000 # Maximum total hectares a client might own
# Percentage of client's total_hectares_client that can be taken for a SINGLE loan
MIN_LOAN_HECTAR_PORTION = 0.05
MAX_LOAN_HECTAR_PORTION = 0.50

# Maximum attempts to find an eligible client for a new loan in historic generation
MAX_ATTEMPTS_TO_FIND_CLIENT = 1000

# --- Helper Functions ---

def _calculate_loan_amount(hectar, culture):
    """Calculates loan amount based on hectares and crop type."""
    return round(hectar * CROP_VALUES.get(culture, 0), 2)

def _determine_rating(score):
    """Determines credit rating based on FICO score."""
    if score >= 750:
        return 5  # AAA
    elif score >= 700:
        return 4  # AA
    elif score >= 650:
        return 3  # A
    elif score >= 600:
        return 2  # B
    elif score >= 500:
        return 1  # C
    else:
        return 0  # D (High Risk)

def _simulate_score_change(current_score, had_previous_default):
    """Simulates score change for the NEXT loan based on previous default status."""
    if had_previous_default:
        # Score drops more significantly after a default
        score_change = random.randint(-120, -10) # Can drop by up to 120 points
    else:
        # Normal fluctuation, slight chance of increase/decrease
        score_change = random.randint(-20, 40) # Can increase slightly more than decrease

    new_score = max(300, min(850, current_score + score_change))
    return new_score

def _simulate_payment_delay_days(score, had_previous_default):
    """
    Simulates the number of days payment is delayed (can be negative for early payment).
    Lower score and previous default increase chance of delay.
    """
    delay_prob = 0
    if score < 500:
        delay_prob = 0.75 # High chance of significant delay
    elif score < 600:
        delay_prob = 0.45 # Moderate chance
    elif score < 650:
        delay_prob = 0.15 # Low chance
    else:
        delay_prob = 0.03 # Very low chance

    if had_previous_default:
        delay_prob = min(1.0, delay_prob * 1.5) # Increase delay probability if previous default

    if random.random() < delay_prob:
        # Client delays payment
        if score < 500:
            return random.randint(15, 90) # Significant delay
        elif score < 650:
            return random.randint(5, 45) # Moderate delay
        else:
            return random.randint(1, 10) # Small delay
    else:
        # Client pays on time or early
        return random.randint(-10, 3) # Up to 10 days early, or 3 days late (still considered on time for this logic)

# --- Main Data Generation Functions ---

def generate_first_transactions(num_unique_clients: int):
    """
    Generates the first transaction for a specified number of unique clients.
    Returns a list of dictionaries and a dictionary of client states.
    """
    first_transactions_data = []
    # client_states now tracks hectares_used per season and their due dates
    client_states = {} 

    for i in range(num_unique_clients):
        client_id = f"CLIENT_{i:04d}"
        initial_score = random.randint(550, 800)
        total_hectares_client = round(random.uniform(MIN_TOTAL_HECTAR_CLIENT, MAX_TOTAL_HECTAR_CLIENT), 2)

        # Randomize initial loan date over a few years, ensuring it aligns with a crop month
        start_year = 2020 + random.randint(0, 3)
        initial_month = random.choice([1, 6]) # January or June
        loan_date = datetime(start_year, initial_month, random.randint(1, 28))

        income = round(random.uniform(5000, 500000), 2)
        culture = random.choice(JUNE_CROPS if initial_month == 6 else JANUARY_CROPS)

        # Hectar for this loan is a portion of total_hectares_client
        hectar = round(random.uniform(total_hectares_client * MIN_LOAN_HECTAR_PORTION,
                                     total_hectares_client * MAX_LOAN_HECTAR_PORTION), 2)
        hectar = max(1.0, min(hectar, total_hectares_client)) # Ensure hectar is valid

        loan_amount = _calculate_loan_amount(hectar, culture)
        loan_amount_term = random.choice(LOAN_TERMS)
        loan_due_date = loan_date + timedelta(days=loan_amount_term)

        # Simulate payment delay/early payment
        payment_delay_days = _simulate_payment_delay_days(initial_score, False) # No previous default for first loan
        payment_date = loan_due_date + timedelta(days=payment_delay_days)

        # Determine default status based on payment date vs due date
        default_status = 1 if payment_date > loan_due_date else 0

        rating = _determine_rating(initial_score)
        total_debt = round(random.uniform(loan_amount, min(loan_amount * 2, income * 3)), 2)

        first_transactions_data.append({
            'id_client': client_id,
            'loan_date': loan_date.strftime('%Y-%m-%d'),
            'loan_due_date': loan_due_date.strftime('%Y-%m-%d'),
            'payment_date': payment_date.strftime('%Y-%m-%d'),
            'income': income,
            'culture': culture,
            'hectar': hectar,
            'total_hectares_client': total_hectares_client,
            'loan_amount': loan_amount,
            'loan_amount_term': loan_amount_term,
            'rating': rating,
            'score': initial_score,
            'total_debt': total_debt,
            'default_status': default_status,
            'transaction_type': 'first'
        })

        # Initialize client state for tracking hectares used per season
        client_states[client_id] = {
            'score': initial_score,
            'last_default_status': bool(default_status),
            'last_loan_date': loan_date,
            'loans_this_year': {loan_date.year: 1},
            'total_hectares_client': total_hectares_client,
            'hectares_used_june_season': hectar if culture in JUNE_CROPS else 0,
            'june_loan_due_date': loan_due_date if culture in JUNE_CROPS else None,
            'hectares_used_january_season': hectar if culture in JANUARY_CROPS else 0,
            'january_loan_due_date': loan_due_date if culture in JANUARY_CROPS else None,
        }
    return first_transactions_data, client_states

def generate_historic_transactions(
    first_transactions_data: list,
    client_states: dict,
    num_total_rows: int
):
    """
    Generates additional historic transactions, applying all new rules.
    """
    all_transactions_data = list(first_transactions_data)
    eligible_clients = list(client_states.keys())
    transactions_generated = len(first_transactions_data)
    
    attempts = 0 # To prevent infinite loops if clients run out of options

    while transactions_generated < num_total_rows and attempts < MAX_ATTEMPTS_TO_FIND_CLIENT * (num_total_rows - transactions_generated):
        client_id = random.choice(eligible_clients)
        client_state = client_states[client_id]
        
        # Advance loan_date realistically, ensuring it's after the last loan and in a valid month
        # Start a new transaction 1 month to 1 year after the last loan, plus random days
        base_future_date = client_state['last_loan_date'] + timedelta(days=random.randint(MIN_LOAN_INTERVAL_DAYS, 365))
        
        # Determine the month for the next loan (June or January)
        next_loan_month = random.choice([1, 6])
        
        # Set the loan_date to the determined month in the base_future_date's year
        loan_date = datetime(base_future_date.year, next_loan_month, random.randint(1, 28))

        # Ensure loan_date is always after the last loan for the same client
        if loan_date <= client_state['last_loan_date']:
            # If generated date is not sufficiently after last loan, push it forward
            loan_date = client_state['last_loan_date'] + timedelta(days=MIN_LOAN_INTERVAL_DAYS)
            # Re-adjust month if it changed and needs to be January/June
            if loan_date.month not in [1, 6]:
                loan_date = datetime(loan_date.year, next_loan_month, random.randint(1, 28))
                if loan_date <= client_state['last_loan_date']: # Ensure it's still after if year changed
                    loan_date = datetime(loan_date.year + 1, next_loan_month, random.randint(1, 28))

        # --- Free up hectares from matured loans ---
        if client_state['june_loan_due_date'] and loan_date > client_state['june_loan_due_date']:
            client_state['hectares_used_june_season'] = 0
            client_state['june_loan_due_date'] = None
        if client_state['january_loan_due_date'] and loan_date > client_state['january_loan_due_date']:
            client_state['hectares_used_january_season'] = 0
            client_state['january_loan_due_date'] = None

        # --- Check loan limits for the year ---
        current_year = loan_date.year
        client_state['loans_this_year'].setdefault(current_year, 0) # Ensure year is in dict

        if client_state['loans_this_year'][current_year] >= MAX_LOANS_PER_YEAR:
            attempts += 1
            continue # Try next client/iteration

        # --- Determine available hectares for new loan based on season ---
        culture = random.choice(JUNE_CROPS if loan_date.month == 6 else JANUARY_CROPS)
        is_june_season_loan = culture in JUNE_CROPS
        
        if is_june_season_loan:
            available_hectares_for_this_loan = client_state['total_hectares_client'] - client_state['hectares_used_june_season']
            # If there's an active loan for the same season, ensure it's truly free
            if client_state['hectares_used_june_season'] > 0 and client_state['june_loan_due_date'] and loan_date <= client_state['june_loan_due_date']:
                # Loan for this season is still active, cannot take another one for the same season
                attempts += 1
                continue
        else: # January season loan
            available_hectares_for_this_loan = client_state['total_hectares_client'] - client_state['hectares_used_january_season']
            # If there's an active loan for the same season, ensure it's truly free
            if client_state['hectares_used_january_season'] > 0 and client_state['january_loan_due_date'] and loan_date <= client_state['january_loan_due_date']:
                # Loan for this season is still active, cannot take another one for the same season
                attempts += 1
                continue
        
        if available_hectares_for_this_loan <= 0.001: # Check for negligible hectares
            attempts += 1
            continue # Not enough hectares available for this client right now

        # --- Determine hectar for the new loan ---
        hectar = round(random.uniform(available_hectares_for_this_loan * MIN_LOAN_HECTAR_PORTION,
                                     available_hectares_for_this_loan * MAX_LOAN_HECTAR_PORTION), 2)
        hectar = max(1.0, min(hectar, available_hectares_for_this_loan)) # Ensure valid range

        if hectar < 1.0: # Minimum hectare for a loan
            attempts += 1
            continue

        # --- Generate other transaction details ---
        current_score_for_this_loan = _simulate_score_change(client_state['score'], client_state['last_default_status'])
        client_state['score'] = current_score_for_this_loan # Update client's score for future transactions

        income = round(random.uniform(5000, 500000), 2)
        loan_amount = _calculate_loan_amount(hectar, culture)
        loan_amount_term = random.choice(LOAN_TERMS)
        loan_due_date = loan_date + timedelta(days=loan_amount_term)

        payment_delay_days = _simulate_payment_delay_days(current_score_for_this_loan, client_state['last_default_status'])
        payment_date = loan_due_date + timedelta(days=payment_delay_days)

        default_status = 1 if payment_date > loan_due_date else 0
        rating = _determine_rating(current_score_for_this_loan)
        total_debt = round(random.uniform(loan_amount, min(loan_amount * 2, income * 3)), 2)

        # --- Update client state after successful loan generation ---
        client_state['last_default_status'] = bool(default_status)
        client_state['last_loan_date'] = loan_date
        client_state['loans_this_year'][current_year] += 1

        if is_june_season_loan:
            client_state['hectares_used_june_season'] = hectar
            client_state['june_loan_due_date'] = loan_due_date
        else:
            client_state['hectares_used_january_season'] = hectar
            client_state['january_loan_due_date'] = loan_due_date

        all_transactions_data.append({
            'id_client': client_id,
            'loan_date': loan_date.strftime('%Y-%m-%d'),
            'loan_due_date': loan_due_date.strftime('%Y-%m-%d'),
            'payment_date': payment_date.strftime('%Y-%m-%d'),
            'income': income,
            'culture': culture,
            'hectar': hectar,
            'total_hectares_client': client_state['total_hectares_client'],
            'loan_amount': loan_amount,
            'loan_amount_term': loan_amount_term,
            'rating': rating,
            'score': current_score_for_this_loan,
            'total_debt': total_debt,
            'default_status': default_status,
            'transaction_type': 'historic'
        })
        transactions_generated += 1
        attempts = 0 # Reset attempts after a successful transaction

    if transactions_generated < num_total_rows:
        print(f"Warning: Could only generate {transactions_generated} rows out of {num_total_rows} due to client eligibility constraints and max attempts.")

    return all_transactions_data

# --- Execution ---

def main(num_total_rows: int = 20000, num_unique_clients: int = 1000):
    """
    Main function to orchestrate data generation and return DataFrames.
    """
    print(f"Generating {num_unique_clients} first transactions...")
    first_transactions_list, client_states = generate_first_transactions(num_unique_clients)
    df_first_transactions = pd.DataFrame(first_transactions_list)

    print(f"Generating {num_total_rows - num_unique_clients} historic transactions...")
    all_transactions_list = generate_historic_transactions(
        first_transactions_list,
        client_states,
        num_total_rows
    )
    df_historic_transactions = pd.DataFrame(all_transactions_list)

    # Convert date columns to datetime objects for proper sorting and analysis
    df_historic_transactions['loan_date'] = pd.to_datetime(df_historic_transactions['loan_date'])
    df_historic_transactions['loan_due_date'] = pd.to_datetime(df_historic_transactions['loan_due_date'])
    df_historic_transactions['payment_date'] = pd.to_datetime(df_historic_transactions['payment_date'])
    
    # Sort historic transactions by client and date to ensure chronological order
    df_historic_transactions = df_historic_transactions.sort_values(by=['id_client', 'loan_date']).reset_index(drop=True)

    print("\nData generation complete!")
    print(f"First transactions DataFrame has '{len(df_first_transactions)}' rows.")
    print(f"Historic transactions DataFrame has '{len(df_historic_transactions)}' rows.")

    # Display some info
    print("\n--- First Transactions Sample ---")
    print(df_first_transactions.head())
    print("\n--- Historic Transactions Sample ---")
    print(df_historic_transactions.head())
    print("\n--- Historic Transactions Info ---")
    print(df_historic_transactions.info())
    print("\n--- Client Loan Counts in Historic Transactions ---")
    print(df_historic_transactions.groupby('id_client').size().describe())
    print("\n--- Default Status Distribution (Historic) ---")
    print(df_historic_transactions['default_status'].value_counts(normalize=True))
    print("\n--- Average Score by Default Status (Historic) ---")
    print(df_historic_transactions.groupby('default_status')['score'].mean())
    print("\n--- Average Payment Delay (Days) by Default Status (Historic) ---")
    df_historic_transactions['payment_delay_days'] = (df_historic_transactions['payment_date'] - df_historic_transactions['loan_due_date']).dt.days
    print(df_historic_transactions.groupby('default_status')['payment_delay_days'].mean())
    print("\n--- Hectar Usage Statistics (Historic) ---")
    print(df_historic_transactions['hectar'].describe())
    print(f"Average percentage of total hectares loaned per transaction: {((df_historic_transactions['hectar'] / df_historic_transactions['total_hectares_client']) * 100).mean():.2f}%")


    return df_first_transactions, df_historic_transactions

In [15]:
# Generate 20,000 rows
# You can adjust the total number of rows and unique clients here
df_first, df_historic = main(num_total_rows=20000, num_unique_clients=8000)


Generating 8000 first transactions...
Generating 12000 historic transactions...

Data generation complete!
First transactions DataFrame has '8000' rows.
Historic transactions DataFrame has '20000' rows.

--- First Transactions Sample ---
     id_client   loan_date loan_due_date payment_date     income culture  \
0  CLIENT_0000  2023-01-10    2023-07-09   2023-07-11    9100.56   Trigo   
1  CLIENT_0001  2021-01-02    2021-12-28   2021-12-30  390891.33   Aveia   
2  CLIENT_0002  2020-01-23    2020-04-22   2020-04-12  230789.03   Aveia   
3  CLIENT_0003  2021-06-07    2022-06-02   2022-05-27  101779.47    Soja   
4  CLIENT_0004  2023-01-24    2023-04-24   2023-04-14  106288.24   Aveia   

    hectar  total_hectares_client  loan_amount  loan_amount_term  rating  \
0   275.41                 911.73     495738.0               180       1   
1    96.93                 932.70     145395.0               360       5   
2   115.63                1822.43     173445.0                90       2   
3

In [16]:
# df_credit.describe()
# df_credit.head()
df_historic.head(20)

Unnamed: 0,id_client,loan_date,loan_due_date,payment_date,income,culture,hectar,total_hectares_client,loan_amount,loan_amount_term,rating,score,total_debt,default_status,transaction_type,payment_delay_days
0,CLIENT_0000,2023-01-10,2023-07-09,2023-07-11,9100.56,Trigo,275.41,911.73,495738.0,180,1,584,332841.58,1,first,2
1,CLIENT_0000,2023-06-14,2024-06-08,2024-09-01,222610.87,Milho,329.93,911.73,824825.0,360,0,498,761153.99,1,historic,85
2,CLIENT_0000,2024-01-16,2024-04-15,2024-05-22,400550.23,Aveia,343.22,911.73,514830.0,90,0,419,645774.36,1,historic,37
3,CLIENT_0001,2021-01-02,2021-12-28,2021-12-30,390891.33,Aveia,96.93,932.7,145395.0,360,5,792,216184.48,1,first,2
4,CLIENT_0001,2021-06-12,2022-06-07,2022-06-09,282184.84,Soja,279.45,932.7,838350.0,360,3,677,846467.86,1,historic,2
5,CLIENT_0002,2020-01-23,2020-04-22,2020-04-12,230789.03,Aveia,115.63,1822.43,173445.0,90,2,646,330440.28,0,first,-10
6,CLIENT_0002,2020-06-01,2020-08-30,2020-08-20,84424.18,Milho,614.27,1822.43,1535675.0,90,3,680,339824.59,0,historic,-10
7,CLIENT_0002,2021-01-04,2021-12-30,2021-12-27,328936.05,Trigo,543.49,1822.43,978282.0,360,3,695,978905.38,0,historic,-3
8,CLIENT_0003,2021-06-07,2022-06-02,2022-05-27,101779.47,Soja,1296.1,3814.13,3888300.0,360,2,631,656412.03,0,first,-6
9,CLIENT_0003,2022-01-19,2023-01-14,2023-01-16,455263.79,Aveia,720.33,3814.13,1080495.0,360,2,646,1255292.7,1,historic,2


In [17]:
# Filter clients who have ever defaulted (default_status == 1)
clients_with_default = df_historic[df_historic['default_status'] == 1]['id_client'].unique()
df_historic[df_historic['id_client'].isin(clients_with_default)]

Unnamed: 0,id_client,loan_date,loan_due_date,payment_date,income,culture,hectar,total_hectares_client,loan_amount,loan_amount_term,rating,score,total_debt,default_status,transaction_type,payment_delay_days
0,CLIENT_0000,2023-01-10,2023-07-09,2023-07-11,9100.56,Trigo,275.41,911.73,495738.0,180,1,584,332841.58,1,first,2
1,CLIENT_0000,2023-06-14,2024-06-08,2024-09-01,222610.87,Milho,329.93,911.73,824825.0,360,0,498,761153.99,1,historic,85
2,CLIENT_0000,2024-01-16,2024-04-15,2024-05-22,400550.23,Aveia,343.22,911.73,514830.0,90,0,419,645774.36,1,historic,37
3,CLIENT_0001,2021-01-02,2021-12-28,2021-12-30,390891.33,Aveia,96.93,932.70,145395.0,360,5,792,216184.48,1,first,2
4,CLIENT_0001,2021-06-12,2022-06-07,2022-06-09,282184.84,Soja,279.45,932.70,838350.0,360,3,677,846467.86,1,historic,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19992,CLIENT_7996,2022-01-17,2022-07-16,2022-07-19,493698.25,Aveia,1106.61,2965.36,1659915.0,180,2,614,1597831.97,1,first,3
19993,CLIENT_7996,2022-06-23,2022-09-21,2022-09-17,70971.11,Feijão,606.88,2965.36,2124080.0,90,1,532,1978334.41,0,historic,-4
19994,CLIENT_7996,2023-01-14,2024-01-09,2024-02-17,470953.58,Trigo,215.87,2965.36,388566.0,360,1,572,688412.07,1,historic,39
19995,CLIENT_7997,2021-06-25,2022-06-20,2022-07-10,148943.54,Soja,351.23,1628.94,1053690.0,360,2,644,757573.44,1,first,20


In [17]:
# df_credit_clear = df_credit.query("score != null or endividamento_curto != '' or endividamento_medio != '' or endividamento_longo != ''")
# clear dataset to remove null data
df_credit_clear = df_credit.dropna()

df_credit_clear.count()

id_client           20000
income              20000
culture             20000
hectar              20000
loan_amount         20000
loan_amount_term    20000
rating              20000
score               20000
total_debt          20000
default_status      20000
dtype: int64

In [9]:
df_credit_clear.sort_values(by="rating", ascending=True).head()
# df_credit_clear['amount'].describe()
df_credit_clear.columns

Index(['id_client', 'culture', 'hectar', 'parcelas', 'rating', 'score',
       'endividamento_curto', 'endividamento_medio', 'endividamento_longo',
       'amount', 'loan_date', 'inadimplencia'],
      dtype='object')

In [10]:
#Split dataset into features and target variable
dt_x = df_credit_clear.iloc[:,2:-1]
dt_y = df_credit_clear.iloc[:,-1]

In [11]:
#Split the dataset into training and testing sets
dt_x_train, dt_x_test, dt_y_train, dt_y_test = train_test_split(dt_x, dt_y, test_size=0.2, random_state=42)

In [12]:
scaler = StandardScaler()
# Fit the scaler on the training data
dt_x_train_scaled = pd.DataFrame( scaler.fit_transform(dt_x_train), columns=dt_x_train.columns) 
dt_x_test_scaled = pd.DataFrame( scaler.transform(dt_x_test), columns=dt_x_test.columns)

TypeError: float() argument must be a string or a real number, not 'Timestamp'

In [18]:

pca = PCA(n_components=8, random_state=8)
pca.fit(dt_x_train_scaled)
pca.explained_variance_ratio_

array([0.22814187, 0.12983052, 0.12828896, 0.12580436, 0.12264362,
       0.12213985, 0.1211973 , 0.02195353])

In [19]:
pca = PCA(n_components=3, random_state=10)
pca.fit(dt_x_train_scaled)

dt_x_train_pca = pca.transform(dt_x_train_scaled)
dt_x_test_pca = pca.transform(dt_x_test_scaled)

In [20]:
print(dt_x_train_pca.shape)
dt_x_test_pca.shape


(12372, 3)


(3094, 3)

In [21]:
max_depths_list = [5, None]
min_samples_split_list = [2,10]

parameter_grid = {
    'max_depth': max_depths_list,
    'min_samples_split': min_samples_split_list
}


estimator = DecisionTreeClassifier(random_state=10)
grid = GridSearchCV(estimator, param_grid=parameter_grid, scoring='accuracy' , cv=5)
grid.fit(dt_x_train_pca, dt_y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=10)
,param_grid,"{'max_depth': [5, None], 'min_samples_split': [2, 10]}"
,scoring,'accuracy'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,10
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [22]:
estimator = DecisionTreeClassifier(max_depth=5, min_samples_split=2, random_state=10)

essemble = BaggingClassifier(estimator=estimator, n_estimators=100, random_state=10)
essemble.fit(dt_x_train_pca, dt_y_train)

0,1,2
,estimator,DecisionTreeC...ndom_state=10)
,n_estimators,100
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,10

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,10
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
predictions = essemble.predict(dt_x_test_pca)
print(accuracy_score(dt_y_test, predictions))

0.8729799612152553
