In [71]:
import requests
import pandas as pd

# Define a list of table names and their corresponding API endpoints
tables = {
            
        "Bureau" : 'http://127.0.0.1:5000/bureau',
        "Bureau_balance" : 'http://127.0.0.1:5000/bureau_balance',
        "Credit_card_balance" : 'http://127.0.0.1:5000/credit_card_balance',
        "Installments_payments" : 'http://127.0.0.1:5000/installments_payments',
        "POS_CASH_balance" : 'http://127.0.0.1:5000/pOS_CASH_balance',
        "Previous_application" : 'http://127.0.0.1:5000/previous_application',
        "Application_train" : 'http://127.0.0.1:5000/application_train'
}

# Dictionary to store DataFrames for each table
dataframes = {}

# Make API requests to get data from each table
for table_name, endpoint in tables.items():
    response = requests.get(endpoint)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Convert API response to a DataFrame and store it in the dictionary
        dataframes[table_name] = pd.DataFrame(response.json())
        print(f"Data for {table_name} fetched successfully.")
    else:
        print(f"Error: Unable to fetch data from {table_name}. Status code: {response.status_code}")




Data for Bureau fetched successfully.
Data for Bureau_balance fetched successfully.
Data for Credit_card_balance fetched successfully.
Data for Installments_payments fetched successfully.
Data for POS_CASH_balance fetched successfully.
Data for Previous_application fetched successfully.
Data for Application_train fetched successfully.


In [102]:
# Merge 'Bureau' table with 'Bureau_balance' table on 'SK_ID_BUREAU'
merged_data = pd.merge(dataframes['Bureau'], dataframes['Bureau_balance'], on='SK_ID_BUREAU', how='left')

# Specify the common columns for merging
common_columns = ['SK_ID_CURR']

# Perform joins based on the 'SK_ID_CURR' column for other tables
for table_name, df in dataframes.items():
    if table_name not in ['Bureau', 'Bureau_balance']:
        # Select only the columns that are not already present in the merged DataFrame
        additional_columns = [col for col in df.columns if col not in merged_data.columns]
        
        # Merge based on 'SK_ID_CURR' and additional columns
        merged_data = pd.merge(merged_data, df[common_columns + additional_columns], on='SK_ID_CURR', how='left')
# Remove duplicate columns
merged_data = merged_data.loc[:,~merged_data.columns.duplicated()]

# Remove duplicate rows from the merged DataFrame
data_df = merged_data.drop_duplicates()
# Example: Print the first few rows of the cleaned DataFrame
print("Cleaned Data:")
print(data_df.head())

Cleaned Data:
   AMT_ANNUITY  AMT_CREDIT_MAX_OVERDUE  AMT_CREDIT_SUM  AMT_CREDIT_SUM_DEBT  \
0          0.0                     0.0      1530000.00             402016.5   
1      18000.0                     0.0       180000.00             160861.5   
2       7650.0                     0.0       675000.00             316930.5   
3          0.0                     0.0       136647.45                  0.0   
4          0.0                     0.0       136647.45                  0.0   

   AMT_CREDIT_SUM_LIMIT  AMT_CREDIT_SUM_OVERDUE  CNT_CREDIT_PROLONG  \
0                   0.0                     0.0                   0   
1                   0.0                     0.0                   0   
2                   0.0                     0.0                   0   
3                   0.0                     0.0                   0   
4                   0.0                     0.0                   0   

  CREDIT_ACTIVE CREDIT_CURRENCY  CREDIT_DAY_OVERDUE  ... OCCUPATION_TYPE  \
0       

In [103]:
data_df=merged_data.fillna(0)

In [104]:
data_df.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CNT_CREDIT_PROLONG,CREDIT_ACTIVE,CREDIT_CURRENCY,CREDIT_DAY_OVERDUE,...,OCCUPATION_TYPE,ORGANIZATION_TYPE,OWN_CAR_AGE,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,TARGET
0,0.0,0.0,1530000.0,402016.5,0.0,0.0,0,Active,currency 1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18000.0,0.0,180000.0,160861.5,0.0,0.0,0,Active,currency 1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7650.0,0.0,675000.0,316930.5,0.0,0.0,0,Active,currency 1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,136647.45,0.0,0.0,0.0,0,Closed,currency 1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,136647.45,0.0,0.0,0.0,0,Closed,currency 1,0,...,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
print(data_df.shape)

(181912, 124)


In [106]:
X = data_df.drop(columns=['TARGET'], axis=1)

In [107]:
# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X).astype(int)

In [108]:
X.head()

Unnamed: 0,AMT_ANNUITY,AMT_CREDIT_MAX_OVERDUE,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CNT_CREDIT_PROLONG,CREDIT_DAY_OVERDUE,DAYS_CREDIT,DAYS_CREDIT_ENDDATE,...,ORGANIZATION_TYPE_Trade: type 4,ORGANIZATION_TYPE_Trade: type 5,ORGANIZATION_TYPE_Trade: type 6,ORGANIZATION_TYPE_Trade: type 7,ORGANIZATION_TYPE_Transport: type 1,ORGANIZATION_TYPE_Transport: type 2,ORGANIZATION_TYPE_Transport: type 3,ORGANIZATION_TYPE_Transport: type 4,ORGANIZATION_TYPE_University,ORGANIZATION_TYPE_XNA
0,0,0,1530000,402016,0,0,0,0,-763,0,...,0,0,0,0,0,0,0,0,0,0
1,18000,0,180000,160861,0,0,0,0,-1643,0,...,0,0,0,0,0,0,0,0,0,0
2,7650,0,675000,316930,0,0,0,0,-939,885,...,0,0,0,0,0,0,0,0,0,0
3,0,0,136647,0,0,0,0,0,-928,-624,...,0,0,0,0,0,0,0,0,0,0
4,0,0,136647,0,0,0,0,0,-928,-624,...,0,0,0,0,0,0,0,0,0,0


In [109]:
y = data_df['TARGET'].values

In [110]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [111]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Get probability outputs for the test set
# This will return a matrix where each row corresponds to an instance in X_test
# and each column corresponds to a class (in this case, 0 or 1)
probabilities = model.predict_proba(X_test)

In [117]:
# Extract the probability of the positive class (class 1)
positive_class_probabilities = probabilities[:, 1]

# Define custom thresholds for each class
thresholds = [0.2, 0.4, 0.6, 0.8]

# Classify into 5 classes based on custom thresholds
risk_levels = pd.cut(positive_class_probabilities, bins=[-float('inf')] + thresholds + [float('inf')],
                     labels=['Very Low Risk', 'Low Risk', 'Medium Risk', 'High Risk', 'Very High Risk'], include_lowest=True)

# Create a DataFrame with 'SK_ID_CURR', probability, and risk level
result_df = pd.DataFrame({'SK_ID_CURR': X_test['SK_ID_CURR'],'Probability': positive_class_probabilities, 'Risk Level': risk_levels})
result_df.sort_values(by='SK_ID_CURR', ascending=True, inplace=True)
result_df = result_df.drop_duplicates()
result_df = result_df.reset_index(drop=True)
# Print the result DataFrame
print(result_df)

       SK_ID_CURR  Probability     Risk Level
0          100013          0.0  Very Low Risk
1          100019          0.0  Very Low Risk
2          100027          0.0  Very Low Risk
3          100037          0.0  Very Low Risk
4          100044          0.0  Very Low Risk
...           ...          ...            ...
14596      456139          0.0  Very Low Risk
14597      456150          0.0  Very Low Risk
14598      456162          0.0  Very Low Risk
14599      456202          0.0  Very Low Risk
14600      456231          0.0  Very Low Risk

[14601 rows x 3 columns]


In [113]:
output = result_df.to_csv('output.csv')

In [114]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score


data_df = data_df.drop_duplicates()
# Assuming 'target_variable' is your target variable (0 or 1)
X = data_df.drop('TARGET', axis=1)
y = data_df['TARGET']

# Replace NaN values with appropriate imputation strategies
X.fillna(0, inplace=True)  # Replace NaN values with 0 for simplicity; you may need a better imputation strategy

# Convert categorical variables to dummy/indicator variables
X = pd.get_dummies(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Get probability outputs for the test set
probabilities = model.predict_proba(X_test)

# Extract the probability of the positive class (class 1)
positive_class_probabilities = probabilities[:, 1]

# Define a threshold to make binary predictions
threshold = 0.5
binary_predictions = (positive_class_probabilities > threshold).astype(int)

# Calculate the confusion matrix
conf_matrix = confusion_matrix(y_test, binary_predictions)

# Calculate accuracy
accuracy = accuracy_score(y_test, binary_predictions)

# Print the confusion matrix and accuracy
print("Confusion Matrix:")
print(conf_matrix)
print("\nAccuracy:", accuracy)

Confusion Matrix:
[[34082     0]
 [   76   483]]

Accuracy: 0.9978060679541584


In [91]:
report = classification_report(y_test, binary_predictions)
print("\nClassification Report:")
print(report)


Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     34082
         1.0       1.00      0.86      0.93       559

    accuracy                           1.00     34641
   macro avg       1.00      0.93      0.96     34641
weighted avg       1.00      1.00      1.00     34641



In [116]:
# Create a DataFrame with 'SK_ID_CURR', probability, and risk level
result_df = pd.DataFrame({'SK_ID_CURR': X_test['SK_ID_CURR'],'Probability': positive_class_probabilities})
result_df.sort_values(by='SK_ID_CURR', ascending=True, inplace=True)
result_df = result_df.drop_duplicates()
result_df = result_df.reset_index(drop=True)
# Print the result DataFrame
print(result_df)

       SK_ID_CURR  Probability
0          100013          0.0
1          100019          0.0
2          100027          0.0
3          100037          0.0
4          100044          0.0
...           ...          ...
14596      456139          0.0
14597      456150          0.0
14598      456162          0.0
14599      456202          0.0
14600      456231          0.0

[14601 rows x 2 columns]
