In [55]:
# import dependencies
from sklearn.preprocessing import StandardScaler
import requests
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Lasso


In [56]:

# Define a list of table names and their corresponding API endpoints
tables = {
            
        "Bureau" : 'http://127.0.0.1:5000/bureau',
        "Bureau_balance" : 'http://127.0.0.1:5000/bureau_balance',
        "Credit_card_balance" : 'http://127.0.0.1:5000/credit_card_balance',
        "Installments_payments" : 'http://127.0.0.1:5000/installments_payments',
        "POS_CASH_balance" : 'http://127.0.0.1:5000/pOS_CASH_balance',
        "Previous_application" : 'http://127.0.0.1:5000/previous_application',
        "Application_train" : 'http://127.0.0.1:5000/application_train'
}

# Dictionary to store DataFrames for each table
dataframes = {}

# Make API requests to get data from each table
for table_name, endpoint in tables.items():
    response = requests.get(endpoint)

    # Check if the request was successful (status code 200)
    if response.status_code == 200:
        # Convert API response to a DataFrame and store it in the dictionary
        dataframes[table_name] = pd.DataFrame(response.json())
        print(f"Data for {table_name} fetched successfully.")
    else:
        print(f"Error: Unable to fetch data from {table_name}. Status code: {response.status_code}")

Data for Bureau fetched successfully.
Data for Bureau_balance fetched successfully.
Data for Credit_card_balance fetched successfully.
Data for Installments_payments fetched successfully.
Data for POS_CASH_balance fetched successfully.
Data for Previous_application fetched successfully.
Data for Application_train fetched successfully.


In [57]:
# Merge tables based on specified keys
merged_table = dataframes["Application_train"]

# Merge POS_CASH_balance, Installments_payments, Credit_card_balance based on SK_ID_PREV
for table_name in ["POS_CASH_balance", "Installments_payments", "Credit_card_balance", "Previous_application"]:
    if table_name in dataframes:
        # Specify suffixes to avoid duplicate column names
        merged_table = pd.merge(merged_table, dataframes[table_name], on='SK_ID_CURR', how='left', suffixes=('', f'_{table_name}'))

# Merge Bureau and Bureau_balance based on SK_ID_BUREAU
if "Bureau" in dataframes and "Bureau_balance" in dataframes:
    bureau_merged = pd.merge(dataframes["Bureau"], dataframes["Bureau_balance"], on='SK_ID_BUREAU', how='left', suffixes=('_bureau', '_bureau_balance'))
    merged_table = pd.merge(merged_table, bureau_merged, on='SK_ID_CURR', how='left')

# Display the final merged table with unique columns
unique_columns = merged_table.columns.unique()
final_table = merged_table[unique_columns]

# Display the final table
print(final_table.head())

   AMT_ANNUITY_x  AMT_CREDIT  AMT_GOODS_PRICE  AMT_INCOME_TOTAL  \
0      12217.500  251280.000       180000.000        292500.000   
1      12217.500  251280.000       180000.000        292500.000   
2      12217.500  251280.000       180000.000        292500.000   
3      12217.500  251280.000       180000.000        292500.000   
4      12217.500  251280.000       180000.000        292500.000   

   AMT_REQ_CREDIT_BUREAU_DAY  AMT_REQ_CREDIT_BUREAU_HOUR  \
0                      0.000                       0.000   
1                      0.000                       0.000   
2                      0.000                       0.000   
3                      0.000                       0.000   
4                      0.000                       0.000   

   AMT_REQ_CREDIT_BUREAU_MON  AMT_REQ_CREDIT_BUREAU_QRT  \
0                      0.000                      2.000   
1                      0.000                      2.000   
2                      0.000                      2.000   


In [58]:
data_df = final_table.drop_duplicates()

In [59]:
data_df=final_table.fillna(0)

In [60]:
data_df['CODE_GENDER'] = data_df['CODE_GENDER'].replace({'F': 0, 'M': 1})
# Changing FLAG_OWN_REALTY AND FLAG_OWN_CAR to 0 and 1 to match model
data_df[['FLAG_OWN_REALTY', 'FLAG_OWN_CAR']] = data_df[['FLAG_OWN_REALTY', 'FLAG_OWN_CAR']].replace({'Y': 1, 'N': 0}).astype(int)


In [61]:
X = data_df[["DAYS_ID_PUBLISH", "DAYS_BIRTH", "DAYS_REGISTRATION", "DAYS_LAST_PHONE_CHANGE",
              "AMT_ANNUITY_x", "SK_ID_CURR", "DAYS_EMPLOYED", "AMT_GOODS_PRICE", "AMT_INCOME_TOTAL",
              "HOUR_APPR_PROCESS_START", "AMT_REQ_CREDIT_BUREAU_YEAR", "OWN_CAR_AGE",
              "OBS_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "AMT_PAYMENT",
              "DAYS_ENTRY_PAYMENT", "AMT_INSTALMENT", "DAYS_INSTALMENT", "NUM_INSTALMENT_NUMBER",
              "CNT_INSTALMENT_FUTURE", "MONTHS_BALANCE_x", "CNT_FAM_MEMBERS", "CNT_INSTALMENT",
              "CNT_INSTALMENT_MATURE_CUM", "MONTHS_BALANCE_Credit_card_balance"]]

In [62]:
y = data_df['AMT_CREDIT']

In [63]:

# Generate some sample data
np.random.seed(42)
X = np.random.rand(100, 5)
y = 3 * X[:, 0] + 2 * X[:, 1] - X[:, 2] + np.random.randn(100) * 0.1  # Sample pricing

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict with Random Forest
rf_predictions = rf_model.predict(X_test_scaled)

# Calculate RMSE for Random Forest
rf_rmse = sqrt(mean_squared_error(y_test, rf_predictions))
print(f"Random Forest RMSE: {rf_rmse}")

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict with XGBoost
xgb_predictions = xgb_model.predict(X_test)

# Calculate RMSE for XGBoost
xgb_rmse = sqrt(mean_squared_error(y_test, xgb_predictions))
print(f"XGBoost RMSE: {xgb_rmse}")

# Train Lasso Regression model
lasso_model = Lasso(alpha=0.01, random_state=42)
lasso_model.fit(X_train_scaled, y_train)

# Predict with Lasso Regression
lasso_predictions = lasso_model.predict(X_test_scaled)

# Calculate RMSE for Lasso Regression
lasso_rmse = sqrt(mean_squared_error(y_test, lasso_predictions))
print(f"Lasso Regression RMSE: {lasso_rmse}")

# Train Neural Network Regression model
nn_model = MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
nn_model.fit(X_train_scaled, y_train)

# Predict with Neural Network Regression
nn_predictions = nn_model.predict(X_test_scaled)

# Calculate RMSE for Neural Network Regression
nn_rmse = sqrt(mean_squared_error(y_test, nn_predictions))
print(f"Neural Network Regression RMSE: {nn_rmse}")


Random Forest RMSE: 0.21798580636574888
XGBoost RMSE: 0.2898705546802953
Lasso Regression RMSE: 0.09850914453283427
Neural Network Regression RMSE: 0.24162629190062188


In [64]:

# Generate some sample data
np.random.seed(42)
X = np.random.rand(100, 5)
y = 3 * X[:, 0] + 2 * X[:, 1] - X[:, 2] + np.random.randn(100) * 0.1  # Sample pricing

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a StandardScaler instance

scaler = StandardScaler()

# Fit the training data to the standard scaler
X_scaler = scaler.fit(X_train)

# Transform the training data using the scaler
X_train_scaled = X_scaler.transform(X_train)

# Transform the testing data using the scaler
X_test_scaled = X_scaler.transform(X_test)

# Train Random Forest model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Predict with Random Forest
rf_predictions = rf_model.predict(X_test_scaled)

# Calculate RMSE for Random Forest
rf_rmse = sqrt(mean_squared_error(y_test, rf_predictions))
print(f"Random Forest RMSE: {rf_rmse}")

# Train XGBoost model
xgb_model = XGBRegressor(n_estimators=100, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict with XGBoost
xgb_predictions = xgb_model.predict(X_test)

# Calculate RMSE for XGBoost
xgb_rmse = sqrt(mean_squared_error(y_test, xgb_predictions))
print(f"XGBoost RMSE: {xgb_rmse}")


Random Forest RMSE: 0.21798580636574888
XGBoost RMSE: 0.2898705546802953


In [65]:
X = data_df[["DAYS_ID_PUBLISH", "DAYS_BIRTH", "DAYS_REGISTRATION", "DAYS_LAST_PHONE_CHANGE",
              "AMT_ANNUITY_x", "SK_ID_CURR", "DAYS_EMPLOYED", "AMT_GOODS_PRICE", "AMT_INCOME_TOTAL",
              "HOUR_APPR_PROCESS_START", "AMT_REQ_CREDIT_BUREAU_YEAR", "OWN_CAR_AGE",
              "OBS_30_CNT_SOCIAL_CIRCLE", "OBS_60_CNT_SOCIAL_CIRCLE", "AMT_PAYMENT",
              "DAYS_ENTRY_PAYMENT", "AMT_INSTALMENT", "DAYS_INSTALMENT", "NUM_INSTALMENT_NUMBER",
              "CNT_INSTALMENT_FUTURE", "MONTHS_BALANCE_x", "CNT_FAM_MEMBERS", "CNT_INSTALMENT",
              "CNT_INSTALMENT_MATURE_CUM", "MONTHS_BALANCE_Credit_card_balance"]]

In [66]:
y = data_df['AMT_CREDIT']

In [67]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler and transform the data
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Convert categorical columns to dummy variables
X_train_dummy = pd.get_dummies(X_train_scaled).astype(int)
X_test_dummy = pd.get_dummies(X_test_scaled).astype(int)

# Train a RandomForestRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_dummy, y_train)

# Predict on the test set
predictions = rf_model.predict(X_test_dummy)


In [68]:
# Calculate RMSE for Lasso Regression
random_forest_rmse = sqrt(mean_squared_error(y_test, predictions))
print(f"RandomForest Regression RMSE: {random_forest_rmse}")

RandomForest Regression RMSE: 176227.94133403778


In [69]:
# Create a DataFrame with actual and predicted values
result_df = pd.DataFrame({'Actual Values': y_test.astype(int), 'Predicted Prices': predictions.astype(int)}).reset_index(drop=True)

# Display the result DataFrame without scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(result_df)

       Actual Values  Predicted Prices
0             310671            641387
1             450000            517309
2             180000            105233
3            1185282           1185282
4             360000            606637
...              ...               ...
31578         601470            591765
31579        1268743           1163699
31580         270000            434464
31581        1056447            728437
31582         270000            554922

[31583 rows x 2 columns]


In [70]:

# Assuming X and y are already defined
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Instantiate a StandardScaler instance
scaler = StandardScaler()

# Fit the training data to the standard scaler and transform the data
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# Convert categorical columns to dummy variables
X_train_dummy = pd.get_dummies(X_train_scaled).astype(int)
X_test_dummy = pd.get_dummies(X_test_scaled).astype(int)

# Train a Lasso Regression model with increased max_iter
lasso_model = Lasso(alpha=0.01, max_iter=10000, random_state=42)  # Adjust alpha and max_iter as needed
lasso_model.fit(X_train_dummy, y_train)

# Predict on the test set
lasso_predictions = lasso_model.predict(X_test_dummy)

# Calculate RMSE for Lasso Regression
lasso_rmse = sqrt(mean_squared_error(y_test, lasso_predictions))
print(f"Lasso Regression RMSE: {lasso_rmse}")


Lasso Regression RMSE: 212675.08069897618


In [71]:
# Create a DataFrame with actual and predicted values
result_df_lasso = pd.DataFrame({'Actual Values': y_test.astype(int), 'Predicted Prices': lasso_predictions.astype(int)}).reset_index(drop=True)

# Display the result DataFrame without scientific notation
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(result_df_lasso)

       Actual Values  Predicted Prices
0             310671            591637
1             450000            576300
2             180000            111016
3            1185282            986783
4             360000            542411
...              ...               ...
31578         601470            570020
31579        1268743           1015932
31580         270000            562247
31581        1056447            589512
31582         270000            572804

[31583 rows x 2 columns]
