In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the train and test splits and stack them into one frame so the
# cleaning steps below are applied to both at once.
# NOTE(review): concat keeps each frame's own row labels here, so labels
# repeat in df; consider ignore_index=True if nothing downstream relies on
# the original labels.
train_df, test_df = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')
df = pd.concat((train_df, test_df))

df.head()

In [None]:
# Preview the first five rows of the combined frame.
df.head(n=5)

**1. Data Cleaning and Preprocessing:**
The dataset contains many missing values, so we will treat each column according to the nature of its data and the overall context of the project.

In [None]:
# Count missing entries per column, then show only the columns that have any.
missing_values = df.isna().sum()
missing_values.loc[missing_values.gt(0)]

In [None]:
#<--Name-->
# Frequency statistics are taken before imputation, so the placeholder name
# is not counted among the top names or in the unique-name total.
name_value_counts = df['Name'].value_counts()
top_names = name_value_counts.head(10)
num_unique_names = df['Name'].nunique()

df = df.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on
# df['Name']: the in-place form is chained assignment, which warns on
# pandas 2.x and leaves df untouched once Copy-on-Write is enabled.
df['Name'] = df['Name'].fillna('J. Doe')
assert df['Name'].notna().all(), "Name still contains missing values"

top_names, num_unique_names


In [None]:
# <--Monthly_Inhand_Salary--> cell 1
# Print the summary statistics, then display how many salaries are missing.
salary_distribution = df.loc[:, 'Monthly_Inhand_Salary'].describe()
print(salary_distribution)

df['Monthly_Inhand_Salary'].isnull().sum()



In [None]:
# <--Monthly_Inhand_Salary--> cell 2
# The median is used as the fill value because of the column's distribution
# (author's note: skewed distribution, range of values, median value).
median_salary = df['Monthly_Inhand_Salary'].median()
# Assign back instead of fillna(inplace=True) on a column selection: the
# chained in-place form warns on pandas 2.x and does nothing under
# Copy-on-Write.
df['Monthly_Inhand_Salary'] = df['Monthly_Inhand_Salary'].fillna(median_salary)
salary_distribution = df['Monthly_Inhand_Salary'].describe()
salary_distribution

In [None]:
# <--Type of Loan-->
# Counts are computed before imputation, so the 'Not Specified' fill value
# is not counted in the figures displayed by this cell.
loan_type_counts = df['Type_of_Loan'].value_counts()

top_loan_types = loan_type_counts.head(10)
num_unique_loan_types = df['Type_of_Loan'].nunique()

# Label missing loan types explicitly. The result is assigned back because
# fillna(inplace=True) on df['Type_of_Loan'] is chained assignment: it warns
# on pandas 2.x and has no effect under Copy-on-Write.
df['Type_of_Loan'] = df['Type_of_Loan'].fillna('Not Specified')

top_loan_types, num_unique_loan_types


In [None]:
# <--Number of Delayed Payments-->
# Record the dtype as loaded, then convert the column to numbers.
data_type_delayed_payments = df['Num_of_Delayed_Payment'].dtype

# errors='coerce' turns every entry that cannot be parsed into NaN.
# NOTE(review): if the raw strings carry stray characters around otherwise
# valid numbers, those values are discarded here — confirm against the data.
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].pipe(pd.to_numeric, errors='coerce')
summary_delayed_payments = df['Num_of_Delayed_Payment'].describe()
skewness_delayed_payments = df['Num_of_Delayed_Payment'].skew()

data_type_delayed_payments, skewness_delayed_payments, summary_delayed_payments


In [None]:
# <--Number of Delayed Payments--> cell 2
# Replace negative values with 0. clip is the vectorised equivalent of the
# row-wise lambda and leaves NaN untouched.
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].clip(lower=0)

# Cap at the 95th percentile (computed after flooring) to reduce the impact
# of extreme outliers.
percentile_95 = df['Num_of_Delayed_Payment'].quantile(0.95)
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].clip(upper=percentile_95)

# Fill the remaining gaps with the median of the capped column. Assigned
# back rather than fillna(inplace=True), which is chained assignment and has
# no effect under pandas Copy-on-Write.
median_delayed_payments = df['Num_of_Delayed_Payment'].median()
df['Num_of_Delayed_Payment'] = df['Num_of_Delayed_Payment'].fillna(median_delayed_payments)

new_summary_delayed_payments = df['Num_of_Delayed_Payment'].describe()
new_summary_delayed_payments


In [None]:
# <--Number of Credit inquiries --> cell 1
# Summary statistics and skewness of the raw column, before any cleaning.
distribution_credit_inquiries = df['Num_Credit_Inquiries'].describe()
skewness_credit_inquiries = df['Num_Credit_Inquiries'].skew()

skewness_credit_inquiries, distribution_credit_inquiries


In [None]:
# <--Number of Credit inquiries --> cell 2

# Author's observation: skewness of 9.72, highly right-skewed, with outliers
# present.
# Replace negative values with 0 (vectorised; NaN is left as is).
df['Num_Credit_Inquiries'] = df['Num_Credit_Inquiries'].clip(lower=0)
# Skewness measured after flooring but before the percentile cap below.
new_skewness_credit_inquiries2 = df['Num_Credit_Inquiries'].skew()

# Cap at the 95th percentile of the floored column.
percentile_95_inquiries = df['Num_Credit_Inquiries'].quantile(0.95)
df['Num_Credit_Inquiries'] = df['Num_Credit_Inquiries'].clip(upper=percentile_95_inquiries)

# Median-impute the remaining gaps. Assigned back rather than
# fillna(inplace=True), which is chained assignment and has no effect under
# pandas Copy-on-Write.
median_credit_inquiries = df['Num_Credit_Inquiries'].median()
df['Num_Credit_Inquiries'] = df['Num_Credit_Inquiries'].fillna(median_credit_inquiries)

new_summary_credit_inquiries = df['Num_Credit_Inquiries'].describe()
new_summary_credit_inquiries, new_skewness_credit_inquiries2


In [None]:
# <--Years of Credit History--> cell 1

def convert_to_total_months(age_str):
    """Convert an age string such as '22 Years and 1 Months' to total months.

    Parameters
    ----------
    age_str : str or missing value
        Text whose first whitespace-separated token is the number of years
        and whose fourth token, when present and numeric, is the number of
        months.

    Returns
    -------
    int or None
        ``years * 12 + months``, or ``None`` when the value is missing or
        does not start with a whole number of years. Returning ``None`` for
        unparseable text keeps it as a missing value instead of recording a
        spurious 0-month history.
    """
    if pd.isna(age_str):
        return None
    # split() with no argument ignores leading/trailing whitespace and runs
    # of spaces, so token positions stay stable for padded input.
    parts = age_str.split()
    if not parts or not parts[0].isdigit():
        return None
    years = int(parts[0])
    months = int(parts[3]) if len(parts) > 3 and parts[3].isdigit() else 0
    return years * 12 + months

# Convert each age string to a month count, then summarise the result.
df['Credit_History_Age'] = df['Credit_History_Age'].apply(convert_to_total_months)
credit_history_age_distribution = df['Credit_History_Age'].describe()
credit_history_age_skewness = df['Credit_History_Age'].skew()
credit_history_age_distribution, credit_history_age_skewness



In [None]:
# <--Years of Credit History--> cell 2
# Median-impute missing credit-history ages (in months). The result is
# assigned back because fillna(inplace=True) on a column selection is
# chained assignment: it warns on pandas 2.x and has no effect under
# Copy-on-Write.
median_credit_history = df['Credit_History_Age'].median()
df['Credit_History_Age'] = df['Credit_History_Age'].fillna(median_credit_history)
new_skewness_credit_history = df['Credit_History_Age'].skew()
new_summary_credit_history = df['Credit_History_Age'].describe()
new_summary_credit_history, new_skewness_credit_history

In [None]:
# <-- Amount invested monthly --> cell 1
# Convert to a numeric dtype; entries that cannot be parsed become NaN.
# NOTE(review): confirm the coerced entries are true placeholders and not
# valid amounts with stray characters.
df['Amount_invested_monthly'] = df['Amount_invested_monthly'].pipe(pd.to_numeric, errors='coerce')

amount_invested_skewness = df['Amount_invested_monthly'].skew()
amount_invested_distribution = df['Amount_invested_monthly'].describe()
amount_invested_distribution, amount_invested_skewness


In [None]:
# <-- Amount invested monthly --> cell 2
# Median-impute missing amounts. Assigned back rather than
# fillna(inplace=True), which is chained assignment and has no effect under
# pandas Copy-on-Write.
median_amount_invested = df['Amount_invested_monthly'].median()
df['Amount_invested_monthly'] = df['Amount_invested_monthly'].fillna(median_amount_invested)

new_summary_amount_invested = df['Amount_invested_monthly'].describe()
new_skewness_amount_invested = df['Amount_invested_monthly'].skew()
new_summary_amount_invested, new_skewness_amount_invested


In [None]:
# <-- Monthly balance --> cell 1
# Count distinct raw values before the column is coerced to numbers.
unique = df['Monthly_Balance'].nunique()

# Convert to a numeric dtype; entries that cannot be parsed become NaN.
df['Monthly_Balance'] = df['Monthly_Balance'].pipe(pd.to_numeric, errors='coerce')

monthly_balance_skewness = df['Monthly_Balance'].skew()
monthly_balance_distribution = df['Monthly_Balance'].describe()
monthly_balance_distribution, monthly_balance_skewness, unique


In [None]:
# <-- Monthly balance --> cell 2
# Median-impute missing balances. Assigned back rather than
# fillna(inplace=True), which is chained assignment and has no effect under
# pandas Copy-on-Write.
median_monthly_balance = df['Monthly_Balance'].median()
df['Monthly_Balance'] = df['Monthly_Balance'].fillna(median_monthly_balance)

new_summary_monthly_balance = df['Monthly_Balance'].describe()
new_skewness_monthly_balance = df['Monthly_Balance'].skew()
new_summary_monthly_balance, new_skewness_monthly_balance


In [None]:
# <-- Credit Score -->
# Summarise the target column and count its missing entries.
target = df['Credit_Score'].describe()
checktarget = df['Credit_Score'].isna().sum()
target, checktarget

# Observation: the Credit_Score values from the test set are NaN, so the data
# will be split back into train and test sets based on the Credit_Score column.


In [None]:
# Re-check which columns still contain missing values after the imputation steps.
missing_values_train = df.isna().sum()
missing_values_train.loc[missing_values_train.gt(0)]

In [None]:
# Display the column labels of the combined frame.
df.columns

**1. Data Cleaning and Preprocessing:** Moving on to converting categorical data into a more suitable format

In [None]:
# Collect the labels of every object- or category-typed column.
categorical_columns = df.select_dtypes(include=['object', 'category']).columns
# describe() on that selection: count, unique, top and freq per column.
categorical_collumns_head = df[categorical_columns].describe()
# Map each categorical column to its number of distinct values.
categorical_columns_summary = df[categorical_columns].nunique().to_dict()
categorical_columns_summary, categorical_collumns_head