In [87]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

In [88]:
df = pd.read_csv('data/UnCleanedKaggle/financial_loan.csv')

In [89]:
df.isna().sum()

id                          0
address_state               0
application_type            0
emp_length                  0
emp_title                1438
grade                       0
home_ownership              0
issue_date                  0
last_credit_pull_date       0
last_payment_date           0
loan_status                 0
next_payment_date           0
member_id                   0
purpose                     0
sub_grade                   0
term                        0
verification_status         0
annual_income               0
dti                         0
installment                 0
int_rate                    0
loan_amount                 0
total_acc                   0
total_payment               0
dtype: int64

In [90]:
df.emp_title.fillna('No title',inplace=True)


In [91]:
df['emp_length'].unique()

array(['< 1 year', '9 years', '4 years', '10+ years', '3 years',
       '5 years', '1 year', '6 years', '2 years', '7 years', '8 years'],
      dtype=object)

In [92]:
def parse(text: str) -> int:
        text = text.strip().lower()
        if text.startswith("<"):
            return 0                       
        elif "+" in text:
            return 10
        return int(text[0])

In [93]:
df['emp_years']=df['emp_length'].apply(lambda x: parse(x))
df['emp_years'].unique() 

array([ 0,  9,  4, 10,  3,  5,  1,  6,  2,  7,  8])

In [94]:
df['grade'].unique() 

array(['C', 'E', 'B', 'A', 'D', 'F', 'G'], dtype=object)

In [95]:
def grades(grade: str)->int: 
    dictionary_grade=({
        'A':7 , 
        'B':6, 
        'C':5, 
        'D':4,
        'E':3, 
        'F':2, 
        'G':1
    })
    if grade not in dictionary_grade.keys(): 
        print("something wrong") 
    else: 
        return dictionary_grade[grade]

In [96]:
df['grade_numeric']=df['grade'].apply(lambda x: grades(x))
df['grade_numeric'].unique() 

array([5, 3, 6, 7, 4, 2, 1])

In [97]:
df['sub_grade'].unique() 

array(['C4', 'E1', 'C5', 'B2', 'A1', 'C3', 'C2', 'A4', 'A5', 'B5', 'B4',
       'B3', 'B1', 'D1', 'A2', 'A3', 'D4', 'D2', 'C1', 'D3', 'E3', 'F1',
       'E2', 'E5', 'D5', 'E4', 'F2', 'G3', 'F3', 'G1', 'F4', 'G4', 'G2',
       'F5', 'G5'], dtype=object)

In [98]:
def sub_grades(sub_grade: str)->int: 
    sub_str=sub_grade.strip()
    grade,sub=sub_str[0],int(sub_str[1])
    dictionary_grade=({
        'A':7 , 
        'B':6, 
        'C':5, 
        'D':4,
        'E':3, 
        'F':2, 
        'G':1
    })
    if grade not in dictionary_grade.keys(): 
        print("something wrong") 
    else: 
        return int(dictionary_grade[grade]) + sub/6

In [99]:
df['sub_grade_numeric']=df['sub_grade'].apply(lambda x: sub_grades(x))
df['sub_grade_numeric'].unique() 

array([5.66666667, 3.16666667, 5.83333333, 6.33333333, 7.16666667,
       5.5       , 5.33333333, 7.66666667, 7.83333333, 6.83333333,
       6.66666667, 6.5       , 6.16666667, 4.16666667, 7.33333333,
       7.5       , 4.66666667, 4.33333333, 5.16666667, 4.5       ,
       3.5       , 2.16666667, 3.33333333, 3.83333333, 4.83333333,
       3.66666667, 2.33333333, 1.5       , 2.5       , 1.16666667,
       2.66666667, 1.66666667, 1.33333333, 2.83333333, 1.83333333])

In [100]:
def timeScraper(time_str: str) -> tuple[int, int, int,int]:
    dt = datetime.strptime(time_str, '%d-%m-%Y')  
    return dt.year, dt.month, dt.day ,dt.weekday


In [101]:
df[['issue_year', 'issue_month', 'issue_day', 'issue_weekday']] = df['issue_date'].apply(
    timeScraper).apply(pd.Series)

df[['last_credit_pull_year', 'last_credit_pull_month', 'last_credit_pull_day', 'last_credit_pull_weekday']] = df['last_credit_pull_date'].apply(
    timeScraper).apply(pd.Series)

df[['last_payment_year', 'last_payment_month', 'last_payment_day', 'last_payment_weekday']] = df['last_payment_date'].apply(
    timeScraper).apply(pd.Series)
df[['next_payment_year', 'next_payment_month', 'next_payment_day', 'next_payment_weekday']] = df['next_payment_date'].apply(
    timeScraper).apply(pd.Series)

In [102]:
df['term'].unique() 

array([' 60 months', ' 36 months'], dtype=object)

In [103]:
df['term_months']=df['term'].apply(lambda x : 60 if x==" 60 months" else 36) 
df['term_months'].unique()

array([60, 36])

In [105]:
df.drop(columns=['id','emp_length','grade','member_id','sub_grade','term'],axis=1,inplace=True)

In [106]:
df.to_csv("data/CleanedKaggle/financial_loan_cleaned.csv",index=False) 