In [148]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
warnings.filterwarnings('ignore')

In [149]:
# Load the raw loan data from CSV; every later cell works on this frame.
df = pd.read_csv('data/UnCleanedKaggle/financial_loan.csv')

In [150]:
# Count missing values per column to see which ones need imputing.
df.isna().sum()

id                          0
address_state               0
application_type            0
emp_length                  0
emp_title                1438
grade                       0
home_ownership              0
issue_date                  0
last_credit_pull_date       0
last_payment_date           0
loan_status                 0
next_payment_date           0
member_id                   0
purpose                     0
sub_grade                   0
term                        0
verification_status         0
annual_income               0
dti                         0
installment                 0
int_rate                    0
loan_amount                 0
total_acc                   0
total_payment               0
dtype: int64

In [151]:
# Replace missing employer titles with an explicit placeholder.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `df.emp_title`: that form is chained assignment,
# which pandas deprecates and which leaves `df` unchanged under Copy-on-Write.
df['emp_title'] = df['emp_title'].fillna('No title')


In [152]:
# List the distinct emp_length labels so the text-to-number parser can cover them all.
df['emp_length'].unique()

array(['< 1 year', '9 years', '4 years', '10+ years', '3 years',
       '5 years', '1 year', '6 years', '2 years', '7 years', '8 years'],
      dtype=object)

In [153]:
def parse(text: str) -> int:
    """Convert an employment-length label into a whole number of years.

    Args:
        text: A label such as '< 1 year', '4 years' or '10+ years'.
            Surrounding whitespace and letter case are ignored.

    Returns:
        0 for labels starting with '<', 10 for labels containing '+',
        otherwise the leading integer of the label.

    Raises:
        ValueError: If the label does not start with a number (and is not
            one of the '<' / '+' forms).
    """
    text = text.strip().lower()
    if text.startswith("<"):
        return 0
    if "+" in text:
        return 10

    # Collect every leading digit rather than only the first character, so a
    # multi-digit label such as '10 years' yields 10 instead of 1.
    digits = ""
    for ch in text:
        if not ch.isdecimal():
            break
        digits += ch
    if not digits:
        raise ValueError(f"cannot parse employment length from {text!r}")
    return int(digits)

In [154]:
# Derive a numeric years-of-employment column from the emp_length labels,
# then show the distinct values as a sanity check.
df['emp_years'] = df['emp_length'].map(parse)
df['emp_years'].unique()

array([ 0,  9,  4, 10,  3,  5,  1,  6,  2,  7,  8])

In [155]:
# List the distinct grade letters present in the data.
df['grade'].unique() 

array(['C', 'E', 'B', 'A', 'D', 'F', 'G'], dtype=object)

In [156]:
def grades(grade: str)->int:
    """Map a loan grade letter to its ordinal rank.

    'A' maps to 0 and each following letter up to 'G' adds one, so 'G' maps
    to 6. For any other value a notice is printed and None is returned.
    """
    rank_of = {letter: position for position, letter in enumerate("ABCDEFG")}
    if grade in rank_of:
        return rank_of[grade]
    print("something wrong")
    return None

In [157]:
# Encode the grade letter as an ordinal number, then show the distinct
# values as a sanity check.
df['grade_numeric'] = df['grade'].map(grades)
df['grade_numeric'].unique()

array([2, 4, 1, 0, 3, 5, 6])

In [158]:
# List the distinct sub_grade codes present in the data.
df['sub_grade'].unique() 

array(['C4', 'E1', 'C5', 'B2', 'A1', 'C3', 'C2', 'A4', 'A5', 'B5', 'B4',
       'B3', 'B1', 'D1', 'A2', 'A3', 'D4', 'D2', 'C1', 'D3', 'E3', 'F1',
       'E2', 'E5', 'D5', 'E4', 'F2', 'G3', 'F3', 'G1', 'F4', 'G4', 'G2',
       'F5', 'G5'], dtype=object)

In [159]:
def sub_grades(sub_grade: str) -> "float | None":
    """Map a sub-grade code such as 'C4' to a fractional ordinal score.

    The grade letter contributes its rank ('A' = 0 ... 'G' = 6) and the
    sub-grade digit (1-5) contributes a fifth of a grade per step, so
    'A1' -> 0.0, 'C4' -> 2.6 and 'G5' -> 6.8.

    Args:
        sub_grade: A letter 'A'-'G' followed by a digit 1-5. Surrounding
            whitespace is ignored.

    Returns:
        The score as a float, or None (after printing a notice) when the
        code is not exactly one valid letter followed by one valid digit.
    """
    code = sub_grade.strip()
    grade_rank = {
        'A': 0,
        'B': 1,
        'C': 2,
        'D': 3,
        'E': 4,
        'F': 5,
        'G': 6,
    }
    # Validate the whole code up front. Without the length and digit-range
    # checks, 'A9' would score 1.6 (the same as 'B4') and 'A10' would be
    # silently read as 'A1'.
    is_valid = (
        len(code) == 2
        and code[0] in grade_rank
        and code[1] in "12345"
    )
    if not is_valid:
        print("something wrong")
        return None
    return grade_rank[code[0]] + (int(code[1]) - 1) / 5

In [160]:
# Encode the sub-grade code as a fractional ordinal score, then show the
# distinct values as a sanity check.
df['sub_grade_numeric'] = df['sub_grade'].map(sub_grades)
df['sub_grade_numeric'].unique()

array([2.6, 4. , 2.8, 1.2, 0. , 2.4, 2.2, 0.6, 0.8, 1.8, 1.6, 1.4, 1. ,
       3. , 0.2, 0.4, 3.6, 3.2, 2. , 3.4, 4.4, 5. , 4.2, 4.8, 3.8, 4.6,
       5.2, 6.4, 5.4, 6. , 5.6, 6.6, 6.2, 5.8, 6.8])

In [161]:
def timeScraper(time_str: str) -> tuple[int, int, int, int]:
    """Split a 'DD-MM-YYYY' date string into its calendar components.

    Args:
        time_str: A date formatted as day-month-year, e.g. '11-02-2021'.

    Returns:
        A tuple (year, month, day, weekday), where weekday follows
        datetime.weekday(): Monday is 0 and Sunday is 6.

    Raises:
        ValueError: If time_str does not match the '%d-%m-%Y' format.
    """
    dt = datetime.strptime(time_str, '%d-%m-%Y')
    # weekday is a method and must be called; returning `dt.weekday` uncalled
    # would hand back a bound method object instead of an int.
    return dt.year, dt.month, dt.day, dt.weekday()


In [None]:
# Expand each '<prefix>_date' column into four numeric feature columns:
# '<prefix>_year', '<prefix>_month', '<prefix>_day' and '<prefix>_weekday'.
# Each column is parsed once with pd.to_datetime and read through the
# vectorised .dt accessors. This replaces the row-wise
# .apply(...).apply(pd.Series) pattern, which builds one Series per row,
# and guarantees the weekday column holds integers (Monday=0 ... Sunday=6).
for date_prefix in ['issue', 'last_credit_pull', 'last_payment', 'next_payment']:
    parsed_dates = pd.to_datetime(df[f'{date_prefix}_date'], format='%d-%m-%Y')
    df[f'{date_prefix}_year'] = parsed_dates.dt.year
    df[f'{date_prefix}_month'] = parsed_dates.dt.month
    df[f'{date_prefix}_day'] = parsed_dates.dt.day
    df[f'{date_prefix}_weekday'] = parsed_dates.dt.weekday

In [None]:
# List the distinct term labels present in the data.
df['term'].unique() 

In [None]:
# Extract the number of months from the term label instead of comparing
# against the exact string " 60 months". The exact comparison silently maps
# every other spelling (e.g. "60 months" without the leading space) to 36.
# A label with no digits now fails loudly in astype(int).
# NOTE(review): assumes every label contains the month count as its first
# run of digits - confirm against the unique values of df['term'].
df['term_months'] = df['term'].str.extract(r'(\d+)', expand=False).astype(int)
df['term_months'].unique()

In [None]:
# Remove the identifier columns and the raw text columns that now have
# numeric replacements (emp_years, term_months).
columns_to_drop = ['id', 'emp_length', 'member_id', 'term']
df.drop(columns=columns_to_drop, inplace=True)

In [None]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): presumably the data/CleanedKaggle directory must already exist - confirm.
df.to_csv("data/CleanedKaggle/financial_loan_cleaned.csv",index=False) 