In [1]:
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the training and test application tables from CSV.
# The paths are relative, so they resolve against the notebook's working directory.
train = pd.read_csv("../input/home-credit-default-risk/application_train.csv")
test = pd.read_csv("../input/home-credit-default-risk/application_test.csv")

## **Exploratory Data Analysis**

**First 5 rows**

In [3]:
# Preview the first rows of the training table (DataFrame.head defaults to 5 rows).
train.head()

**Shape of the data**

In [4]:
# DataFrame.shape is a (rows, columns) tuple, so report each dimension by
# name instead of printing the raw tuple as if it were a single entry count.
print("The application_train.csv has {} rows and {} columns.".format(*train.shape))
print("The application_test.csv has {} rows and {} columns.".format(*test.shape))

**Available columns and total number of columns**

In [5]:
# List the column labels of the training table.
train.columns

**Checking the datatypes**

In [6]:
# Show the dtype pandas inferred for each column when reading the CSV.
train.dtypes

In [7]:
# Names of the object-dtype columns, returned as a plain Python list.
list(train.select_dtypes(include='object').columns)

**Check out the stats**

In [8]:
# Summary statistics for every column; include="all" also covers the non-numeric columns.
train.describe(include="all")

**Overview of the data**

In [9]:
# Print pandas' concise summary of the training table.
train.info()

### **Who is the highest borrower? Male or Female?**

In [10]:
# Number of applicants per gender code, drawn on an explicitly created 10x7 axes.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=train, x='CODE_GENDER', ax=ax)

In [11]:
# Print the per-gender applicant counts underneath the headline statement.
print(
    "Females are the highest borrowers with counts:\n{}".format(
        train['CODE_GENDER'].value_counts()
    )
)

### **How is the distribution of target labels? - Did most people return on time ?**

0: Loan was repaid       1: Loan was not repaid 

In [12]:
# Class balance of the target label, one colour per class.
fig, ax = plt.subplots(figsize=(10, 7))
sns.countplot(data=train, x='TARGET', hue='TARGET', palette="Set1", ax=ax)

In [32]:
# Exact number of applicants in each TARGET class.
# NOTE(review): this cell's execution count (32) is out of sequence with its
# neighbours, so a saved output may have been produced after later cells
# reassigned `train` — re-run the notebook top to bottom to confirm the counts.
train['TARGET'].value_counts()

Based on the counts above, most applicants repaid their loans, so the target label is clearly imbalanced.

### **Who are the major borrowers? - What are their occupations?**

In [13]:
# How many applicants fall into each occupation type.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=train, x='OCCUPATION_TYPE', ax=ax)
ax.set_xlabel("Occupation Type")
# Tilt the category labels so the long occupation names do not overlap.
plt.xticks(rotation=70)

Laborers are the largest group of clients, and IT staff are the smallest.


### **How economically stable are clients? Who are the most and least stable?**

In [14]:
# AMT_INCOME_TOTAL aggregated per occupation type (seaborn's barplot default estimator).
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(data=train, x='OCCUPATION_TYPE', y='AMT_INCOME_TOTAL', ax=ax)
# Tilt the category labels so the long occupation names do not overlap.
plt.xticks(rotation=70)
ax.set_xlabel("Occupation Type")
ax.set_ylabel("Average Annual family income")

Based on annual family income, managers are the highest-earning borrowers and cleaning staff are the lowest-earning.

### **Which occupation categories repay on time and are better clients for the company to lend money to?**

In [15]:
# Applicant counts per occupation type, split into repaid / not-repaid bars via hue.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(data=train, x='OCCUPATION_TYPE', hue='TARGET', palette="Set2", ax=ax)
# Tilt the category labels so the long occupation names do not overlap.
plt.xticks(rotation=70)
ax.set_xlabel("Occupation Type")


At first glance, laborers appear to have the most difficulty repaying, while lending to realty agents, IT staff, and HR staff appears to be the safest.

**This is not a sound way to draw conclusions, because the raw counts are biased by how many applicants each occupation has.**

**A better way is to use a metric that relates the number of repayers to the number of applicants in each occupation.**


### Let us look at the ratio of the number of repayers to the number of applicants in every occupation category.

In [16]:
# Applicant count for every (occupation type, target) pair, kept as a
# one-column DataFrame indexed by that pair.
Occupation_df = (
    train.groupby(['OCCUPATION_TYPE', 'TARGET'])['SK_ID_CURR']
    .count()
    .to_frame()
)
Occupation_df

In [17]:
# Flatten the (OCCUPATION_TYPE, TARGET) MultiIndex into ordinary columns.
# NOTE: the frame is reassigned to itself, so this cell is meant to be run
# exactly once after the grouping cell; re-running it resets the index again.
Occupation_df = Occupation_df.reset_index() 
Occupation_df

In [18]:
# Pull the per-(occupation, target) counts out as a flat NumPy array,
# in the same row order as Occupation_df.
value_counts = Occupation_df['SK_ID_CURR'].to_numpy()
value_counts

In [19]:
def repayers_to_applicants_ratio(values):
    """
    Compute the repayers-to-applicants ratio for each category.

    The counts are read in consecutive pairs
    ``[repaid_0, not_repaid_0, repaid_1, not_repaid_1, ...]`` and for each
    pair the ratio ``repaid / (repaid + not_repaid)`` is computed. This is
    a measure of safety: the larger the value, the safer it is for the
    company to lend to that category of workers.

    values: indexable sequence of counts (list, NumPy array, ...).
    returns: a list with the same length as ``values`` in which every
        pair's ratio appears twice (once per input entry), so the result
        can be assigned as a column next to the counts it was built from.
    raises ValueError: if ``values`` has an odd number of entries, i.e. the
        counts cannot be split into (target 0, target 1) pairs.

    precondition: the counts are aligned so that, for every category, the
    target 0 count is immediately followed by the target 1 count. A
    category missing one of the two counts would shift every later pair.
    """
    if len(values) % 2 != 0:
        raise ValueError(
            "expected an even number of counts (target 0 / target 1 pairs), "
            "got {}".format(len(values))
        )

    ratios = []
    # Step through the counts two at a time: (repaid, not repaid).
    for start in range(0, len(values), 2):
        repaid = values[start]
        not_repaid = values[start + 1]
        ratio = repaid / (repaid + not_repaid)
        # One copy per input entry keeps the output aligned with `values`.
        ratios.extend([ratio, ratio])
    return ratios

In [20]:
# Compute the repayment ratios from the count array and store them as a new column.
# The helper returns one value per input count, so both rows of an occupation
# (TARGET 0 and TARGET 1) receive the same ratio.
Occupation_df['Ratio R/A'] = repayers_to_applicants_ratio(value_counts)

### **Repayment ratio based on Occupation Type.**

In [21]:
# One row per (occupation type, R/A ratio) pair, ordered from the highest
# ratio (safest) to the lowest.
Occupation_ratio_df = (
    Occupation_df
    .groupby(['OCCUPATION_TYPE', 'Ratio R/A'])
    .count()
    .drop(['TARGET', 'SK_ID_CURR'], axis=1)   # only the two grouping keys are needed
    .reset_index()
    .sort_values(['Ratio R/A'], ascending=False)
)
Occupation_ratio_df

In [22]:
# Repayers-to-applicants ratio per occupation type, in the sorted order of
# Occupation_ratio_df.
fig, ax = plt.subplots(figsize=(15, 7))
sns.barplot(
    data=Occupation_ratio_df,
    x='OCCUPATION_TYPE',
    y='Ratio R/A',
    palette=sns.color_palette("GnBu_d"),
    ax=ax,
)
# Tilt the category labels so the long occupation names do not overlap.
plt.xticks(rotation=70)
ax.set_xlabel("Occupation Type")
ax.set_ylabel("Mean R/A Ratio")

According to the ratio of repayers to applicants in each occupation type, it is safest to lend money to accountants (R/A ratio of 0.9516) and least safe to lend money to low-skill laborers (R/A ratio of 0.8284).

### **How is the distribution of males and females in terms of loan safety given that they belong to a specific occupation?**
**find the probabilities of repaying given a specific gender and a specific occupation type.**

In [23]:
# Attach each applicant's occupation-level 'Ratio R/A' to the train dataframe.
# Any 'Ratio R/A' column left behind by a previous run is dropped first, so
# re-running this cell does not create 'Ratio R/A_x' / 'Ratio R/A_y' columns.
# how='inner' is pandas' default, spelled out here because it matters: only
# applicants whose OCCUPATION_TYPE appears in Occupation_ratio_df are kept.
# validate='many_to_one' fails loudly if an occupation ever appears more than
# once on the right-hand side, which would otherwise duplicate applicants.
train = pd.merge(
    left=train.drop(columns=['Ratio R/A'], errors='ignore'),
    right=Occupation_ratio_df,
    on='OCCUPATION_TYPE',
    how='inner',
    validate='many_to_one',
)
train

In [24]:
# Applicant counts per gender code, split into repaid / not-repaid bars via hue.
fig, ax = plt.subplots(figsize=(15, 7))
sns.countplot(
    data=train,
    x='CODE_GENDER',
    hue='TARGET',
    palette=sns.color_palette("GnBu_d"),
    ax=ax,
)
plt.xticks(rotation=70)
ax.set_xlabel("Gender")

In [25]:
# Applicant counts for every (gender, target) pair: the raw numbers needed to
# work out the probability of repayment given the applicant's gender.
train.groupby(['CODE_GENDER', 'TARGET'])['SK_ID_CURR'].count().reset_index()

In [26]:
### Probability of repaying given gender, computed from the data instead of
### being typed in by hand, so the printed figures always match `train`.
# TARGET == 0 marks a repaid loan, so for each gender the probability is
# repaid / (repaid + not repaid).
gender_target_counts = train.groupby(['CODE_GENDER', 'TARGET'])['SK_ID_CURR'].count()
for gender_code, description in (
    ('M', "he is a male P(R|M)"),
    ('F', "she is a female P(R|F)"),
):
    repaid = gender_target_counts.loc[(gender_code, 0)]
    not_repaid = gender_target_counts.loc[(gender_code, 1)]
    print(
        "probability that an applicant will repay the loan given that {}: {}/({}+{}) = {:.4f}".format(
            description, repaid, repaid, not_repaid, repaid / (repaid + not_repaid)
        )
    )

In [27]:
# Gender-based repayment ratio (GR/A): the share of applicants of each gender
# whose loan was repaid (TARGET == 0). It is computed from `train` so it stays
# in sync with the data instead of relying on hand-copied constants.
# Only the 'M' and 'F' codes are kept, so the table has the same two rows as before.
gender_mask = train['CODE_GENDER'].isin(['M', 'F'])
gender_repay_ratio = (
    train.loc[gender_mask, 'TARGET'].eq(0)          # True where the loan was repaid
    .groupby(train.loc[gender_mask, 'CODE_GENDER'])
    .mean()                                         # mean of booleans = repaid share
    .rename('GR/A')
    .reset_index()
)
gender_repay_ratio

In [28]:
# Attach the gender-level 'GR/A' ratio to every applicant in the train dataframe.
# Any 'GR/A' column left behind by a previous run is dropped first, so
# re-running this cell does not create 'GR/A_x' / 'GR/A_y' columns.
# how='inner' is pandas' default, spelled out here because it matters: only
# applicants whose CODE_GENDER appears in gender_repay_ratio are kept.
# validate='many_to_one' fails loudly if a gender code ever appears more than
# once on the right-hand side, which would otherwise duplicate applicants.
train = pd.merge(
    left=train.drop(columns=['GR/A'], errors='ignore'),
    right=gender_repay_ratio,
    on='CODE_GENDER',
    how='inner',
    validate='many_to_one',
)
train

In [29]:
# EGR/A ("employment gender repayment ratio") scores each applicant by
# multiplying the occupation-level ratio (Ratio R/A) with the gender-level
# ratio (GR/A), row by row.
# NOTE(review): this is a product of two separately computed rates, not a
# repayment rate measured within each occupation/gender group — treat it as a
# heuristic score and confirm whether a grouped rate was intended.
train['EGR/A'] = train['Ratio R/A'].mul(train['GR/A'])

In [30]:
# EGR/A per occupation type, split by gender.
# The tick labels are rotated after the bars are drawn, as in the other
# occupation charts, so the rotation is applied to the categorical labels
# created by barplot rather than to the empty axes' default ticks.
fig, ax = plt.subplots(figsize=(19, 10))
sns.barplot(x='OCCUPATION_TYPE', y='EGR/A', hue='CODE_GENDER', data=train, ax=ax)
plt.xticks(rotation=70)
# loc=1 places the legend in the upper-right corner.
plt.legend(loc=1)


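So, in every occupation type, the EGR/A score is higher for females than for males. Note that this follows by construction: EGR/A multiplies the occupation ratio by a gender ratio that is higher for females (about 0.92 versus 0.90), so the female bar is necessarily taller in every occupation. Confirming that females are more likely to repay within each occupation would require computing the repayment rate separately for each occupation–gender group.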

### **Which occupation category are the highest loan recipients?**

In [31]:
# Spread of credit amounts per occupation type, split by gender.
plt.figure(figsize=(12, 10))
sns.boxplot(
    data=train,
    x='OCCUPATION_TYPE',
    y='AMT_CREDIT',
    hue='CODE_GENDER',
    ax=plt.gca(),   # the axes of the figure created just above
)
# Tilt the category labels so the long occupation names do not overlap.
plt.xticks(rotation=70)

- Accountants and managers receive the largest credit amounts, while low-skill laborers receive the smallest. (To be clear: laborers are the largest group of applicants by volume, but they are not the recipients of the largest amounts.)
- This is consistent with the R/A ratio above: accountants, who have the highest repayment ratio, are more likely to have a large credit approved than low-skill laborers, who have the lowest.