## Starter Code


In [2]:
import pandas as pd

In [4]:
# Feature-description lookup table. Indexing by the 'LoanStatNew' column
# lets a description be fetched by feature name via data_info.loc[<name>].
data_info = pd.read_csv('lending_club_info.csv',index_col='LoanStatNew')

In [None]:
# Example lookup: print the 'Description' entry stored for the revol_util feature.
print(data_info.loc['revol_util']['Description'])

In [82]:
def feat_info(col_name):
    """Print the 'Description' entry that data_info holds for ``col_name``.

    ``col_name`` must be a label present in the index of ``data_info``;
    an unknown label raises ``KeyError``.
    """
    description = data_info.loc[col_name, 'Description']
    print(description)

In [None]:
feat_info('mort_acc')

## Loading the data and other imports

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# might be needed depending on your version of Jupyter
%matplotlib inline

In [86]:
# Load the loan dataset; the cells below explore and transform this frame.
df = pd.read_csv('lending_club_loan_two.csv')

In [None]:
df.info()


# Section 1: Exploratory Data Analysis

----

**TASK: Since we will be attempting to predict loan_status, create a countplot as shown below.**

In [None]:
# One bar per loan_status value, showing how balanced the label is.
sns.countplot(x='loan_status',data=df)

**TASK: Create a histogram of the loan_amnt column.**

In [None]:
plt.figure(figsize=(12,4))
# 40-bin histogram of the loan_amnt column, without the KDE overlay.
sns.histplot(df['loan_amnt'],kde=False,bins=40)
# Fix the x-axis range to 0-45,000.
plt.xlim(0,45000)

**TASK: Let's explore correlation between the continuous feature variables. Calculate the correlation between all continuous numeric variables using .corr() method.**

In [None]:
# numeric_only=True restricts the correlation matrix to numeric columns,
# so the string-valued columns in df are skipped instead of causing an error.
df.corr(numeric_only=True)

**TASK: Visualize this using a heatmap.**

In [None]:
# Compute the numeric-only correlation matrix once so the heatmap and the
# y-axis limits are guaranteed to describe the same rows.
corr_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(12,7))
ax = sns.heatmap(corr_matrix,annot=True,cmap='viridis')
# Span the y-axis over every row of the matrix (first row at the top).
# A hard-coded upper limit of 10 would clip rows whenever the matrix has
# more than 10 numeric columns.
ax.set_ylim(len(corr_matrix), 0)

**TASK: You should have noticed almost perfect correlation with the "installment" feature. Explore this feature further. Print out their descriptions and perform a scatterplot between them. Does this relationship make sense to you? Do you think there is duplicate information here?**

In [17]:
# CODE HERE

In [None]:
feat_info('installment')

In [None]:
feat_info('loan_amnt')

In [None]:
# Plot installment against loan_amnt to inspect the near-perfect correlation
# seen in the heatmap.
sns.scatterplot(x='installment',y='loan_amnt',data=df,)

**TASK: Create a boxplot showing the relationship between the loan_status and the Loan Amount.**

In [21]:
# CODE HERE

In [None]:
# Distribution of loan_amnt for each loan_status value.
sns.boxplot(x='loan_status',y='loan_amnt',data=df)

**TASK: Calculate the summary statistics for the loan amount, grouped by the loan_status.**

In [23]:
# CODE HERE

In [None]:
# Summary statistics of loan_amnt, one row per loan_status value.
df.groupby('loan_status')['loan_amnt'].describe()

**TASK: explore the Grade and SubGrade columns that LendingClub attributes to the loans. What are the unique possible grades and subgrades?**

In [None]:
sorted(df['grade'].unique())

In [None]:
sorted(df['sub_grade'].unique())

**TASK: Create a countplot per grade. Set the hue to the loan_status label.**

In [None]:
# Loan counts per grade, with one bar per loan_status value within each grade.
sns.countplot(x='grade',data=df,hue='loan_status')

**TASK: Display a count plot per subgrade. You may need to resize for this plot and reorder the x axis.**

In [None]:
plt.figure(figsize=(12,4))
# Sort the sub-grade labels so the x-axis follows a stable, ordered sequence
# instead of the order in which values first appear in the data.
subgrade_order = sorted(df['sub_grade'].unique())
sns.countplot(x='sub_grade',data=df,order = subgrade_order,palette='coolwarm' )

In [None]:
plt.figure(figsize=(12,4))
# Same ordered sub-grade axis as the previous plot, now split by loan_status.
subgrade_order = sorted(df['sub_grade'].unique())
sns.countplot(x='sub_grade',data=df,order = subgrade_order,palette='coolwarm' ,hue='loan_status')

**TASK: It looks like F and G subgrades don't get paid back that often. Isolate those and recreate the countplot just for those subgrades.**

In [None]:
# Keep only the loans whose grade is F or G.
f_and_g = df[df['grade'].isin(['F', 'G'])]

# Count loans per F/G sub-grade (sorted along the x-axis), split by loan_status.
subgrade_order = sorted(f_and_g['sub_grade'].unique())
plt.figure(figsize=(12,4))
sns.countplot(data=f_and_g, x='sub_grade', hue='loan_status', order=subgrade_order)

**TASK: Create a new column called 'loan_repaid' which will contain a 1 if the loan status was "Fully Paid" and a 0 if it was "Charged Off".**

In [35]:
# CODE HERE

In [None]:
df['loan_status'].unique()

In [92]:
# Binary label: 1 for 'Fully Paid', 0 for 'Charged Off'. Series.map leaves
# NaN for any loan_status value that is not a key of this dict.
df['loan_repaid'] = df['loan_status'].map({'Fully Paid':1,'Charged Off':0})

In [None]:
df[['loan_repaid','loan_status']]

**Create a bar plot showing the correlation of the numeric features to the new loan_repaid column.**

In [39]:
#CODE HERE

In [None]:
# Correlation of every numeric feature with the label, sorted ascending.
# numeric_only=True is required because df still contains string columns:
# since pandas 2.0, DataFrame.corr() no longer drops them silently and raises
# instead. The label's correlation with itself (always 1) is dropped.
df.corr(numeric_only=True)['loan_repaid'].sort_values().drop('loan_repaid').plot(kind='bar')

---
---
# Data PreProcessing

**Remove or fill any missing data. Convert categorical string features to dummy variables.**



In [None]:
df.head()

# Missing Data

**What is the length of the dataframe?**

In [None]:
len(df)

**Create a Series that displays the total count of missing values per column.**

In [44]:
# CODE HERE

In [None]:
df.isnull().sum()

**TASK: Convert this Series to be in terms of percentage of the total DataFrame.**

In [None]:
# Missing values per column as a percentage of all rows.
100* df.isnull().sum()/len(df)

**Let's examine emp_title and emp_length to see whether it will be okay to drop them. Print out their feature information using the feat_info() function**

In [None]:
feat_info('emp_title')
print('\n')
feat_info('emp_length')

**How many unique employment job titles are there?**

In [None]:
df['emp_title'].nunique()

In [None]:
df['emp_title'].value_counts()

**There are too many unique job titles to try to convert this to a dummy variable feature.**

In [102]:
# Dropped rather than encoded: there are too many unique job titles to turn
# into dummy variables.
df = df.drop('emp_title',axis=1)

In [104]:
# emp_length is removed as well.
# NOTE(review): the notebook states no rationale for this drop - confirm it is intended.
df = df.drop('emp_length',axis=1)

**Which feature columns still have missing data?**

In [None]:
df.isnull().sum()

**Review the title column vs the purpose column.**

In [None]:
df['purpose'].head(10)

In [None]:
df['title'].head(10)

**The title column is simply a string subcategory/description of the purpose column. Go ahead and drop the title column.**

In [110]:
# title is only a free-text description of purpose, so it is removed.
df = df.drop('title',axis=1)

In [112]:
# mort_acc is dropped entirely instead of being imputed.
# NOTE(review): presumably because of its missing values - confirm.
df = df.drop('mort_acc',axis=1)

In [114]:
# Remove every remaining row that has at least one missing value.
df = df.dropna()

In [None]:
df.isnull().sum()

## Categorical Variables and Dummy Variables

In [None]:
df.select_dtypes(['object']).columns

---
**Let's now go through all the string features to see what we should do with them.**

---


### term feature

**TASK: Convert the term feature into either a 36 or 60 integer numeric data type using .apply() or .map().**

In [None]:
df['term'].value_counts()

In [120]:
# Or just use .map()
df['term'] = df['term'].apply(lambda term: int(term[:3]))

### grade feature

In [122]:
# grade is dropped; only sub_grade is encoded in the following cells.
# NOTE(review): presumably because each sub_grade already implies its grade - confirm.
df = df.drop('grade',axis=1)

In [124]:
# One-hot encode sub_grade. drop_first=True omits the first level so that
# k sub-grades produce k-1 indicator columns.
subgrade_dummies = pd.get_dummies(df['sub_grade'],drop_first=True)

In [126]:
# Replace the original sub_grade column with its indicator columns.
df = pd.concat([df.drop('sub_grade',axis=1),subgrade_dummies],axis=1)

In [None]:
df.columns

In [None]:
df.select_dtypes(['object']).columns

### verification_status, application_type,initial_list_status,purpose

In [130]:
# One-hot encode the four low-cardinality string features in a single pass
# (first level of each omitted), then swap the originals for the indicators.
categorical_cols = ['verification_status', 'application_type', 'initial_list_status', 'purpose']
dummies = pd.get_dummies(df[categorical_cols], drop_first=True)
df = pd.concat([df.drop(columns=categorical_cols), dummies], axis=1)

### home_ownership
**TASK: Review the value_counts for the home_ownership column.**

In [None]:
df['home_ownership'].value_counts()

In [132]:
# Fold the 'NONE' and 'ANY' categories into 'OTHER' before encoding.
df['home_ownership'] = df['home_ownership'].replace({'NONE': 'OTHER', 'ANY': 'OTHER'})

# One-hot encode the merged categories (first level omitted) and swap the
# original column for the indicator columns.
dummies = pd.get_dummies(df['home_ownership'], drop_first=True)
df = pd.concat([df.drop(columns='home_ownership'), dummies], axis=1)

### address

In [134]:
# Take the last five characters of each address string as the zip code.
df['zip_code'] = df['address'].apply(lambda address:address[-5:])

In [136]:
# One-hot encode the zip codes (first level omitted); the indicator columns
# are named after the zip-code strings themselves.
dummies = pd.get_dummies(df['zip_code'],drop_first=True)
# Both the derived zip_code and the raw address string are no longer needed.
df = df.drop(['zip_code','address'],axis=1)
df = pd.concat([df,dummies],axis=1)

### issue_d 

In [138]:
# issue_d is removed and is not used as a model feature.
# NOTE(review): the notebook states no rationale for this drop - confirm it is intended.
df = df.drop('issue_d',axis=1)

### earliest_cr_line

In [140]:
# Parse the last four characters of earliest_cr_line as an integer year,
# then drop the original string column.
df['earliest_cr_year'] = df['earliest_cr_line'].apply(lambda date:int(date[-4:]))
df = df.drop('earliest_cr_line',axis=1)

In [None]:
df.select_dtypes(['object']).columns

## Train Test Split

**TASK: Import train_test_split from sklearn.**

In [144]:
from sklearn.model_selection import train_test_split

**TASK: Drop the loan_status column, since it duplicates the information in the loan_repaid column we created earlier. We'll use the loan_repaid column since it's already in 0s and 1s.**

In [146]:
# loan_status carries the same information as the 0/1 loan_repaid label,
# so the string version is removed before building the feature matrix.
df = df.drop('loan_status',axis=1)

**TASK: Set X and y variables to the .values of the features and label.**

In [148]:
# Feature matrix: every column except the label, as a NumPy array.
X = df.drop('loan_repaid',axis=1).values
# Label vector: 1 = Fully Paid, 0 = Charged Off.
y = df['loan_repaid'].values

In [None]:
# Optional down-sampling, left disabled.
# NOTE(review): X and y were already extracted in the previous cell, so
# enabling the line below at this point would not change them.
# df = df.sample(frac=0.1,random_state=101)
print(len(df))

In [152]:
# Hold out 20% of the rows as the test set; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=101)

## Normalizing the Data

In [154]:
from sklearn.preprocessing import MinMaxScaler

In [156]:
# With default arguments, MinMaxScaler rescales each feature to the [0, 1] range.
scaler = MinMaxScaler()

In [158]:
# Learn each feature's min/max from the training split only, then scale it.
X_train = scaler.fit_transform(X_train)

In [160]:
# Apply the training-set min/max to the test split (transform only, no
# refit), so no information from the test data enters the scaling.
X_test = scaler.transform(X_test)

# Creating the Model

**TASK: Run the cell below to import the necessary Keras functions.**

In [162]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.constraints import max_norm

**Build a sequential model that will be trained on the data.**

In [163]:
# Feed-forward binary classifier: three ReLU layers that roughly halve in
# width (78 -> 39 -> 19), each followed by 20% dropout, ending in a single
# sigmoid unit whose output is the predicted probability of loan_repaid == 1.
model = Sequential([
    Dense(78, activation='relu'),
    Dropout(0.2),
    Dense(39, activation='relu'),
    Dropout(0.2),
    Dense(19, activation='relu'),
    Dropout(0.2),
    Dense(units=1, activation='sigmoid'),
])

# Binary cross-entropy matches the 0/1 label; Adam is used with its defaults.
model.compile(loss='binary_crossentropy', optimizer='adam')

**TASK: Fit the model to the training data for at least 25 epochs. Also add in the validation data for later plotting. Optional: add in a batch_size of 256.**

In [None]:
# Train for 25 epochs in mini-batches of 256. The test split is passed as
# validation data so that val_loss is recorded after every epoch and can be
# plotted against the training loss afterwards.
model.fit(x=X_train, 
          y=y_train, 
          epochs=25,
          batch_size=256,
          validation_data=(X_test, y_test), 
          )

# Evaluating Model Performance.

**TASK: Plot out the validation loss versus the training loss.**

In [170]:
# model.history.history is a dict of per-epoch metric lists recorded by
# fit(); wrapping it in a DataFrame gives one row per epoch.
losses = pd.DataFrame(model.history.history)

In [None]:
# Training loss versus validation loss, per epoch.
losses[['loss','val_loss']].plot()

In [174]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. For a single
# sigmoid output the equivalent is to threshold the predicted probability at
# 0.5, which yields the same (n_samples, 1) array of 0/1 class labels.
predictions = (model.predict(X_test) > 0.5).astype("int32")

In [None]:
print(classification_report(y_test,predictions))

In [None]:
confusion_matrix(y_test,predictions)

In [None]:
import random
random.seed(101)
# randrange(n) draws from 0..n-1, i.e. only valid .iloc positions.
# randint(0, len(df)) includes len(df) itself, which is one past the last
# row and would raise IndexError in the .iloc lookup below.
random_ind = random.randrange(len(df))

# Feature values (label removed) of the sampled row, used as a stand-in for
# a new customer.
new_customer = df.drop('loan_repaid',axis=1).iloc[random_ind]
new_customer

In [None]:
# The network was trained on MinMax-scaled features, so the raw row must go
# through the same fitted scaler before prediction. reshape(1, -1) builds the
# single-row 2-D input without hard-coding the number of feature columns.
new_customer_scaled = scaler.transform(new_customer.values.astype(float).reshape(1, -1))

# Sequential.predict_classes() was removed in TensorFlow 2.6; threshold the
# sigmoid probability at 0.5 to obtain the 0/1 class label instead.
(model.predict(new_customer_scaled) > 0.5).astype("int32")

In [None]:
# Actual label of the sampled row, for comparison with the model's prediction.
df.iloc[random_ind]['loan_repaid']