In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the lending dataset. `read_table` splits on tabs by default.
# `issue_d` is parsed to datetime so later cells can compare it against a
# cut-off date and read `.dt.year` from it.
data = pd.read_table(
    r'../input/XYZCorp_LendingData.txt',
    parse_dates=['issue_d'],
    low_memory=False,  # parse the file in one pass so dtype inference is not done chunk by chunk
)

In [None]:
# Dimensions of the freshly loaded frame as a (rows, columns) tuple.
raw_shape = data.shape
print(raw_shape)

In [None]:
# Peek at the last five rows (five is the default for `tail`).
data.tail(n=5)

In [None]:
# Column dtypes and non-null counts for the raw frame.
# `DataFrame.info()` writes its report straight to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line.
data.info()

In [None]:

# Split on the loan issue date: rows issued before 2015-06-01 go to `train`,
# rows issued on or after that date go to `test`.
# `.copy()` detaches each subset from `data`; a later cell adds a column to
# `train`, and doing that on an un-copied boolean-mask selection raises
# pandas' SettingWithCopyWarning.
train = data[data['issue_d'] < '2015-6-01'].copy()
test = data[data['issue_d'] >= '2015-6-01'].copy()

In [None]:
# Report the (rows, columns) size of each split, train first.
for split in (train, test):
    print(split.shape)

In [None]:
# Bar chart of how many training rows fall in each `default_ind` class.
default_counts = train['default_ind'].value_counts()
default_counts.plot.bar()

In [None]:
# Side-by-side distributions of the three amount columns in the training set.
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(16, 5))

# One panel per column, left to right.
for axis, column in zip(ax, ('loan_amnt', 'funded_amnt', 'funded_amnt_inv')):
    sns.distplot(train[column], ax=axis)

ax[0].set_title("Loan Applied by the Borrower")
ax[1].set_title("Amount Funded by the Lender")
ax[2].set_title("Total committed by Investors")

In [None]:
# Number of training loans per stated purpose, most frequent first
# (`value_counts` sorts descending by default).
purpose_ax = train['purpose'].value_counts().plot.bar(figsize=(10, 5))
# The bars are raw row counts, so the y-axis is labelled "Count" rather than
# "Density". The trailing semicolon suppresses the repr of the return value.
purpose_ax.set(xlabel='purpose', ylabel='Count', title='Purpose of loan');

In [None]:

# Derive the calendar year of issue from the parsed `issue_d` timestamps.
# `assign` builds a new frame instead of writing a column into `train` in
# place, so this cell is safe to re-run and does not trigger
# SettingWithCopyWarning when `train` is a selection taken from `data`.
train = train.assign(issue_year=train['issue_d'].dt.year)

# One bar per issue year summarising `loan_amnt`
# (seaborn's barplot aggregates with the mean unless told otherwise).
plt.figure(figsize=(10, 5))
sns.barplot(x='issue_year', y='loan_amnt', data=train)

In [None]:
# Loan Status
fig, ax = plt.subplots(1, 2, figsize=(16, 5))

# Left panel: class balance of `default_ind`.
# The counts are reindexed to [0, 1] so the hard-coded labels always line up
# with the right class instead of depending on which class is more frequent.
# NOTE(review): assumes 0 marks a good loan and 1 a default -- confirm.
status_counts = train['default_ind'].value_counts().reindex([0, 1], fill_value=0)
status_counts.plot.pie(explode=[0, 0.25], labels=['good loans', 'bad loans'],
                       autopct='%1.2f%%', startangle=70, ax=ax[0])

# Right panel: density of issue year for each class. The target axes is
# passed explicitly rather than relying on matplotlib's "current axes".
sns.kdeplot(train.loc[train['default_ind'] == 0, 'issue_year'],
            label='default_ind = 0', ax=ax[1])
sns.kdeplot(train.loc[train['default_ind'] == 1, 'issue_year'],
            label='default_ind = 1', ax=ax[1])
ax[1].set(xlabel='Year', ylabel='Density', title='Yearwise Distribution of defaulter')
# Draw the legend explicitly; recent seaborn versions do not add one just
# because `label=` was passed.
ax[1].legend();

In [None]:
# Bar chart of the number of training loans in each grade.
grade_counts = train['grade'].value_counts()
grade_counts.plot.bar()

In [None]:
# Grade counts for each value of `default_ind`, one panel per class.
fig, array = plt.subplots(1, 2, figsize=(12, 5))
for panel, flag in zip(array, (0, 1)):
    # Sort by grade label so both panels list the grades in the same order;
    # ordering each panel by its own frequencies puts different grades at the
    # same x position and makes the panels hard to compare.
    by_grade = train.loc[train['default_ind'] == flag, 'grade'].value_counts().sort_index()
    by_grade.plot.bar(ax=panel)
    panel.set_title(f'default_ind={flag} vs grade')

In [None]:
# Missing-value count per column (`isna` is the same check as `isnull`).
null_counts = data.isna().sum()
print(null_counts)

In [None]:
# Number of distinct (non-null) values in every object-typed column.
data.select_dtypes(include='object').nunique()

In [None]:
# Percentage of missing values per column, tabulated next to the column name.
columns = data.columns
percent_missing = data.isna().sum().mul(100).div(len(data))
missing_value_data = pd.DataFrame(
    {
        'column_name': columns,
        'percent_missing': percent_missing,
    }
)
print(missing_value_data)

In [None]:
# Remove columns that are not used further, modifying `data` in place.
# NOTE(review): presumably chosen from the missing-value report plus free-text,
# date and identifier-like fields -- confirm the selection criteria.
data.drop(
    columns=[
        'inq_last_12m', 'total_cu_tl', 'inq_fi', 'all_util', 'max_bal_bc',
        'open_rv_24m', 'open_rv_12m', 'il_util', 'total_bal_il',
        'mths_since_rcnt_il', 'open_il_24m', 'open_il_12m', 'open_il_6m',
        'open_acc_6m', 'verification_status_joint', 'dti_joint',
        'annual_inc_joint', 'mths_since_last_major_derog',
        'mths_since_last_record', 'desc', 'title', 'zip_code', 'emp_title',
        'earliest_cr_line', 'mths_since_last_delinq', 'last_pymnt_d',
        'next_pymnt_d', 'last_credit_pull_d',
    ],
    inplace=True,
)

In [None]:
# First three rows of the frame after the column drop.
data.head(n=3)

In [None]:
# Missing-value count per remaining column.
remaining_nulls = data.isna().sum()
print(remaining_nulls)

In [None]:
# Dtypes and non-null counts of the remaining columns.
# `info()` prints its own report and returns None, so it is not wrapped in
# print() (that would add a trailing "None" to the output).
data.info()

In [None]:
# Replace missing `emp_length` values with the column's most frequent value.
# The filled Series is assigned back to the column: calling
# fillna(..., inplace=True) on `data['emp_length']` acts on an intermediate
# object, which pandas 2.x warns about and which leaves `data` unchanged once
# Copy-on-Write is enabled.
data['emp_length'] = data['emp_length'].fillna(data['emp_length'].mode()[0])

In [None]:
# Replace NA with the column mean (not the mode) for these numeric columns.
colname1 = ['tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
# `fillna` with a Series fills each column with the value stored under its own
# name. The result is assigned back instead of using inplace=True on a column
# selection, which does not reliably write through to `data`.
data[colname1] = data[colname1].fillna(data[colname1].mean())

In [None]:
# First five rows after imputation (five is the default for `head`).
data.head(n=5)

In [None]:
# Drop every row that still contains at least one missing value, in place.
# The index is not reset, so the surviving row labels can have gaps.
data.dropna(axis='index', how='any', inplace=True)


In [None]:
# Missing-value count per column after the row drop.
final_nulls = data.isna().sum()
print(final_nulls)

In [None]:
from sklearn import preprocessing
# Columns whose values are replaced by integer codes.
colname = [
    'term',
    'grade',
    'sub_grade',
    'home_ownership',
    'verification_status',
    'pymnt_plan',
]

# One LabelEncoder per column, kept in `le` under the column name so each
# fitted encoder stays available after the transform.
le = {column: preprocessing.LabelEncoder() for column in colname}

# Fit each encoder on its column and overwrite the column with the codes.
for column, encoder in le.items():
    data[column] = encoder.fit_transform(data[column])

In [None]:
# One-hot encode the remaining nominal columns.
colname1 = ['addr_state', 'application_type']

# `get_dummies` returns one indicator column per category, so its result
# cannot be assigned to the single source column. Passing `columns=` replaces
# each listed column with its indicator columns instead.
# `dtype=int` keeps the indicators numeric; boolean indicators mixed with
# numeric columns would make `data.values` an object array.
data = pd.get_dummies(data, columns=colname1, dtype=int)

# The indicator columns are appended at the end of the frame. Move
# `default_ind` back to the last position so any downstream code that takes
# the last column as the target keeps working.
data = data[[c for c in data.columns if c != 'default_ind'] + ['default_ind']]


In [None]:
# (rows, columns) of the frame after encoding.
len(data.index), len(data.columns)


In [None]:
# Remove three columns from `data` in place.
# NOTE(review): none of these were label- or one-hot encoded in the cells
# above, which is presumably why they are dropped before modelling -- confirm.
data.drop(
    columns=['emp_length', 'purpose', 'initial_list_status'],
    inplace=True,
)

In [None]:
# First five rows of the modelling frame.
data.head(n=5)

In [None]:
from sklearn.feature_selection import chi2
# Chi-squared test of each feature against `default_ind`.
# The imported function is `chi2` (there is no `chi`), and X / Y have to be
# built here because no earlier cell defines them.
# Only numeric columns are usable, which also leaves out the datetime column
# `issue_d`, and chi2 rejects negative values, so columns containing any
# negative entry are left out.
chi2_features = data.drop(columns=['default_ind']).select_dtypes(include='number')
chi2_features = chi2_features.loc[:, (chi2_features >= 0).all()]
X = chi2_features.values
Y = data['default_ind'].values
# NOTE(review): this scores features on all rows, including those that later
# form the test set -- restrict it to the training rows if the scores are used
# for feature selection.
score, pvalues = chi2(X, Y)
# Label each p-value with its column and list the smallest first.
print(pd.Series(pvalues, index=chi2_features.columns).sort_values())

In [None]:
# Date-based split of the processed frame: rows issued before the cut-off go
# to `train_df`, rows issued on or after it go to `test_df`.
cutoff = '2015-6-01'
issue_dates = data['issue_d']
train_df = data.loc[issue_dates < cutoff]
test_df = data.loc[issue_dates >= cutoff]

In [None]:
# Remove the datetime column `issue_d` from `data` in place.
data.drop(columns='issue_d', inplace=True)

In [None]:
# Number of rows that fall before the cut-off date.
train_df.shape[0]

In [None]:
# Split the modelling frame with the row labels of the date-based split.
# Slicing with `.loc[0:len(train_df)]` is wrong here for three reasons:
# `.loc` slices by label and includes both end points, so the boundary row
# lands in both sets; rows have been dropped since loading, so labels no
# longer equal positions; and it only follows the date split if the file
# happens to be sorted by issue date.
# `train_df` / `test_df` were selected from `data` before `issue_d` was
# dropped, so their indexes identify exactly the rows on each side of the
# cut-off.
train_data = data.loc[train_df.index, :]
test_data = data.loc[test_df.index, :]

In [None]:
# Build the feature matrices and target vectors as NumPy arrays.
# The target is selected by name rather than as "the last column", so the
# split stays correct if an earlier step changes the column order (for
# example by appending encoded columns).
X_train = train_data.drop(columns='default_ind').values
Y_train = train_data['default_ind'].values
X_test = test_data.drop(columns='default_ind').values
Y_test = test_data['default_ind'].values

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline model: logistic regression with scikit-learn's default settings.
classifier = LogisticRegression()
# fitting training data to the model
classifier.fit(X_train, Y_train)

# Predicted class for every row of the test set.
Y_pred = classifier.predict(X_test)
# Show only the first 20 (actual, predicted) pairs as a sanity check;
# printing the pair for every test row floods the cell output.
print(list(zip(Y_test[:20], Y_pred[:20])))

In [None]:
#confusion matrix
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
# Score the logistic-regression predictions against the test targets.
cfm = confusion_matrix(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)
acc = accuracy_score(Y_test, Y_pred)

# Output order: confusion matrix, per-class report, overall accuracy.
print(cfm)
print('classification report:')
print(report)
print("Accuracy of the model:", acc)


In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 1000 trees with at least 3 samples per leaf, trained on all
# available cores (n_jobs=-1).
# `random_state` is fixed so the bootstrap samples and feature subsets, and
# therefore the reported metrics, are the same on every run.
m = RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_leaf=3,
                           random_state=42)
m.fit(X_train, Y_train)

In [None]:

# Score the random-forest predictions against the test targets.
y_pred = m.predict(X_test)
acc = accuracy_score(Y_test, y_pred)

# Confusion matrix first, then the per-class report, then overall accuracy.
for summary in (confusion_matrix(Y_test, y_pred),
                classification_report(Y_test, y_pred)):
    print(summary)
print("Accuracy of the model:", acc)