In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [14]:
from pyspark.sql import Row, SQLContext

# `sc` is not defined anywhere in this notebook; presumably it is the
# SparkContext injected by a PySpark-enabled kernel -- TODO confirm.
# NOTE(review): a later cell rebinds `sc` to a StandardScaler, so this cell
# cannot be re-run after the scaling cell without restarting the kernel.
sqlContext = SQLContext(sc)

In [15]:
# Load the training CSV through Spark, treating the first row as the header
# and letting Spark infer column types.
# NOTE(review): the next cell rebinds `df` to a pandas DataFrame read from the
# same file, and the cells after that use pandas-only APIs.
df = sqlContext.read.csv(
    '/home/abhay/Downloads/MyProjects/loan-default-prediction/train_v2.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Load the training data into a pandas DataFrame (default dtype inference).
df = pd.read_csv(
    '/home/abhay/Downloads/MyProjects/loan-default-prediction/train_v2.csv'
)

In [None]:
# Drop all object-dtype columns: they contain some very large values that are
# either outliers or unintended data columns for our modelling.
obj_cols = df.columns[df.dtypes == 'object']
df = df.drop(columns=list(obj_cols))

# Drop columns that hold a single value, since they carry no information.
# `nunique(dropna=False)` counts NaN as one value, so a column that is entirely
# NaN is also treated as constant and removed. The previous per-column
# `len(set(...))` check counted every NaN separately, which kept all-NaN
# columns and was a slow Python-level pass over every column.
distinct_counts = df.nunique(dropna=False)
df = df.loc[:, distinct_counts != 1]

In [None]:
# Binarise the target in place: every non-zero loss becomes 1.
has_loss = df['loss'] != 0
df.loc[has_loss, 'loss'] = 1

# Separate the row identifier, the target and the predictor matrix.
ids = df['id']
y = df['loss']
predictors = df.drop(columns=['loss', 'id'])
cols = predictors.columns.tolist()

In [None]:
# Remove one column from every highly correlated pair.
#
# For each column we look at its absolute correlation with every column that
# comes *before* it (the strict upper triangle of the correlation matrix). If
# any of those reaches the threshold, the later column is dropped.
#
# The previous nested loop iterated `range(i)` instead of `range(i + 1)`, so
# directly adjacent columns (the super-diagonal) were never compared, and it
# performed one `iloc` slice per pair. The vectorised form below checks every
# pair exactly once.
corr_matrix = predictors.corr()

# NaN correlations (e.g. from constant columns) compare as False and are ignored.
above_threshold = corr_matrix.abs().to_numpy() >= 0.6

# Keep only entries with row index < column index (k=1 excludes the diagonal).
earlier_column_hits = np.triu(above_threshold, k=1)

# A column is dropped when it is correlated with at least one earlier column.
drop_cols = corr_matrix.columns[earlier_column_hits.any(axis=0)].tolist()

# Drop one of each pair of correlated columns
drops = set(drop_cols)
predictors = predictors.drop(columns=drop_cols)
predictors.shape

In [None]:
# Names of the predictor columns that survived the filtering steps, in order.
features = predictors.columns.tolist()

In [None]:
## Handling missing data
#
# Strategies that were tried and are currently disabled:
#   1. Drop incomplete rows:        df = df.dropna(axis=0)
#   2. Mean/median imputation:      SimpleImputer(missing_values=np.nan, strategy='mean')
#   3. k-nearest-neighbour:         KNNImputer(missing_values=np.nan, n_neighbors=5)
#   4. Iterative imputation:        used below
#
# NOTE(review): the imputer is fitted on every row of `predictors`, and the
# train/test split happens in a later cell, so the held-out rows contribute to
# the imputation model.

# IterativeImputer is experimental; this import must come first to enable it.
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(
    missing_values=np.nan,
    n_nearest_features=5,
    random_state=0,
)
cleaned_predictors = pd.DataFrame(
    imputer.fit_transform(predictors),
    columns=features,
)

# Debug aid (disabled): x.isnull().sum(axis=0)

In [None]:
# Persist the imputed predictors to disk (the DataFrame index is written too).
cleaned_predictors.to_csv(path_or_buf='/home/abhay/features.csv')

In [None]:
# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    cleaned_predictors,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: statistics are learned from the training split
# only and then applied unchanged to the held-out split.
# NOTE(review): this rebinds `sc`, the name the Spark cell passes to SQLContext.
sc = StandardScaler(copy=False)
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Project onto principal components. The number of components equals the
# number of input columns, so no dimensions are discarded here.
component_count = x_train.shape[1]
pca = PCA(n_components=component_count)

# Fit on the training split only, then apply the same projection to the test split.
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)

In [None]:
# Total share of variance captured by the components of the current `pca`.
arr = pca.explained_variance_ratio_
s = sum(arr)
print(s)

In [None]:

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quick baseline comparison of several classifiers on the PCA-transformed data.
# `names[k]` is the display label for `classifiers[k]`.
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

# Every estimator with internal randomness gets a fixed `random_state`, so the
# printed scores are reproducible across runs (matching the seeded split and
# imputer used earlier in the notebook).
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5, min_samples_leaf=100, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
    AdaBoostClassifier(random_state=42),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]

# Fit each model on the training split and report `clf.score` on the held-out
# split (mean accuracy for scikit-learn classifiers).
for name, clf in zip(names, classifiers):
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)
    print(f'Classification score for {name} Algo is :- {score!s}')

In [None]:
# #Decision Tree Classifier
# from sklearn.tree import DecisionTreeClassifier
# params = {'criterion': ['gini', 'entropy'],'max_depth': np.arange(5,10), 'min_samples_leaf':np.arange(100,500,50),
#           'max_features' :['auto', 'sqrt', 'log2']}
# dtc = DecisionTreeClassifier()
# grid = GridSearchCV(estimator = dtc,cv = 7, param_grid=params, refit=True)
# grid.fit(x_train,y_train)
# print(grid.best_score_)
# print(grid.best_estimator_)

# Accuracy :- 0.9072446459035366
# Note: we can't choose this model because it overfits the training data and therefore does not perform well on unseen data.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Base estimator for the search. `n_estimators` and `max_features` are
# overridden by the grid below; the fixed seed makes the search repeatable.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
    random_state=42,
)

# 'auto' was removed from the grid: for classifiers it was an alias of 'sqrt'
# (so it only duplicated work), and recent scikit-learn releases reject it.
params = {
    'n_estimators': [200, 700],
    'max_features': ['sqrt', 'log2'],
}

# 5-fold cross-validated grid search; `refit=True` retrains the best
# configuration on the full training split.
grid = GridSearchCV(estimator=rfc, cv=5, param_grid=params, refit=True)
grid.fit(x_train, y_train)
print(grid.best_score_)
print(grid.best_estimator_)

In [None]:
# Keep the estimator that the grid search refitted with its best parameters.
model = grid.best_estimator_

In [None]:
import pickle

# Save the model to disk. The `with` block guarantees the file is flushed and
# closed before it is read back (the handle was previously never closed).
filename = '/home/abhay/finalized_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Load the model from disk.
# SECURITY: unpickling executes arbitrary code from the file; only load
# pickles that were produced by this notebook / a trusted source.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Score of the reloaded model on the held-out split (scaled + PCA-transformed).
loaded_model.score(x_test,y_test)

In [None]:
# The model was fitted on PCA-transformed inputs, so each importance refers to
# a principal component rather than to an original column.
loaded_model.feature_importances_

In [None]:
# Load only the columns of the test file that were kept as training features.
df1 = pd.read_csv(
    '/home/abhay/Downloads/MyProjects/loan-default-prediction/test_v2.csv',
    usecols=features,
)

In [None]:
# Impute missing values in the test data with the imputer that was already
# fitted on the training predictors. Fitting a fresh imputer on the test set
# (as this cell did before) learns different fill-in models, so the test
# features would no longer be comparable to what the classifier was trained on.
missing_imputer = imputer

# Select columns explicitly so they are in the same order as at fit time
# (`usecols` in read_csv does not control column order).
test_predictors = pd.DataFrame(
    missing_imputer.transform(df1[features]),
    columns=features,
)

In [None]:
# Scale the test data with the StandardScaler fitted on the training split
# (`sc`). Re-fitting a scaler on the test set would standardise it with the
# test set's own mean/variance, which differs from the transformation the
# classifier saw during training.
scaler = sc
test_predictors = scaler.transform(test_predictors)

In [None]:
# Project the test data with the PCA fitted on the training split. A PCA
# re-fitted on the test set yields different component axes (order, direction
# and sign), so its outputs would not line up with the inputs the classifier
# was trained on. `pca` is deliberately NOT rebound here.
transformed_test = pca.transform(test_predictors)

In [None]:
# Total share of variance captured by the components of the current `pca`.
ar = pca.explained_variance_ratio_
s = sum(ar)
print(s)

# Debug aid (disabled): test_predictors.isnull().sum(axis=0)

In [None]:
# Predicted class labels for the test rows (the target was binarised to 0/1 above).
ty = loaded_model.predict(transformed_test)