In [1]:
import pandas as pd 
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

import sys
import warnings
warnings.filterwarnings("ignore")

# to visualise al the columns in the dataframe
pd.pandas.set_option('display.max_columns', None)

#### Loading data

In [31]:
application_train = pd.read_csv('cleandata.csv')

In [32]:
#Determine the number of fraud and valid transactions in the dataset

Fraud = application_train[application_train['TARGET']==1]

Valid = application_train[application_train['TARGET']==0]

outlier_fraction = len(Fraud)/float(len(Valid))

print('outlier_fraction for the whole dataset:')
print(outlier_fraction)

print("Fraud Cases : {}".format(len(Fraud)))

print("Valid Cases : {}".format(len(Valid)))

outlier_fraction for the whole dataset:
0.14670771921944992
Fraud Cases : 24825
Valid Cases : 169214


In [33]:
from sklearn.model_selection import train_test_split

#Create independent and Dependent Features
columns = application_train.columns.tolist()
# Filter the columns to remove data we do not want 
columns = [c for c in columns if c not in ["TARGET"]]
# Store the variable we are predicting 
target = "TARGET"
# Define a random state 
state = np.random.RandomState(42)
X = application_train[columns]
y = application_train[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(155231, 210) (155231,)
(38808, 210) (38808,)


#### Feature Selection


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [35]:
sel = SelectFromModel(RandomForestClassifier(n_estimators = 100, random_state=state))
sel.fit(X_train, y_train)

In [36]:
sel.get_support()

array([ True, False, False, False,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False, False, False,
       False,  True,  True,  True, False, False, False, False, False,
       False,  True,  True,  True,  True,  True,  True,  True,  True,
       False,  True,  True, False,  True, False,  True, False,  True,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [37]:
selected_feat= X_train.columns[(sel.get_support())].tolist()
len(selected_feat)

33

In [38]:
print(selected_feat)

['SK_ID_CURR', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY', 'REGION_POPULATION_RELATIVE', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'OWN_CAR_AGE', 'CNT_FAM_MEMBERS', 'REGION_RATING_CLIENT', 'HOUR_APPR_PROCESS_START', 'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', 'APARTMENTS_AVG', 'BASEMENTAREA_AVG', 'YEARS_BEGINEXPLUATATION_AVG', 'YEARS_BUILD_AVG', 'COMMONAREA_AVG', 'ENTRANCES_AVG', 'FLOORSMAX_AVG', 'LANDAREA_AVG', 'NONLIVINGAREA_AVG', 'OBS_30_CNT_SOCIAL_CIRCLE', 'DAYS_LAST_PHONE_CHANGE', 'AMT_REQ_CREDIT_BUREAU_YEAR', 'CREDIT_INCOME_PERCENT', 'ANNUITY_INCOME_PERCENT', 'CREDIT_TERM', 'DAYS_EMPLOYED_PERCENT']


In [39]:
app_X_train = X_train.copy() #Taking a copy before dropping 
app_X_test = X_test.copy()

In [40]:
X_train = X_train[selected_feat]
X_test = X_test[selected_feat]

In [41]:
print (X_train.shape, X_test.shape)

(155231, 33) (38808, 33)


### Logistic Regression

In [42]:
from sklearn.linear_model import LogisticRegression

logistic_regressor = LogisticRegression(C = 2)
logistic_regressor.fit(X_train,y_train)
y_pred = logistic_regressor.predict(X_test)

In [43]:
from sklearn.metrics import classification_report,accuracy_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix

n_errors = (y_pred != y_test).sum()
# Run Classification Metrics
print("{}: {}".format("Logistic Regression errors",n_errors))
print("Accuracy Score :")
print(accuracy_score(y_test,y_pred))
print("Confusion matrix :")
print(confusion_matrix(y_test, y_pred))
print("Classification Report :")
print(classification_report(y_test,y_pred))
print("ROC AUC score is: ",roc_auc_score(y_test,y_pred))

Logistic Regression errors: 4738
Accuracy Score :
0.8779117707689136
Confusion matrix :
[[33456   294]
 [ 4444   614]]
Classification Report :
              precision    recall  f1-score   support

           0       0.88      0.99      0.93     33750
           1       0.68      0.12      0.21      5058

    accuracy                           0.88     38808
   macro avg       0.78      0.56      0.57     38808
weighted avg       0.86      0.88      0.84     38808

ROC AUC score is:  0.5563403716884144


### Random forest

In [15]:
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators = 100, random_state = state, verbose = 1, n_jobs = -1)
random_forest.fit(X_train,y_train)
y_pred = random_forest.predict(X_test) 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   19.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   53.6s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


In [17]:
n_errors = (y_pred != y_test).sum()
# Run Classification Metrics
print("{}: {}".format("Random Forest errors",n_errors))
print("Accuracy Score :")
print(accuracy_score(y_test,y_pred))
print("Confusion matrix :")
print(confusion_matrix(y_test, y_pred))
print("Classification Report :")
print(classification_report(y_test,y_pred))
print("ROC AUC score is: ",roc_auc_score(y_test,y_pred))

Random Forest errors: 4899
Accuracy Score :
0.9203453490073655
Confusion matrix :
[[56581    14]
 [ 4885    23]]
Classification Report :
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56595
           1       0.62      0.00      0.01      4908

    accuracy                           0.92     61503
   macro avg       0.77      0.50      0.48     61503
weighted avg       0.90      0.92      0.88     61503

ROC AUC score is:  0.5022194274464621


### Extreme Gradient Boost Model

In [18]:
from xgboost import XGBClassifier
xgb_classifier = XGBClassifier(n_estimators=100,max_depth=5)
xgb_classifier.fit(X_train,y_train)
y_pred = xgb_classifier.predict(X_test)

In [19]:
n_errors = (y_pred != y_test).sum()
# Run Classification Metrics
print("{}: {}".format("Extreme Gradient Boost errors",n_errors))
print("Accuracy Score :")
print(accuracy_score(y_test,y_pred))
print("Confusion matrix :")
print(confusion_matrix(y_test, y_pred))
print("Classification Report :")
print(classification_report(y_test,y_pred))
print("ROC AUC score is: ",roc_auc_score(y_test,y_pred))

Extreme Gradient Boost errors: 4895
Accuracy Score :
0.9204103864852121
Confusion matrix :
[[56484   111]
 [ 4784   124]]
Classification Report :
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56595
           1       0.53      0.03      0.05      4908

    accuracy                           0.92     61503
   macro avg       0.72      0.51      0.50     61503
weighted avg       0.89      0.92      0.89     61503

ROC AUC score is:  0.5116517848367557


### Saving the final model

In [44]:
import pickle

In [45]:
with open('logistic_regressor.pkl', 'wb') as file:
    pickle.dump(logistic_regressor, file)
with open('logistic_regressor.pkl', 'rb') as file:
    loaded_model = pickle.load(file)
    
y_pred_loaded = loaded_model.predict(X_test)
y_pred_loaded

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [46]:
df = pd.DataFrame(y_pred_loaded)
df.to_csv('output.csv')