In [2]:
import sklearn
import pandas as pd
import numpy as np
from scipy import stats
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import KFold,cross_val_score
from sklearn.metrics import accuracy_score,roc_curve,confusion_matrix,precision_recall_curve,auc,roc_auc_score,recall_score,classification_report
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from collections import OrderedDict
from sklearn import svm
import random
from sklearn.model_selection import KFold
import missingno as msno
from IPython.display import display, Markdown
import pickle
from sklearn import metrics
from pandas import Series
from collections import defaultdict
from xgboost import XGBClassifier


def main_function():
    
    #cik = sys.argv[1]
    #accession = sys.argv[2]

    data_ingestion() 
       
    #close_program(sys.argv[3], sys.argv[4])

def data_ingestion():
    df=pd.read_csv('aps_failure_training_set.csv',error_bad_lines=False)
    df_test=pd.read_csv('aps_failure_test_set.csv',error_bad_lines=False)
    data_manipulation(df,df_test)

def data_manipulation(df,df_test):
    #Training Data Manipulation
    df = df.rename(columns = {'class' : 'Flag'})
    df['Flag'] = df.Flag.map({'neg':0, 'pos':1})
    df = df.replace(['na'],np.nan)
    df_X = df.loc[:,df.columns != 'Flag']
    df_Y = df.loc[:,df.columns == 'Flag']
    df_X = df_X.apply(pd.to_numeric)
    df_X= df_X.fillna(df_X.median()).dropna(axis =1 , how ='all')
    scaler = StandardScaler()
    scaler.fit(df_X)
    df_X = scaler.transform(df_X)
    pca = PCA(0.95)
    pca.fit(df_X)
    pca_n=pca.n_components_
    df_X = pca.transform(df_X)
    df_X= pd.DataFrame(df_X)
    
    #Testing Data Manipulation
    df_test = df_test.rename(columns = {'class' : 'Flag'})
    df_test = df_test.replace(['na'],[np.NaN])
    df_test['Flag'] = df_test.Flag.map({'neg':0, 'pos':1})
    df_test_X = df_test.loc[:,df_test.columns != 'Flag']
    df_test_Y = df_test.loc[:,df_test.columns == 'Flag']
    df_test_X = df_test_X.apply(pd.to_numeric)
    df_test_X= df_test_X.fillna(df_test_X.median()).dropna(axis =1 , how ='all')
    scaler.fit(df_test_X)
    df_test_X = scaler.transform(df_test_X)
    pca = PCA(pca_n)
    pca.fit(df_test_X)
    pca.n_components_
    df_test_X = pca.transform(df_test_X)
    df_test_X= pd.DataFrame(df_test_X)
    
    X_train,X_validation,Y_train,Y_validation = train_test_split(df_X,df_Y,test_size = 0.2,random_state = 0)
    DF = pd.concat([X_train,Y_train],axis = 1)
    fun_undersampling(DF,X_validation,Y_validation,df_test_X,df_test_Y)

def fun_undersampling(DF,X_validation,Y_validation,df_test_X,df_test_Y):

    numberofrecords_pos = len(DF[DF.Flag == 1])
    pos_indices = np.array(DF[DF.Flag == 1].index)
    #Picking the indices of the normal class
    neg_indices = DF[DF.Flag == 0].index
    #out of indices selected, randomly select "x" number of records
    random_neg_indices = np.random.choice(neg_indices, numberofrecords_pos, replace = False)
    random_neg_indices =np.array(random_neg_indices)
    #Appending the two indices
    under_sample_indices = np.concatenate([pos_indices,random_neg_indices])
    #Undersample dataset
    under_sample_data = DF.loc[under_sample_indices,:]
    X_undersample = under_sample_data.loc[:,under_sample_data.columns != 'Flag']
    Y_undersample = under_sample_data.loc[:,under_sample_data.columns == 'Flag']
    
    fun_MachineLeaarningModels(X_undersample,Y_undersample,X_validation,Y_validation,df_test_X,df_test_Y)
    
    

# Function to print and add metrics to dataframe
def print_metrics(df, model, recall_train,recall_test,cost):
    df[model] = [float("{0:.5f}".format(recall_train)), float("{0:.5f}".format(recall_test)),cost]
    return df

    # Function to print and add metrics to dataframe
def print_metrics_copy(df1, model, recall_train,recall_test,cost,model_var):
    df1[model] = [float("{0:.5f}".format(recall_train)), float("{0:.5f}".format(recall_test)),cost,model_var]
    return df1
        
def fun_MachineLeaarningModels(X_undersample,Y_undersample,X_validation,Y_validation,df_test_X,df_test_Y):

    metrics_df = pd.DataFrame(index = ['Metrics_Train','Metrics_Test','Cost'])
    metrics_df_copy = pd.DataFrame(index = ['Metrics_Train','Metrics_Test','Cost','Model_Var'])
    
    #Logistic Regression
    print('Logistic Regression')
    lr_train = LogisticRegression(C =0.1,penalty = 'l2')
    lr_train.fit(X_undersample,Y_undersample.values.ravel())
    y_pred_train = lr_train.predict(X_validation)
    recall_train_lr = roc_auc_score(Y_validation,y_pred_train)
    
    lr_test = LogisticRegression(C =0.001,penalty = 'l2')
    lr_fit=lr_test.fit(X_undersample,Y_undersample.values.ravel())
    y_pred_test = lr_test.predict(df_test_X)
    recall_test_lr=roc_auc_score(df_test_Y,y_pred_test)
    
    cm = confusion_matrix(df_test_Y,y_pred_test).ravel()
    cm = pd.DataFrame(cm.reshape((1,4)), columns=['TN', 'FP', 'FN', 'TP'])
    
    total_cost = 10*cm.FP + 500*cm.FN
    
    metrics_df = print_metrics(metrics_df, 'LR_Model', recall_train_lr, recall_test_lr,total_cost[0])
    metrics_df_copy = print_metrics_copy(metrics_df_copy, 'LR_Model', recall_train_lr, recall_test_lr,total_cost[0],lr_test)
    print(metrics_df)
    
    #Random Forest
    print('Random Forest')
    clf_train = RandomForestClassifier(n_estimators=80,max_features= 'log2',oob_score =True)
    rf_fit=clf_train.fit(X_undersample,Y_undersample.values.ravel())
    y_pred_train_rf = clf_train.predict(X_validation)
    recall_train_rf=roc_auc_score(Y_validation,y_pred_train_rf)
            
    clf_test = RandomForestClassifier(n_estimators=15,max_features= 'sqrt',oob_score =True)
    rf_fit=clf_test.fit(X_undersample,Y_undersample.values.ravel())
    y_pred_test_rf = clf_test.predict(df_test_X)
    recall_test_rf=roc_auc_score(df_test_Y,y_pred_test_rf)
    
    cm = confusion_matrix(df_test_Y,y_pred_test_rf).ravel()
    cm = pd.DataFrame(cm.reshape((1,4)), columns=['TN', 'FP', 'FN', 'TP'])
    
    total_cost = 10*cm.FP + 500*cm.FN
    
    metrics_df = print_metrics(metrics_df, 'RF_Model', recall_train_rf, recall_test_rf,total_cost[0])
    metrics_df_copy = print_metrics_copy(metrics_df_copy, 'RF_Model', recall_train_rf, recall_test_rf,total_cost[0],clf_test)
    print(metrics_df)
    
    #Support Vector Machine
    print('Support Vector Machine')
    svm_train = svm.SVC(C =0.1,gamma = 0.01, kernel = 'sigmoid')
    svm_fit=svm_train.fit(X_undersample,Y_undersample)
    y_pred_train_svm = svm_train.predict(X_validation)
    recall_train_svm=roc_auc_score(Y_validation,y_pred_train_svm)
    
    svm_test = svm.SVC(C =0.01,gamma = 0.01, kernel = 'sigmoid')
    svm_fit=svm_test.fit(X_undersample,Y_undersample)
    y_pred_test_svm = svm_test.predict(df_test_X)
    recall_test_svm=roc_auc_score(df_test_Y,y_pred_test_svm)
    
    cm = confusion_matrix(df_test_Y,y_pred_test_svm).ravel()
    cm = pd.DataFrame(cm.reshape((1,4)), columns=['TN', 'FP', 'FN', 'TP'])
    
    total_cost = 10*cm.FP + 500*cm.FN
    
    metrics_df = print_metrics(metrics_df, 'SVM_Model', recall_train_svm, recall_test_svm,total_cost[0])
    metrics_df_copy = print_metrics_copy(metrics_df_copy, 'SVM_Model', recall_train_svm, recall_test_svm,total_cost[0],svm_test)
    print(metrics_df)
    
    #XG Boost
    print('XG Boost')
    xgb=XGBClassifier()
    xgb_fit=xgb.fit(X_undersample,Y_undersample.values.ravel())
    y_pred_train_xgb = xgb.predict(X_validation)
    recall_train_xgb=roc_auc_score(Y_validation,y_pred_train_xgb)
    
    y_pred_test_xgb = xgb.predict(df_test_X)
    recall_test_xgb=roc_auc_score(df_test_Y,y_pred_test_xgb)
    
    cm = confusion_matrix(df_test_Y,y_pred_test_xgb).ravel()
    cm = pd.DataFrame(cm.reshape((1,4)), columns=['TN', 'FP', 'FN', 'TP'])
    
    total_cost = 10*cm.FP + 500*cm.FN
    
    # Printing the training and testing metrices
    metrics_df = print_metrics(metrics_df, 'XGB_Model', recall_train_xgb, recall_test_xgb,total_cost[0])
    metrics_df_copy = print_metrics_copy(metrics_df_copy, 'XGB_Model', recall_train_svm, recall_test_svm,total_cost[0],xgb)
    print(metrics_df)

    metrics_df=metrics_df.T
    metrics_df_copy=metrics_df_copy.T
    
    #Ranking of the model
    metrics_df['Model_Rank'] = metrics_df['Cost'].rank(ascending=True,method='max')
    metrics_df['Model_Rank']=metrics_df['Model_Rank'].astype(int)
    metrics_df['Model_Name']=['Logistic Regression','Random Forest','Support Vector Machine','XGB']
    metrics_df_copy['Model_Rank'] = metrics_df_copy['Cost'].rank(ascending=True,method='max')
    metrics_df_copy['Model_Rank']=metrics_df_copy['Model_Rank'].astype(int)
    store_result_csv(metrics_df)
    fun_pickle(metrics_df,metrics_df_copy)
    return metrics_df

def store_result_csv(metrics):
    metrics_df=metrics
    metrics_df.to_csv('Metrics_Score.csv', index=False)
    print('Metrics_Score file uploaded')
    
def fun_pickle(metrics_df,metrics_df_copy):
    
    dictionary1=pd.Series(metrics_df.Model_Name.values,index=metrics_df.Model_Rank).to_dict()
    dictionary2=pd.Series(metrics_df_copy.Model_Var.values,index=metrics_df_copy.Model_Rank).to_dict()
    
    
    ds = [dictionary1, dictionary2]
    
    dictionary = {}
    for k in dictionary1.keys():
        dictionary[k] = tuple(dictionary[k] for dictionary in ds)
    
    filename = 'finalized_model.pkl'
    pickle.dump(dictionary, open(filename, 'wb'))
    print('Pickle file uploaded ')
    

if __name__ == '__main__':
    main_function()

Logistic Regression
                  LR_Model
Metrics_Train      0.94397
Metrics_Test       0.93257
Cost           21740.00000
Random Forest


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  y = column_or_1d(y, warn=True)


                  LR_Model     RF_Model
Metrics_Train      0.94397      0.94586
Metrics_Test       0.93257      0.90957
Cost           21740.00000  28760.00000
Support Vector Machine
                  LR_Model     RF_Model    SVM_Model
Metrics_Train      0.94397      0.94586      0.84221
Metrics_Test       0.93257      0.90957      0.90548
Cost           21740.00000  28760.00000  30120.00000
XG Boost


  if diff:
  if diff:


                  LR_Model     RF_Model    SVM_Model    XGB_Model
Metrics_Train      0.94397      0.94586      0.84221      0.94417
Metrics_Test       0.93257      0.90957      0.90548      0.93475
Cost           21740.00000  28760.00000  30120.00000  21640.00000
Metrics_Score file uploaded
Pickle file uploaded 
