# Custom Preprocessing Functions

In [None]:
import pandas as pd
import numpy as np

### Percentage of missing values per column

In [None]:
def missing_percent(df):
    """Summarise missing values per column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with 'Missing' (number of null cells) and
        'Percent' (null cells divided by the row count, i.e. a fraction
        between 0 and 1), ordered from most to least missing.
    """
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    # count() on the boolean mask yields the number of rows for every column.
    rows_per_column = null_mask.count()
    by_count = null_counts.sort_values(ascending=False)
    by_share = (null_counts / rows_per_column).sort_values(ascending=False)
    return pd.concat([by_count, by_share], axis=1, keys=['Missing', 'Percent'])

### Remove columns whose missing percentage is greater than or equal to a given value

In [None]:
def filter_missing_columns(df,percent):
    """Keep only the columns whose missing share is strictly below `percent`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    percent : float
        Threshold compared against the 'Percent' column produced by
        `missing_percent` (a fraction, not a value out of 100).

    Returns
    -------
    pandas.DataFrame
        The surviving columns of `df`, in the order `missing_percent`
        reports them (most missing first).
    """
    stats = missing_percent(df)
    below_threshold = stats['Percent'] < percent
    kept_columns = stats[below_threshold].index
    return df.loc[:, kept_columns]

### Maximum percentage of records in a single category

In [1]:
def single_value_elimination(df,percent):
    """Drop columns dominated by a single value.

    For every column the share of rows equal to that column's mode is
    computed; columns whose share exceeds `percent` are eliminated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    percent : float
        Maximum allowed share (fraction between 0 and 1) of rows that may
        carry the most frequent value. Columns with share <= percent are kept.

    Returns
    -------
    pandas.DataFrame
        The kept columns, in their original order.
    """
    n_rows = len(df)
    if n_rows == 0 or df.shape[1] == 0:
        # Without rows (or columns) there is no mode to compare against,
        # so nothing can be dominated by a single value.
        return df.copy()

    # First row of mode() holds one modal value per column; computed once.
    modes = df.mode().iloc[0, :]

    keep_positions = []
    for position in range(df.shape[1]):
        # Positional access on both sides: column labels may be integers,
        # strings or duplicated, so label-based lookup is not reliable here.
        mode_value = modes.iloc[position]
        share = (df.iloc[:, position] == mode_value).sum() / n_rows
        if share <= percent:
            keep_positions.append(position)
    return df.iloc[:, keep_positions]

### Eliminate outlier values of a single variable by IQR 

In [None]:
def outliers_iqr(ys):
    """Locate IQR outliers in a single variable.

    A value is an outlier when it lies more than 1.5 * IQR above the third
    quartile or below the first quartile.

    Parameters
    ----------
    ys : array-like
        Numeric values (list, numpy array or pandas Series).

    Returns
    -------
    tuple of numpy.ndarray
        The result of `np.where`: positional indices of the outliers.
    """
    # Coerce once so plain Python lists support the element-wise comparison
    # below (np.percentile already accepted them, the comparison did not).
    values = np.asarray(ys)
    quartile_1, quartile_3 = np.percentile(values, [25, 75])
    iqr = quartile_3 - quartile_1
    lower_bound = quartile_1 - (iqr * 1.5)
    upper_bound = quartile_3 + (iqr * 1.5)
    return np.where((values > upper_bound) | (values < lower_bound))

###  Print # of outliers

In [None]:
def print_num_outliers_IQR(df):
    """Print how many IQR outliers `df` contains.

    A cell is an outlier when it lies more than 1.5 times the interquartile
    range above the 3rd quartile or below the 1st quartile of its column.
    Two lines are printed: the total number of outlier cells, and the number
    of distinct rows holding at least one outlier together with that number
    divided by the row count.
    """
    lower_quartile = df.quantile(0.25)
    upper_quartile = df.quantile(0.75)
    whisker = 1.5 * (upper_quartile - lower_quartile)
    too_high = df > (upper_quartile + whisker)
    too_low = df < (lower_quartile - whisker)
    hits = np.where(too_high | too_low)
    outlier_rows = hits[0]

    print(f'Total # of outliers:  {len(outlier_rows)}')

    distinct_rows = len(np.unique(outlier_rows))
    row_share = distinct_rows / len(df)
    print(f'# of rows containing outliers:  {distinct_rows} ({row_share:.2f}  of data)')

### Return IQR outlier bounds per variable and the row indexes of outliers

In [None]:
def iqr_bounds_outlier_indexes(df):
    """Compute per-column IQR fences and collect the rows that violate them.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame to scan.

    Returns
    -------
    bounds : pandas.DataFrame
        One row per column of `df` (indexed by column name) with the columns
        'Variable', 'Lower_Bound' and 'Upper_Bound' (Q1 - 1.5*IQR and
        Q3 + 1.5*IQR).
    iqr_row_indexes : list
        Sorted, de-duplicated positional row indexes that hold an outlier in
        at least one column.
    """
    bounds = pd.DataFrame(columns=['Variable','Lower_Bound','Upper_Bound'])
    flagged_rows = set()
    for name in df.columns:
        series = df[name]
        q1, q3 = np.percentile(series, [25, 75])
        spread = q3 - q1
        low = q1 - (spread * 1.5)
        high = q3 + (spread * 1.5)
        bounds.loc[name] = [name, low, high]
        outside = (series > high) | (series < low)
        # np.where yields positional indexes; the set removes rows flagged
        # by more than one column.
        flagged_rows.update(np.where(outside)[0])
    return bounds, sorted(flagged_rows)

### Feature Importance by ExtraTreesClassifier

In [None]:
def feature_importance(x,y):
    """Return the feature importances of an ExtraTreesClassifier fitted on (x, y).

    Parameters
    ----------
    x : array-like / pandas.DataFrame
        Feature matrix passed to `fit`.
    y : pandas.DataFrame or pandas.Series
        Target; flattened to 1-D through `.values.ravel()`.

    Returns
    -------
    numpy.ndarray
        The fitted model's `feature_importances_`.
    """
    from sklearn.ensemble import ExtraTreesClassifier
    target = y.values.ravel()
    forest = ExtraTreesClassifier()
    forest.fit(x, target)
    return forest.feature_importances_

###  Feature Selection by Recursive Feature Elimination

In [None]:
# Recursive Feature Elimination: works by recursively removing attributes and
# building a model on those attributes that remain, keeping the `num_var`
# attributes that contribute the most to predicting the target attribute.
def RFE(x,y,num_var):
    """Select `num_var` features with recursive feature elimination.

    A LogisticRegression is used as the underlying estimator.

    Parameters
    ----------
    x : array-like / pandas.DataFrame
        Feature matrix.
    y : pandas.DataFrame or pandas.Series
        Target; flattened to 1-D through `.values.ravel()`.
    num_var : int
        Number of features to keep.

    Returns
    -------
    features_num : int
        Number of selected features (`n_features_`).
    r_square : float
        `score(x, y)` of the fitted selector. NOTE: for a classifier this is
        the estimator's score on the selected features (accuracy), not an
        R^2 value; the name is kept for backward compatibility.
    var_selected : numpy.ndarray of bool
        `support_` mask; True marks a selected feature.
    var_rank : numpy.ndarray of int
        `ranking_`; selected features have rank 1.
    """
    # Aliased so the scikit-learn class does not shadow this function's name.
    from sklearn.feature_selection import RFE as _RFESelector
    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression()
    # n_features_to_select must be passed by keyword: it is keyword-only in
    # current scikit-learn releases, and the keyword form works in older ones.
    rfe = _RFESelector(model, n_features_to_select=num_var)
    y = y.values.ravel()
    fit = rfe.fit(x, y)
    features_num = fit.n_features_
    r_square = fit.score(x, y)
    var_rank = fit.ranking_
    var_selected = fit.support_
    return features_num, r_square, var_selected, var_rank

###  Feature Selection by SelectKBest

In [None]:
def univariate_selection(x,y,num_var):
    """Pick the `num_var` features with the highest chi-squared score.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; its `columns` are used to name the selected features.
    y : array-like
        Target passed to `SelectKBest.fit`.
    num_var : int
        Number of features to keep (`k`).

    Returns
    -------
    chi_square : numpy.ndarray
        `scores_` of the fitted selector, one per input column.
    features : numpy.ndarray
        `x` reduced to the selected columns via `transform`.
    selected_cols : pandas.Index
        Names of the selected columns.
    """
    from sklearn.feature_selection import SelectKBest, chi2
    selector = SelectKBest(score_func=chi2, k=num_var)
    selector.fit(x, y)
    chi_square = selector.scores_
    features = selector.transform(x)
    selected_cols = x.columns[selector.get_support()]
    return chi_square, features, selected_cols

### Principal Component Analysis

In [None]:
def PCA(x,y,x_test,num_var):
    """Fit a PCA on `x` and project both `x` and `x_test`.

    Parameters
    ----------
    x : array-like
        Training features. The data should be scaled before PCA is applied.
    y : any
        Unused; kept so existing callers do not break.
    x_test : array-like
        Test features projected with the components learned from `x`.
    num_var : int
        Number of principal components (`n_components`).

    Returns
    -------
    ratio : numpy.ndarray
        `explained_variance_ratio_`; a cumulative total of roughly 70-80%
        is considered good.
    component_coef : numpy.ndarray
        `components_`: the coefficients that make up each component.
    component_df : numpy.ndarray
        Component values computed for every row of `x`.
    test_set_component_df : numpy.ndarray
        Component values computed for every row of `x_test`.
    """
    # Aliased so the scikit-learn class does not shadow this function's name.
    from sklearn.decomposition import PCA as _SkPCA
    pca = _SkPCA(n_components=num_var)
    # Fit exactly once so the reported ratio/components and both projections
    # all come from the same fitted model (previously fit twice).
    component_df = pca.fit_transform(x)
    ratio = pca.explained_variance_ratio_
    component_coef = pca.components_
    test_set_component_df = pca.transform(x_test)
    return ratio, component_coef, component_df, test_set_component_df

### Univariate Analysis Considering GINI Performance Measure

In [None]:
def Univariate_gini(X, y):
    """Rank features by the Gini of a one-variable logistic regression.

    For every column of `X` a LogisticRegression is fitted on that column
    alone, and Gini = 2 * AUC - 1 is computed from the predicted probability
    of the second class on the same data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix.
    y : array-like
        Binary target.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name with a single column "Gini Values", sorted in
        ascending order.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score
    gini = []
    for column in X.columns:
        # scikit-learn expects a 2-D feature matrix, hence the reshape.
        variable = np.array(X[column]).reshape(-1, 1)
        model = LogisticRegression()
        model.fit(variable, y)
        positive_prob = model.predict_proba(variable)[:, 1]
        auc_value = roc_auc_score(y, positive_prob)
        gini.append(2 * auc_value - 1)
    df_gini = pd.DataFrame(data=gini, index=X.columns, columns=["Gini Values"]).sort_values(by="Gini Values")
    return df_gini

### Find Optimal Depth of the Tree

In [None]:
def OptDepth_DecTree(x_train,y_train,var,depths=(1,2,3,4)):
    """Find the tree depth with the best cross-validated ROC AUC for one feature.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Binary target.
    var : int (or int-convertible)
        Positional index of the column of `x_train` to evaluate.
    depths : iterable of int, optional
        Candidate `max_depth` values; defaults to (1, 2, 3, 4).

    Returns
    -------
    int
        The candidate depth with the highest mean 3-fold ROC AUC. When
        several depths tie, the first (shallowest listed) one is returned.

    Raises
    ------
    ValueError
        If every candidate produced a NaN score (from `np.nanargmax`).
    """
    from sklearn.model_selection import cross_val_score
    from sklearn.tree import DecisionTreeClassifier
    depths = list(depths)
    feature = x_train.iloc[:, int(var)].to_frame()
    mean_scores = []  # mean roc auc per candidate depth
    for tree_depth in depths:
        tree_model = DecisionTreeClassifier(max_depth=tree_depth)
        scores = cross_val_score(tree_model, feature, y_train, cv=3, scoring='roc_auc')
        mean_scores.append(np.mean(scores))
    # Index of the maximum AUC; nanargmax ignores NaN folds and resolves
    # ties to the first maximum instead of failing on multiple matches.
    best = int(np.nanargmax(mean_scores))
    return depths[best]

### Categorizing variables by using Decision Tree

In [None]:
def Binnig_DecTree(x_train,y_train,x_test,opt_depth):
    """Discretise every column of `x_train` with a one-feature decision tree
    and apply the same bin edges to `x_test`.

    For each original column j a DecisionTreeClassifier with
    `max_depth=opt_depth[j]` is fitted on that column alone. Its predicted
    probability is stored as 'TREE_<column>' in `x_train`, bin edges are
    derived from the maximum column value inside each distinct predicted
    probability, and a categorical 'BIN_<column>' column is appended to both
    frames.

    Parameters
    ----------
    x_train : pandas.DataFrame
        Training features. NOTE: the first 'TREE_' column is assigned directly
        on the object passed in, so the caller's frame is modified.
    y_train : array-like
        Target used to fit the trees.
    x_test : pandas.DataFrame
        Test features; columns are addressed by position j, so this assumes
        the same column order as `x_train` -- TODO confirm with callers.
    opt_depth : sequence of int
        `max_depth` per column, indexed by column position.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        `x_train` with the added 'TREE_' and 'BIN_' columns, and `x_test`
        with the added 'BIN_' columns.
    """
    from sklearn.tree import DecisionTreeClassifier
    # Column count is fixed up front so the TREE_/BIN_ columns added inside
    # the loop are not themselves binned.
    c=len(x_train.columns)
    for j in range(c):       
        # Leaves must hold at least 10% of the training rows.
        tree_model=DecisionTreeClassifier(max_depth=opt_depth[j], min_samples_leaf=int(len(x_train)*0.1))
        tree_model.fit(x_train.iloc[:,j].to_frame(), y_train)            
        # Predicted probability of the second class (index 1 of predict_proba).
        x_train['TREE_'+str(x_train.iloc[:,j].name)]=tree_model.predict_proba(x_train.iloc[:,j].to_frame())[:,1] 

        # Largest raw value observed for each distinct predicted probability;
        # these become the candidate bin edges.
        bins=(pd.concat( [x_train.groupby (['TREE_'+str(x_train.iloc[:,j].name)]) [x_train.iloc [:,j].name].max()], axis=1))
        bins=np.sort(bins,0)  
        bins=[float(i) for i in bins] 
        # Replace the top edge and add a bottom edge with wide sentinels so
        # values outside the training range still fall into a bin.
        del bins[len(bins)-1] 
        bins.insert(0,-100000000) 
        bins.append(100000000)    
        # Bin labels are consecutive integers starting at 2.
        labels=[i+2 for i in range(len(bins)-1)]                  
        category_train = pd.cut(x_train.iloc [:,j],bins=bins,labels=labels) 
        category_train = category_train.to_frame()
        category_train.columns = ['BIN_'+x_train.iloc[:,j].name]       
        # Rebinds x_train to a new frame; later iterations work on that copy.
        x_train = pd.concat([x_train,category_train],axis = 1)
                
        # Same edges and labels applied to the test column at position j.
        category_test = pd.cut(x_test.iloc [:,j],bins=bins,labels=labels) 
        category_test = category_test.to_frame()
        category_test.columns = ['BIN_'+x_test.iloc[:,j].name]       
        x_test = pd.concat([x_test,category_test],axis = 1)
    return x_train,x_test

### Cross-Validation with AUC (Stratified KFold) 

In [None]:
def print_skf_cv_auc(x,y,n_splits,classifiers):
    """Cross-validate classifiers with StratifiedKFold and collect ROC AUCs.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix (indexed positionally with `.iloc`).
    y : pandas.DataFrame or pandas.Series
        Binary target (indexed positionally with `.iloc`).
    n_splits : int
        Number of stratified folds.
    classifiers : iterable
        Estimators exposing `fit` and `predict_proba`; each is refitted on
        every fold.

    Returns
    -------
    pandas.DataFrame
        One row per fold and one column per classifier (in the order given),
        holding the fold's test ROC AUC.

    Notes
    -----
    FutureWarning is silenced process-wide via `warnings.filterwarnings`.
    """
    import warnings
    from sklearn.model_selection import StratifiedKFold
    from sklearn.metrics import roc_auc_score
    warnings.filterwarnings("ignore", category=FutureWarning)
    cv = StratifiedKFold(n_splits=n_splits)
    fold_rows = []

    for train_index, test_index in cv.split(x,y):
        x_train_skf, x_test_skf = x.iloc[train_index], x.iloc[test_index]
        y_train_skf, y_test_skf = y.iloc[train_index], y.iloc[test_index]
        fold_aucs = []
        for clf in classifiers:
            probas_ = clf.fit(x_train_skf, y_train_skf.values.ravel()).predict_proba(x_test_skf)
            fold_aucs.append(roc_auc_score(y_test_skf, probas_[:, 1]))
        # Record every fold (previously only the last fold survived because
        # the append happened after the loop).
        fold_rows.append(fold_aucs)
    # Built in one go; DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(fold_rows)

### Cross-Validation by Train-Test-Initial Test Set AUC (Repeated KFold) 

In [None]:
def print_rkf_cv_auc(X,y,selected_cols,X_test,y_test,splits,repeats,classifiers):
    """Run repeated K-fold CV and print each classifier's mean fold-test AUC.

    For every split each classifier is fitted on the training part and its
    ROC AUC is measured on the training part (TRAIN_AUC), the held-out fold
    (TEST_AUC) and the separate test set restricted to `selected_cols`
    (VAL_AUC). Per classifier class name, the mean TEST_AUC and twice its
    standard deviation are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Features used for cross-validation (all columns are used).
    y : pandas.DataFrame or pandas.Series
        Target aligned positionally with `X`.
    selected_cols : list-like
        Columns of `X_test` fed to the fitted models.
    X_test, y_test :
        Independent test set.
    splits, repeats : int
        `n_splits` and `n_repeats` of RepeatedKFold (fixed random_state).
    classifiers : iterable
        Estimators exposing `fit` and `predict_proba`.

    Returns
    -------
    None
    """
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import RepeatedKFold
    rkf = RepeatedKFold(n_splits=splits, n_repeats=repeats, random_state=2652124)
    auc_cols = ["TRAIN_AUC", "TEST_AUC", "VAL_AUC"]
    log_cols = ["Classifier"] + auc_cols + ["sample id", "model name"]
    # Loop-invariant: the external test set never changes between splits.
    X_holdout = X_test.loc[:, selected_cols]
    records = []

    for sample_id, (train_index, test_index) in enumerate(rkf.split(X), start=1):
        X_train_rkf, X_test_rkf = X.iloc[train_index], X.iloc[test_index]
        y_train_rkf, y_test_rkf = y.iloc[train_index], y.iloc[test_index]

        for clf in classifiers:
            rfe_model = clf.fit(X_train_rkf, y_train_rkf.values.ravel())
            name = clf.__class__.__name__

            records.append({
                "Classifier": str(sample_id) + '-' + str(name),
                "TRAIN_AUC": roc_auc_score(y_train_rkf, rfe_model.predict_proba(X_train_rkf)[:,1]),
                "TEST_AUC": roc_auc_score(y_test_rkf, rfe_model.predict_proba(X_test_rkf)[:,1]),
                "VAL_AUC": roc_auc_score(y_test, rfe_model.predict_proba(X_holdout)[:,1]),
                # Stored directly rather than re-parsed from "Classifier".
                "sample id": str(sample_id),
                "model name": name,
            })

    # Rows are accumulated in a list: DataFrame.append was removed in pandas 2.0.
    log = pd.DataFrame(records, columns=log_cols)
    # Column selection on a GroupBy must be a list (tuple form was removed).
    grouped = log.groupby('model name')[auc_cols]
    auc_means = grouped.mean()
    auc_stds = grouped.std()

    for clf in classifiers:
        name = clf.__class__.__name__
        print(name, "rkf AUC average: %0.3f (+/- 2std %0.3f)" % (auc_means.loc[name, 'TEST_AUC'], auc_stds.loc[name, 'TEST_AUC'] * 2))