In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

# Load the training data file and the competition data file.
# The competition data is unseen data, for which we do not know the results (at all).

# 1 = mac, 2 = pc
platform = 1
trainingDataFile = ""
competitionDataFile = ""

# NOTE(review): these are machine-specific absolute paths; consider deriving them
# from a single configurable data directory.
if platform == 1:
    trainingDataFile = "/Abby/Resources/MLData/santander-customer-transaction-prediction/train.csv"
    competitionDataFile = "/Abby/Resources/MLData/santander-customer-transaction-prediction/test.csv"

elif platform == 2:
    trainingDataFile = "E:\\Resources\\MLData\\Kaggle\\santander-customer-transaction-prediction\\train.csv"
    competitionDataFile = "E:\\Resources\\MLData\\Kaggle\\santander-customer-transaction-prediction\\test.csv"

else:
    # Fail with a clear message instead of trying to open an empty path.
    raise ValueError("Unsupported platform: {!r} (expected 1 = mac or 2 = pc)".format(platform))

# Kept so that any cell referring to the starting directory still finds it.
currentDirectory = os.getcwd()

# Read by full path. The working directory is never changed, so a failed read
# cannot leave the kernel sitting in the data directory.
dfTrainingData = pd.read_csv(trainingDataFile)
dfCompetitionData = pd.read_csv(competitionDataFile)

In [None]:
#
# OPTIONAL RUN STEP
#
# Dimensions (rows, columns) of the training set, then of the competition set.
print(dfTrainingData.shape, dfCompetitionData.shape, sep="\n")

In [None]:
#
# OPTIONAL RUN STEP
#
# Preview: the first 5 rows of the training data.
dfTrainingData.head(n=5)

In [None]:
#
# OPTIONAL RUN STEP
#
# Preview: the first 5 rows of the competition data.
dfCompetitionData.head(n=5)

In [2]:
# Use the ID_code column as the row index of both data sets.
# Guarded on the index name so that re-running this cell is a no-op: after the
# first run ID_code is the index (no longer a column) and an unguarded
# set_index('ID_code') would raise KeyError.
if dfTrainingData.index.name != 'ID_code':
    dfTrainingData = dfTrainingData.set_index('ID_code')
if dfCompetitionData.index.name != 'ID_code':
    dfCompetitionData = dfCompetitionData.set_index('ID_code')

# Memory usage and optimization 

In [None]:
#
# OPTIONAL RUN STEP
#

# Memory footprint of the training data.
# memory_usage='deep' asks pandas for the real memory usage instead of a rough estimate.
dfTrainingData.info(verbose=True, max_cols=300, memory_usage='deep')

# How to read the results (figures as noted for the original run - re-check against the output):
# - there are 201 columns and 200,000 rows
# - 200 columns are float64
# - 1 column is int64
# - every column contains 200,000 values - no missing values anywhere.

In [None]:
#
# OPTIONAL RUN STEP
#
# Memory footprint of the competition data (same settings as for the training data).
dfCompetitionData.info(verbose=True, max_cols=300, memory_usage='deep')

In [3]:
# Separate dfTrainingData into the feature columns (var_0 .. var_199) and the target column.
features = ['var_{}'.format(i) for i in range(200)]
dfTrainingFeatureVariables = dfTrainingData[features]
# Double brackets keep the target as a single-column DataFrame rather than a Series.
dfTrainingTargetVariable = dfTrainingData[['target']]

In [None]:
#
# OPTIONAL RUN STEP
#
# Average per-column memory usage of the training data, reported separately for
# the float, int and object columns.
for dtype in ('float', 'int', 'object'):
    selected_dtype = dfTrainingData.select_dtypes(include=[dtype])
    # memory_usage(deep=True) is in bytes; 1024 ** 2 bytes make one MB.
    mean_usage_mb = selected_dtype.memory_usage(deep=True).mean() / 1024 ** 2
    print("Average memory usage for {} columns: {:03.2f} MB".format(dtype, mean_usage_mb))

# normalized and standardized copy of features 

In [4]:
# Use MinMaxScaler to rescale every feature in dfTrainingFeatureVariables to the [0, 1] range.
# note - all 200 feature variables in this case are numeric
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Fit on training set only.
scaler.fit(dfTrainingFeatureVariables.values)

# Apply the transform to both the training set and the competition set.
# The competition columns are selected in the training-feature order so each
# column is scaled with the statistics of the same feature.
# Output is a NumPy ndarray. Competition values can fall slightly outside [0, 1]
# because the scaler only saw the training data.
ndNormalizedTrainingFeatureVariables = scaler.transform(dfTrainingFeatureVariables.values)
ndNormalizedCompetitionData = scaler.transform(
    dfCompetitionData.loc[:, dfTrainingFeatureVariables.columns].values
)

# Wrap the arrays in DataFrames, keeping the original row index and column
# names. A bare ndarray would get a fresh 0..n-1 index and could no longer be
# aligned with dfTrainingTargetVariable by label.
dfNormalizedTrainingFeatureVariables = pd.DataFrame(
    data=ndNormalizedTrainingFeatureVariables,
    index=dfTrainingFeatureVariables.index,
    columns=dfTrainingFeatureVariables.columns,
)

dfNormalizedCompetitionData = pd.DataFrame(
    data=ndNormalizedCompetitionData,
    index=dfCompetitionData.index,
    columns=dfTrainingFeatureVariables.columns,
)

In [5]:
# Use RobustScaler to scale every feature in dfTrainingFeatureVariables.
# Unlike MinMaxScaler this does NOT map values into [0, 1]: with its default
# settings RobustScaler removes the median and scales by the interquartile
# range, which makes the scaling less sensitive to outliers.
# note - all 200 feature variables in this case are numeric
from sklearn.preprocessing import RobustScaler
robust_scaler = RobustScaler()

# Fit on training set only.
robust_scaler.fit(dfTrainingFeatureVariables.values)

# Apply the transform to both the training set and the competition set.
# The competition columns are selected in the training-feature order so each
# column is scaled with the statistics of the same feature.
# Output is a NumPy ndarray.
ndScaledTrainingFeatureVariables = robust_scaler.transform(dfTrainingFeatureVariables.values)
ndScaledCompetitionData = robust_scaler.transform(
    dfCompetitionData.loc[:, dfTrainingFeatureVariables.columns].values
)

# Wrap the arrays in DataFrames, keeping the original row index and column
# names so the scaled rows can still be matched to their targets by label.
dfScaledTrainingFeatureVariables = pd.DataFrame(
    data=ndScaledTrainingFeatureVariables,
    index=dfTrainingFeatureVariables.index,
    columns=dfTrainingFeatureVariables.columns,
)

dfScaledCompetitionData = pd.DataFrame(
    data=ndScaledCompetitionData,
    index=dfCompetitionData.index,
    columns=dfTrainingFeatureVariables.columns,
)

# Outlier detection, removal, and imputation - zScore (on Scaled)

In [68]:
from scipy import stats

def impute_zscore_outliers_with_median(df_input, z_score_threshold, copy):
    """Replace z-score outliers with the median of their column.

    A value is an outlier when the absolute z-score of the value, computed per
    column with ``scipy.stats.zscore``, is strictly greater than
    ``z_score_threshold``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric data to clean.
    z_score_threshold : float
        Absolute z-score above which a value is replaced.
    copy : bool
        If true, ``df_input`` is left untouched and a new DataFrame is
        returned. Otherwise ``df_input`` is modified in place and returned.

    Returns
    -------
    pandas.DataFrame
        The frame with outliers replaced by their column median.
    """
    # Work on the raw values so the z-scores are a plain 2-d ndarray with the
    # same shape as df_input.
    zscores = np.abs(stats.zscore(df_input.values))
    is_outlier = zscores > z_score_threshold

    # Medians come from the untouched input, before anything is replaced.
    median_values = df_input.median()

    # mask(..., axis=1) matches each median to its column by label. Indexing
    # the median Series with a column *position* is a label lookup, which is
    # wrong (or a KeyError) when the column labels are integers.
    if copy:
        return df_input.mask(is_outlier, median_values, axis=1)

    df_input.mask(is_outlier, median_values, axis=1, inplace=True)
    return df_input


def delete_zscore_outliers(df_input, z_score_threshold):
    """Return the rows of ``df_input`` that contain no z-score outlier.

    A value is an outlier when the absolute z-score of the value, computed per
    column with ``scipy.stats.zscore``, is strictly greater than
    ``z_score_threshold`` - the same rule as
    ``impute_zscore_outliers_with_median``.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric data to filter.
    z_score_threshold : float
        Absolute z-score above which a value counts as an outlier.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df_input`` without any outlier, with their original
        index labels. ``df_input`` itself is not modified.
    """
    # 2-d array of absolute z-scores, same shape as df_input.
    zscores = np.abs(stats.zscore(df_input.values))

    # One boolean per row: True if any value in the row is an outlier.
    # Testing "> threshold" (rather than keeping "< threshold") matters for
    # undefined z-scores: a constant column yields NaN, and NaN < threshold is
    # False, which would silently discard every row.
    row_has_outlier = (zscores > z_score_threshold).any(axis=1)

    return df_input[~row_has_outlier]

In [74]:
# Apply both z-score outlier strategies (threshold 3) to the robust-scaled
# training features, using the helper functions defined in the previous cell:
#   - delete_zscore_outliers returns only the rows that pass the z-score test
#   - impute_zscore_outliers_with_median replaces outlying values with the
#     column median; copy=True leaves dfScaledTrainingFeatureVariables untouched
dfScaledTrainingFeatureVariables_WithoutOutliers = delete_zscore_outliers(
    dfScaledTrainingFeatureVariables, 3)
dfScaledTrainingFeaturesVariables_ImputedOutliers = impute_zscore_outliers_with_median(
    dfScaledTrainingFeatureVariables, 3, copy=True)

# Outlier detection, removal, and imputation IQR (Normalized)

In [80]:
from scipy import stats

def impute_iqr_outliers_with_median(df_input, copy):
    """Replace IQR outliers with the median of their column.

    For every column the acceptable range is
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` where ``IQR = Q3 - Q1``. Values
    strictly outside that range are replaced by the column median.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric data to clean.
    copy : bool
        If true, ``df_input`` is left untouched and a new DataFrame is
        returned. Otherwise ``df_input`` is modified in place and returned.

    Returns
    -------
    pandas.DataFrame
        The frame with outliers replaced by their column median.
    """
    # Interquartile range of each column.
    Q1 = df_input.quantile(0.25)
    Q3 = df_input.quantile(0.75)
    IQR = Q3 - Q1

    # Min and max acceptable values for each column.
    MIN = (Q1 - 1.5 * IQR)
    MAX = (Q3 + 1.5 * IQR)

    # Boolean frame, True where a value lies outside the acceptable range.
    is_outlier = (df_input < MIN) | (df_input > MAX)

    # Medians come from the untouched input, before anything is replaced.
    median_values = df_input.median()

    # mask(..., axis=1) matches each median to its column by label. Indexing
    # the median Series with a column *position* is a label lookup, which is
    # wrong (or a KeyError) when the column labels are integers.
    if copy:
        return df_input.mask(is_outlier, median_values, axis=1)

    df_input.mask(is_outlier, median_values, axis=1, inplace=True)
    return df_input



def delete_iqr_outliers(df_input):
    """Return the rows of ``df_input`` that contain no IQR outlier.

    A value is an outlier when it is strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR`` of its column (``IQR = Q3 - Q1``).
    A row is dropped as soon as one of its values is an outlier.
    ``df_input`` itself is not modified.
    """
    # Both quartiles of every column in one call.
    quartiles = df_input.quantile([0.25, 0.75])
    first_quartile = quartiles.loc[0.25]
    third_quartile = quartiles.loc[0.75]

    # Acceptable range for each column.
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker

    # Flag values outside the range, then rows holding at least one of them.
    too_low = df_input.lt(lower_bound, axis='columns')
    too_high = df_input.gt(upper_bound, axis='columns')
    row_has_outlier = (too_low | too_high).any(axis=1)

    return df_input[~row_has_outlier]

In [81]:
# Apply both IQR outlier strategies to the normalized (MinMax) training
# features, using the helper functions defined in the previous cell:
#   - delete_iqr_outliers returns only the rows without any value outside
#     [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]
#   - impute_iqr_outliers_with_median replaces such values with the column
#     median; copy=True leaves dfNormalizedTrainingFeatureVariables untouched
dfNormalizedTrainingFeatureVariables_WithoutOutliers = delete_iqr_outliers(
    dfNormalizedTrainingFeatureVariables)
dfNormalizedTrainingFeatureVariables_ImputedOutliers = impute_iqr_outliers_with_median(
    dfNormalizedTrainingFeatureVariables, copy=True)

In [82]:
# Number of rows removed by the IQR row-deletion step.
numRowsDropped = (
    len(dfNormalizedTrainingFeatureVariables)
    - len(dfNormalizedTrainingFeatureVariables_WithoutOutliers)
)
numRowsDropped

24896

In [83]:
# Same count for the imputed frame: the difference in row count between the
# normalized frame and the frame with imputed outliers.
numRowsDropped = (
    len(dfNormalizedTrainingFeatureVariables)
    - len(dfNormalizedTrainingFeatureVariables_ImputedOutliers)
)
numRowsDropped

0

In [None]:
# Small demonstration of the z-score outlier handling on the iris data set.
from sklearn.datasets import load_iris
dataset = load_iris()

df_iris_features = pd.DataFrame(data = dataset.data, columns=dataset.feature_names)

# absolute z-scores of every value, per column
zscores_iris = np.abs(stats.zscore(df_iris_features))

# delete all rows that contain a value whose z-score is not below 2
iris_data_without_outliers = df_iris_features[(zscores_iris < 2).all(axis=1)]

# Replace every value with z-score > 2 by its column median, on a copy.
# This uses the helper defined earlier in the notebook. The previous call to
# impute_outliers_with_median referred to a function that does not exist
# (NameError), and the hand-written loop that followed duplicated the helper.
df_iris_features_with_imputed_outliers = impute_zscore_outliers_with_median(df_iris_features, 2, True)

In [None]:
#
# OPTIONAL RUN STEP
#

# Summary statistics of the normalized training features (without the target).
# note - unique, top, and frequency are only available for categorical data
dfSummary1 = dfNormalizedTrainingFeatureVariables.describe(include='all')

# show at most the first 11 rows of the summary
dfSummary1.head(n=11)

In [None]:
#
# OPTIONAL RUN STEP
#

# Summary statistics of the normalized competition data.
# dfNormalizedCompetitionData was already built, with its column names, in the
# MinMax scaling cell. It is described as-is: rebuilding it here from the bare
# ndarray would silently replace it with an unlabelled copy for every later cell.
# note - unique, top, and frequency are only available for categorical data
dfSummary2 = dfNormalizedCompetitionData.describe(include='all')

dfSummary2.head(11)

In [None]:
# =====================================================
# ==== PCA dimensionality reduction  ====
# =====================================================

In [None]:
# PCA keeping 95% down to 50% of the variance, in steps of 5%.
# A float n_components between 0 and 1 makes PCA keep as many components as
# are needed to explain at least that fraction of the variance.
# NOTE(review): ndStandardizedTrainingFeatureVariables and
# ndStandardizedCompetitionData are not created in the cells visible here (only
# the MinMax and RobustScaler outputs are) - confirm they exist before this cell runs.
from sklearn.decomposition import PCA
pca95 = PCA(.95)
pca90 = PCA(.90)
pca85 = PCA(.85)
pca80 = PCA(.80)
pca75 = PCA(.75)
pca70 = PCA(.70)
pca65 = PCA(.65)
pca60 = PCA(.60)
pca55 = PCA(.55)
pca50 = PCA(.50)

# Fit each PCA on the training features and project them.
ndPCA95_StandardizedTrainingFeatureVariables = pca95.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA90_StandardizedTrainingFeatureVariables = pca90.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA85_StandardizedTrainingFeatureVariables = pca85.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA80_StandardizedTrainingFeatureVariables = pca80.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA75_StandardizedTrainingFeatureVariables = pca75.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA70_StandardizedTrainingFeatureVariables = pca70.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA65_StandardizedTrainingFeatureVariables = pca65.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA60_StandardizedTrainingFeatureVariables = pca60.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA55_StandardizedTrainingFeatureVariables = pca55.fit_transform(ndStandardizedTrainingFeatureVariables)
ndPCA50_StandardizedTrainingFeatureVariables = pca50.fit_transform(ndStandardizedTrainingFeatureVariables)


# Project the competition data with the components learned from the TRAINING
# data: transform(), not fit_transform(). Refitting on the competition data
# would give different axes (and possibly a different number of components),
# so a model trained on the training projection could not be applied to it.
ndPCA95_StandardizedCompetitionData = pca95.transform(ndStandardizedCompetitionData)
ndPCA90_StandardizedCompetitionData = pca90.transform(ndStandardizedCompetitionData)
ndPCA85_StandardizedCompetitionData = pca85.transform(ndStandardizedCompetitionData)
ndPCA80_StandardizedCompetitionData = pca80.transform(ndStandardizedCompetitionData)
ndPCA75_StandardizedCompetitionData = pca75.transform(ndStandardizedCompetitionData)
ndPCA70_StandardizedCompetitionData = pca70.transform(ndStandardizedCompetitionData)
ndPCA65_StandardizedCompetitionData = pca65.transform(ndStandardizedCompetitionData)
ndPCA60_StandardizedCompetitionData = pca60.transform(ndStandardizedCompetitionData)
ndPCA55_StandardizedCompetitionData = pca55.transform(ndStandardizedCompetitionData)
ndPCA50_StandardizedCompetitionData = pca50.transform(ndStandardizedCompetitionData)

In [None]:
# Shape of each PCA-reduced training set, from 95% down to 50% retained variance,
# one shape per line.
print(
    *(
        reduced.shape
        for reduced in (
            ndPCA95_StandardizedTrainingFeatureVariables,
            ndPCA90_StandardizedTrainingFeatureVariables,
            ndPCA85_StandardizedTrainingFeatureVariables,
            ndPCA80_StandardizedTrainingFeatureVariables,
            ndPCA75_StandardizedTrainingFeatureVariables,
            ndPCA70_StandardizedTrainingFeatureVariables,
            ndPCA65_StandardizedTrainingFeatureVariables,
            ndPCA60_StandardizedTrainingFeatureVariables,
            ndPCA55_StandardizedTrainingFeatureVariables,
            ndPCA50_StandardizedTrainingFeatureVariables,
        )
    ),
    sep="\n"
)

In [None]:
# Shape of each PCA-reduced competition set, from 95% down to 50% retained variance,
# one shape per line.
print(
    *(
        reduced.shape
        for reduced in (
            ndPCA95_StandardizedCompetitionData,
            ndPCA90_StandardizedCompetitionData,
            ndPCA85_StandardizedCompetitionData,
            ndPCA80_StandardizedCompetitionData,
            ndPCA75_StandardizedCompetitionData,
            ndPCA70_StandardizedCompetitionData,
            ndPCA65_StandardizedCompetitionData,
            ndPCA60_StandardizedCompetitionData,
            ndPCA55_StandardizedCompetitionData,
            ndPCA50_StandardizedCompetitionData,
        )
    ),
    sep="\n"
)

In [None]:
# Train/test split on the dataset that has NOT been PCA-decomposed:
# 25% of the rows are held out for testing, with a fixed seed for repeatability.
# NOTE(review): dfStandardizedTrainingData is not created in the cells visible
# here - confirm it exists before this cell runs.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(
    dfStandardizedTrainingData,
    dfTrainingTargetVariable,
    test_size=0.25,
    random_state=17
)


In [None]:
# Logistic regression baseline, evaluated with the area under the ROC curve.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc

logisticRegr = LogisticRegression()
# y_train is a single-column DataFrame (the target was selected with a list of
# columns); scikit-learn expects a 1-d array of labels, so flatten it.
logisticRegr.fit(x_train, y_train.values.ravel())

# hard class labels for the held-out rows
predictions = logisticRegr.predict(x_test)

# A ROC curve compares the TRUE labels (y_test, not the feature matrix x_test)
# with a continuous score. The predicted probability of the positive class is
# used: hard 0/1 predictions would collapse the curve to a single threshold.
# Column 1 of predict_proba is the second entry of logisticRegr.classes_
# (assumes a binary 0/1 target - TODO confirm).
positiveClassProbabilities = logisticRegr.predict_proba(x_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test.values.ravel(), positiveClassProbabilities)
roc_auc = auc(fpr, tpr)
print(roc_auc)