In [21]:
#!/usr/bin/python

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

%matplotlib inline

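#note: some of the functions below modify the input frames in place while others return new frames; always use the returned (training, test) pair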

#this one might not actually be that useful 
def drop_duplicate_cols(training, test):
    """Drop columns of ``training`` whose values duplicate an earlier column.

    Columns are compared pairwise with ``np.array_equal``.  Within every group
    of identical columns the first one (in column order) is kept and the
    others are dropped.  The same column names are then dropped from ``test``.

    Returns a new ``(training, test)`` pair; the frames passed in are not
    modified.
    """
    print("Removing duplicate columns")
    cols = training.columns
    remove = []
    flagged = set()
    for i, kept in enumerate(cols):
        if kept in flagged:
            # This column equals an earlier one, and that earlier pass has
            # already flagged every later column equal to it.
            continue
        v = training[kept].values
        for other in cols[i + 1:]:
            if other in flagged:
                continue
            if np.array_equal(v, training[other].values):
                flagged.add(other)
                remove.append(other)
    training = training.drop(remove, axis=1)
    test = test.drop(remove, axis=1)
    return training, test


def replace_var3_with_mean(training, test):
    """Replace the -999999 placeholder in the ``var3`` column with 2.

    Only ``var3`` is touched; the same value occurring in any other column is
    left as it is.

    Returns a new ``(training, test)`` pair; the frames passed in are not
    modified.
    """
    print("Replacing missing var3 country value with mean")
    # NOTE(review): the message says "mean" but the replacement is the
    # hard-coded constant 2 -- confirm which one is intended.
    # A {column: value} mapping limits the replacement to that column.
    training = training.replace({'var3': -999999}, 2)
    test = test.replace({'var3': -999999}, 2)
    return training, test

def one_hot_encode_countries(training, test):
    """One-hot encode ``var3`` in both frames and align test to train.

    ``var3`` is expanded with ``pd.get_dummies`` in each frame separately.
    Every column that exists only in the encoded training frame (this
    includes any non-dummy column that test lacks) is added to test filled
    with zeros.  Columns that exist only in test are discarded by the final
    reindexing, so the returned test frame has exactly the training columns,
    in the training order.

    Returns the encoded ``(train, test)`` pair as new frames.
    """
    print("One hot encoding countries")
    train = pd.get_dummies(training, columns=['var3'])
    test = pd.get_dummies(test, columns=['var3'])

    # columns present in only one of the two encoded frames
    only_in_train = np.setdiff1d(train.columns, test.columns)
    only_in_test = np.setdiff1d(test.columns, train.columns)
    print(only_in_train.shape)
    print(only_in_test.shape)

    # add the train-only columns to test, setting them equal to zero
    for c in only_in_train:
        test[c] = 0

    # select and reorder the test columns using the train columns
    test = test[train.columns]
    return train, test
    
def add_feature_for_sum_of_zeros(training, test):
    """Add an ``n0`` column holding the number of zero entries in each row.

    The count covers every column except ``TARGET``, which is excluded by
    name so that both frames are counted over the same columns whether or not
    the frame carries the label column.

    Both frames are modified in place and also returned as
    ``(training, test)``.
    """
    print("Adding a feature for the sum of zeros")
    for frame in (training, test):
        counted = [c for c in frame.columns if c != 'TARGET']
        frame['n0'] = (frame[counted] == 0).sum(axis=1)
    return training, test
    

def log_transform_var38_and_split_into_two_features(training, test):
    """Split ``var38`` into a mode indicator and a log-transformed value.

    For each frame two columns are added in place:

    * ``var38ismode`` -- True where ``var38`` is close (``np.isclose``) to
      117310.979016.
    * ``logvar38``    -- ``log(var38)`` for the remaining rows and 0 for the
      rows flagged by ``var38ismode``.

    Returns the same ``(training, test)`` frames.
    """
    print("Log transforming var38 and splitting var38 into two features")
    for frame in (training, test):
        frame['var38ismode'] = np.isclose(frame.var38, 117310.979016)
        at_mode = frame['var38ismode']
        # Rows at the mode get no log value here (NaN after alignment) ...
        frame['logvar38'] = frame.loc[~at_mode, 'var38'].map(np.log)
        # ... and are then filled with 0.
        frame.loc[at_mode, 'logvar38'] = 0

    return training, test
    
def add_top_5_principal_components(training, test):
    """Append five PCA component columns (``PCA_0`` .. ``PCA_4``) to both frames.

    The PCA is fitted on every ``training`` column except ``TARGET`` after
    passing the data through ``normalize(..., axis=0)``; the fitted model is
    then applied to the same columns of ``test``.

    Both frames are modified in place and also returned as
    ``(training, test)``.
    """
    print("Adding top 5 principal components")
    features = training.drop(['TARGET'], axis=1).columns
    pca = PCA(n_components=5)
    # NOTE(review): train and test go through separate normalize() calls, so
    # each frame is scaled on its own -- confirm this is intended.
    pca_training = pca.fit_transform(normalize(training[features], axis=0))
    pca_test = pca.transform(normalize(test[features], axis=0))

    for frame, components in ((training, pca_training), (test, pca_test)):
        for k in range(5):
            frame['PCA_%d' % k] = components[:, k]

    return training, test

def remove_low_variance_features(training, test, threshold):
    """Drop columns whose standard deviation in ``training`` is <= ``threshold``.

    ``TARGET``, ``var38ismode`` and ``var3`` are never dropped.  The selected
    columns are removed from both frames.

    Returns a new ``(training, test)`` pair; the frames passed in are not
    modified.
    """
    # NOTE(review): the message talks about variance, but the comparison
    # below uses the standard deviation (and <=) -- confirm which is meant.
    print("Removing features with variance less than %.3f" % threshold)
    protected = ('TARGET', 'var38ismode', 'var3')
    remove = [
        col for col in training.columns
        if training[col].std() <= threshold and col not in protected
    ]

    print("the features removed were", remove)
    return training.drop(remove, axis=1), test.drop(remove, axis=1)

def plot_low_variance_features(training, test):
    """Scatter-plot the variance of every ``training`` column on one line.

    The variance is computed as the squared standard deviation of each
    column and plotted on the x axis, with all points at y = 0.  ``test`` is
    accepted but not used.  Nothing is returned.
    """
    variances = [training[col].std() ** 2 for col in training.columns]
    plt.scatter(variances, [0] * len(variances))
    
def drop_ID(training, test):
    """Remove the ``ID`` column from both frames.

    Returns a new ``(training, test)`` pair; the frames passed in are not
    modified.
    """
    print("Dropping ID as feature")
    without_id = [frame.drop(['ID'], axis=1) for frame in (training, test)]
    return without_id[0], without_id[1]

def standardize_data(training, test):
    """Standardize the feature columns of both frames with one scaler.

    Every ``training`` column except ``TARGET``, ``var38ismode`` and ``var3``
    is passed through a ``StandardScaler`` that is fitted on ``training`` and
    then applied to ``test``.  The scaled values are rounded to 6 decimals.

    Both frames are modified in place and also returned as
    ``(training, test)``.
    """
    print("Standardizing features")
    features = list(training.columns)
    for skipped in ('TARGET', 'var38ismode', 'var3'):
        try:
            features.remove(skipped)
        except ValueError:
            # this column is not present in the frame; nothing to exclude
            pass

    scaler = StandardScaler()
    training[features] = np.round(scaler.fit_transform(training[features]), 6)
    test[features] = np.round(scaler.transform(test[features]), 6)
    return training, test

def add_k_means_cluster_as_feature(training, test):
    """Placeholder for a k-means cluster feature; currently a no-op.

    Returns the ``(training, test)`` pair exactly as it was received.
    """
    return (training, test)
    

def main(standardize=False, threshold=1.0, dropID=True):
    """Load ``train.csv`` / ``test.csv`` and run the feature pipeline.

    Steps, in order: drop duplicate columns, optionally drop ``ID``, replace
    the ``var3`` placeholder, add the zero-count feature, split/log-transform
    ``var38``, remove low-spread features, add the PCA components, optionally
    standardize, and finally one-hot encode ``var3``.

    standardize -- when true, run ``standardize_data`` before the encoding.
    threshold   -- cut-off passed to ``remove_low_variance_features``.
    dropID      -- when true, remove the ``ID`` column.

    Returns the processed ``(training, test)`` frames.
    """
    training, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

    training, test = drop_duplicate_cols(training, test)
    if dropID:
        training, test = drop_ID(training, test)

    # transformations that only need the two frames
    for step in (replace_var3_with_mean,
                 add_feature_for_sum_of_zeros,
                 log_transform_var38_and_split_into_two_features):
        training, test = step(training, test)

    training, test = remove_low_variance_features(training, test, threshold)
    training, test = add_top_5_principal_components(training, test)

    if standardize:
        training, test = standardize_data(training, test)
    return one_hot_encode_countries(training, test)
    
    
if __name__ == "__main__":
    # Run the full pipeline without standardization, keeping the default
    # threshold and ID handling.
    training, test = main(standardize=False)


Removing duplicate columns
Dropping ID as feature
Replacing missing var3 country value with mean
Adding a feature for the sum of zeros
Log transforming var38 and splitting var38 into two features
Removing features with variance less than 1.000
('the features removed were', ['ind_var1_0', 'ind_var1', 'ind_var2_0', 'ind_var5_0', 'ind_var5', 'ind_var6_0', 'ind_var6', 'ind_var8_0', 'ind_var8', 'ind_var12_0', 'ind_var12', 'ind_var13_0', 'ind_var13_corto_0', 'ind_var13_corto', 'ind_var13_largo_0', 'ind_var13_largo', 'ind_var13_medio_0', 'ind_var13', 'ind_var14_0', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var18_0', 'ind_var19', 'ind_var20_0', 'ind_var20', 'ind_var24_0', 'ind_var24', 'ind_var25_cte', 'ind_var26_0', 'ind_var26_cte', 'ind_var25_0', 'ind_var30_0', 'ind_var30', 'ind_var31_0', 'ind_var31', 'ind_var32_cte', 'ind_var32_0', 'ind_var33_0', 'ind_var33', 'ind_var34_0', 'ind_var37_cte', 'ind_var37_0', 'ind_var39_0', 'ind_var40_0', 'ind_var40', 'ind_var41_0', 'ind_var44_0', 'ind_var44