In [1]:
# imports
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier


def read_csv_data(file_name, debug, server=False, num_rows=200):
    if server:
        path = '/home/science/data/'
        df = pd.read_hdf(path + file_name + '.h5', 'data')
    else:
        path = '/home/gublu/Desktop/THINKSTATS/Competition/data/'
        if debug:
            df = pd.read_csv(path + file_name + '.csv', nrows=num_rows)
        else:
            df = pd.read_csv(path + file_name + '.csv')
    for col in list(df):
        if str(df[col].dtype) == 'category':
            df[col] = df[col].astype('object')
    return df
def label_encode_it(df):
    encode_these_columns = []
    for col in list(df):
        if col == 'TARGET':
            continue
        if str(df[col].dtype) in ['object', 'category']:
            encode_these_columns.append(col)
            df[col] = df[col].astype('category').cat.codes
    print(encode_these_columns, '**********')
    return df
def min_max_scale_it(df):
    cols = [col for col in df.columns if col not in ['TARGET', 'SK_ID_CURR']]
    for col in cols:
        try:
            df[col]  = df[col].fillna(df[col].mean())
            df[col]=(df[col]-df[col].min())/(df[col].max()-df[col].min())
        except:
            pass
    return df

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

In [2]:
df = read_csv_data('shiv', debug=False)


In [None]:
# impute and scale
df = df.replace(-np.inf, np.nan)
df = df.replace(np.inf, np.nan)

cols = [col for col in df.columns if col not in ['TARGET', 'SK_ID_CURR']]
df  = min_max_scale_it(df)
df[cols] = label_encode_it(df[cols])

train = df[df['TARGET'].notnull()]
test = df[df['TARGET'].isnull()]

train  = train.fillna(df.mean())
test  = test.fillna(df.mean())

cols_to_drop = []
for col in list(df):
    if col == 'TARGET':
        continue
    train[col].replace(np.inf, np.nan, inplace=True)
    test[col].replace(np.inf, np.nan, inplace=True)
    train[col].replace(-np.inf, np.nan, inplace=True)
    test[col].replace(-np.inf, np.nan, inplace=True)
    if train[col].isnull().any():
        cols_to_drop.append(col)
    if test[col].isnull().any():
        cols_to_drop.append(col)
    cols_to_drop = list(set(cols_to_drop))

train.drop(cols_to_drop, axis=1, inplace=True)
test.drop(cols_to_drop, axis=1, inplace=True)
#print(cols_to_drop, 'cols_to_drop')
print(train.shape, test.shape)
train_dataset = train.values
X = train_dataset[:,2:]
y = train_dataset[:,1]
y=y.astype('int')
#print(X)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


['CODE_GENDER', 'NEW_LIVE_IND_SUM'] **********


In [4]:
train_dataset = train.values
X = train_dataset[:,2:]
y = train_dataset[:,1]
y=y.astype('int')
test_dataset = test.values
X_test = test_dataset[:,2:]
print(type(X_test))
print(X.shape, y.shape, X_test.shape)


In [5]:
pca = PCA(n_components=6)
principalComponents = pca.fit_transform(X)
principalDf  = pd.DataFrame(data = principalComponents, columns = ['pc 1', 'pc 2', 'pc 3', 'pc 4', 'pc 6', 'pc 6'])
tr = pd.concat([principalDf, train[['SK_ID_CURR']]], axis = 1)

In [6]:
#https://stats.stackexchange.com/questions/144439/applying-pca-to-test-data-for-classification-purposes
tr.head()

Unnamed: 0,pc 1,pc 2,pc 3,pc 4,pc 6,pc 6.1,SK_ID_CURR
0,1.038744,0.660044,-1.735642,0.882836,0.552001,1.418907,100002
1,0.651097,-2.851242,0.014057,-0.093043,0.004091,1.178198,100003
2,-0.329808,3.382052,-0.395318,-3.106345,-3.26612,-0.809542,100004
3,-0.695169,0.92338,1.469455,1.313932,-0.023042,0.018509,100006
4,-0.411651,0.345127,0.044824,-0.318583,1.040057,-2.464344,100007


In [7]:
# transform new data using already fitted pca
# (don't re-fit the pca)
X_test_transformed = pca.transform(X_test)


In [None]:
pca.explained_variance_ratio_

In [8]:
test_principalDf  = pd.DataFrame(data = X_test_transformed, columns = ['pc 1', 'pc 2', 'pc 3', 'pc 4', 'pc 6', 'pc 6'])
te = pd.concat([test_principalDf, test[['SK_ID_CURR']]], axis = 1)

In [15]:
X_test.shape
te.shape
#test[['SK_ID_CURR']].shape
test.head()


Unnamed: 0,SK_ID_CURR,TARGET,DAYS_BIRTH,NEW_IS_DAYS_EMPLOYED_365243,AMT_CREDIT,DAYS_ID_PUBLISH,DAYS_REGISTRATION,REGION_POPULATION_RELATIVE,CODE_GENDER,AMT_INCOME_TOTAL,...,EXT_SOURCES_sum,NEWLY_EMPLOYED,YOUNG_AGED,NEW_DOC_IND_KURT,NEW_NUMBER_OF_DOCUMENTS_SUBMITTED,NEW_LIVE_IND_SUM,NEW_CONTACT_IND_SUM,NEW_CONTACT_IND_KURT,NEW_REG_IND_SUM,NEW_REG_IND_KURT
307511,100001,0.080729,-0.73409,0.0,-0.047582,1.44283,-0.052851,-0.149312,-0.717679,-0.157114,...,1.171981,1.358378,0.759673,0.342151,0.192768,-0.354006,0.728298,-0.063913,-0.546777,0.430763
307512,100005,0.080729,-0.464062,0.0,-0.91565,0.908539,-1.172793,1.069445,1.393241,-0.318183,...,0.291719,1.358378,0.759673,0.342151,0.192768,-0.354006,-0.4215,-0.569196,-0.546777,0.430763
307513,100013,0.080729,-0.916939,0.0,0.189393,-0.330014,0.796037,-0.130628,1.393241,0.14489,...,0.338724,1.358378,0.759673,0.342151,0.192768,1.545982,-0.4215,-0.569196,-0.546777,0.430763
307514,100028,0.080729,0.473812,0.0,2.476606,-0.794472,0.845938,0.393955,-0.717679,0.648231,...,1.058744,-0.736172,-1.316356,0.342151,0.192768,-0.354006,0.728298,-0.063913,-0.546777,0.430763
307515,100038,0.080729,0.68855,0.0,0.094657,-0.830048,0.278879,-0.782404,1.393241,0.044222,...,-1.120545,1.358378,-1.316356,0.342151,0.192768,0.595988,0.728298,-0.063913,1.292897,-1.425466
