In [1]:
import numpy as np
import pandas as pd
import _pickle as pickle
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# Load the zip-compressed CSV into a DataFrame.
data = pd.read_csv(
    './data/data_clean1.csv.zip',
    compression='zip',
)

In [3]:
# Summarise the loaded frame: dtype / memory report, shape, first rows.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
data.info()
print(data.shape)
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53801 entries, 0 to 53800
Columns: 4075 entries, 1 to pc_corr_652
dtypes: float64(4075)
memory usage: 1.6 GB
None
(53801, 4075)


Unnamed: 0,1,2,4,5,6,9,10,11,12,14,...,pc_corr_643,pc_corr_644,pc_corr_645,pc_corr_646,pc_corr_647,pc_corr_648,pc_corr_649,pc_corr_650,pc_corr_651,pc_corr_652
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00034,-0.000432,-0.000735,-0.001473,-0.004528,-0.001638,-0.000272,-0.001043,-0.000123,-0.000557
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00034,-0.000432,-0.000735,-0.001473,-0.004528,-0.001638,-0.000272,-0.001043,-0.000123,-0.000557
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00034,-0.000432,-0.000735,-0.001473,-0.004528,-0.001638,-0.000272,-0.001043,-0.000123,-0.000557
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00034,-0.000432,-0.000735,-0.001473,-0.004528,-0.001638,-0.000272,-0.001043,-0.000123,-0.000557
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.00034,-0.000432,-0.000735,-0.001473,-0.004528,-0.001638,-0.000272,-0.001043,-0.000123,-0.000557


In [4]:
# A column whose single most frequent value covers at least SKEW_THRESHOLD of
# the rows is treated as "highly skewed": it is almost constant, so
# (in my opinion) it doesn't contain much useful information on its own.
SKEW_THRESHOLD = .9    # minimum share of rows taken by the most common value
N_SKEW_COMPONENTS = 4  # principal components kept for the skewed columns
PCA_SEED = 0           # fixed seed so the decomposition is reproducible

n_rows = data.shape[0]
h_sk_cols = []  # highly skewed columns
for col in data.columns:
    # value_counts() is sorted in descending order and ignores NaN, so the
    # first entry (if any) is the count of the most common non-null value.
    val_counts = data[col].value_counts()
    # A column with no non-null values has no counts at all; skip it rather
    # than raising IndexError on the lookup below.
    if len(val_counts) and val_counts.iloc[0] / n_rows >= SKEW_THRESHOLD:
        h_sk_cols.append(col)
#-------------------------------
print('highly skewed columns : ', len(h_sk_cols))
#-------------------------------
# Most of the columns end up flagged (see the count printed above), which is a
# lot. Instead of just deleting them, use PCA dimensionality reduction to
# compress them into N_SKEW_COMPONENTS principal components.
# scikit-learn's PCA centers every feature (without scaling it) before the
# decomposition, so no manual mean-centering is needed here; a MinMaxScaler
# would rescale to a range rather than center the data.
skewed_data = data[h_sk_cols].copy()

# For a matrix of this size the default solver policy may choose the
# randomized SVD, so pass a seed to get the same components on every run.
pca = PCA(n_components=N_SKEW_COMPONENTS, random_state=PCA_SEED)
new_data = pca.fit_transform(skewed_data)
# NOTE(review): in the recorded run these ratios summed to only about 0.03,
# i.e. the kept components retain very little of the skewed columns' variance.
print(pca.explained_variance_ratio_)
#--------------------------------


highly skewed columns :  4026
[0.01360958 0.00777679 0.00445035 0.00395292]


In [5]:
# Wrap the PCA scores in a DataFrame with one named column per component.
# Reusing data.index keeps the rows aligned in the concat below even when
# `data` does not carry a default RangeIndex (concat aligns on index labels).
columns = ['pc_skew_%d' % i for i in range(new_data.shape[1])]
PCs_df = pd.DataFrame(data=new_data, columns=columns, index=data.index)
# Replace the columns that went into the PCA with their principal components.
# drop() returns a new frame instead of mutating in place, and
# errors='ignore' (covering both the source columns and any pc_skew_* columns
# left by an earlier execution) lets this cell be re-run without a KeyError
# or duplicated component columns.
data = pd.concat(
    (data.drop(columns=h_sk_cols + columns, errors='ignore'), PCs_df),
    axis=1,
)
# Summarise the result. info() prints its own report and returns None, so it
# is called directly rather than through print().
data.info()
print(data.shape)
data.head(5)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53801 entries, 0 to 53800
Data columns (total 53 columns):
878            53801 non-null float64
2755           53801 non-null float64
2835           53801 non-null float64
3169           53801 non-null float64
3664           53801 non-null float64
3866           53801 non-null float64
4243           53801 non-null float64
4581           53801 non-null float64
4696           53801 non-null float64
pc_corr_3      53801 non-null float64
pc_corr_11     53801 non-null float64
pc_corr_34     53801 non-null float64
pc_corr_75     53801 non-null float64
pc_corr_96     53801 non-null float64
pc_corr_102    53801 non-null float64
pc_corr_105    53801 non-null float64
pc_corr_109    53801 non-null float64
pc_corr_118    53801 non-null float64
pc_corr_152    53801 non-null float64
pc_corr_157    53801 non-null float64
pc_corr_159    53801 non-null float64
pc_corr_161    53801 non-null float64
pc_corr_164    53801 non-null float64
pc_corr_177    53

Unnamed: 0,878,2755,2835,3169,3664,3866,4243,4581,4696,pc_corr_3,...,pc_corr_608,pc_corr_611,pc_corr_628,pc_corr_632,pc_corr_633,pc_corr_647,pc_skew_0,pc_skew_1,pc_skew_2,pc_skew_3
0,0.002621,0.001748,0.0,0.0,0.013081,0.0,0.012828,0.0,0.0,-0.004978,...,-0.004116,0.073875,-0.000421,-0.002428,-0.004557,-0.004528,-0.003519,0.018676,0.036603,0.002824
1,0.00233,0.005243,0.0,0.0,0.015135,0.0,0.0,0.041352,0.0,0.004164,...,-0.004116,-0.005149,-0.00392,-0.002428,-0.004557,-0.004528,-0.016889,0.005014,-0.002502,0.000838
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.004978,...,-0.004116,-0.005149,-0.00392,-0.002428,-0.004557,-0.004528,-0.035685,0.002233,-0.009078,0.003172
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.004978,...,-0.004116,-0.005149,-0.00392,-0.002428,-0.004557,-0.004528,-0.036835,0.002459,-0.009231,0.003281
4,0.109695,0.0,0.012515,0.0,0.0,0.0,0.0,0.0,0.0,0.003333,...,-0.004116,-0.005149,-0.00392,-0.002428,-0.004557,-0.004528,-0.027983,0.009722,-0.007418,-0.001761


In [6]:
# Preprocessing is finished: write the reduced frame to disk as a
# zip-compressed CSV, leaving the row index out of the file.
data.to_csv(
    './data/data_clean2.csv.zip',
    index=False,
    compression='zip',
)

In [7]:
# Split the frame by position: the first N_TRAIN_ROWS rows are written out as
# the training set and every remaining row as the test set.
# NOTE(review): 4459 is presumably the row count of the original training
# file that was stacked on top of the test rows -- confirm against the source.
N_TRAIN_ROWS = 4459
# Fail loudly instead of silently writing an empty test file when the frame
# is shorter than the split point.
assert len(data) > N_TRAIN_ROWS, 'data has too few rows to split into train and test'
train = data.iloc[:N_TRAIN_ROWS]
test = data.iloc[N_TRAIN_ROWS:]
train.to_csv('./data/final_train.csv.zip', compression='zip', index=False)
test.to_csv('./data/final_test.csv.zip', compression='zip', index=False)