In [1]:
import os
import pandas as pd
import numpy as np
import pickle

In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
Xtrain = pickle.load(open('./Xtrain.data', 'rb'))

In [4]:
Xtest = pickle.load(open('./Xtest.data', 'rb'))

In [5]:
Ytrain = pickle.load(open('./Ytrain.data', 'rb'))

In [6]:
display(Ytrain)

TransactionID
2987000    0
2987001    0
2987002    0
2987003    0
2987004    0
          ..
3133511    1
3471069    1
3272146    1
3091445    1
3038603    1
Name: isFraud, Length: 1139754, dtype: int8

In [7]:
Xtrainindex = Xtrain.index
Xtestindex = Xtest.index

In [8]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [9]:
pca = PCA(n_components=2)
lda = LinearDiscriminantAnalysis(n_components=2)

In [10]:
pca.fit(Xtrain)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [11]:
lda.fit(Xtrain, Ytrain)



LinearDiscriminantAnalysis(n_components=2, priors=None, shrinkage=None,
                           solver='svd', store_covariance=False, tol=0.0001)

In [14]:
Xtrain_pca = pca.transform(Xtrain)
Xtest_pca = pca.transform(Xtest)
Xtrain_lda = lda.transform(Xtrain)
Xtest_lda = lda.transform(Xtest)
display(Xtrain_pca.shape)
display(Xtest_pca.shape)
display(Xtrain_lda.shape)
display(Xtest_lda.shape)

(1139754, 2)

(506691, 2)

(1139754, 1)

(506691, 1)

In [15]:
Xtrain_pca = reduce_mem_usage(pd.DataFrame(data=Xtrain_pca, index=Xtrainindex))
Xtest_pca = reduce_mem_usage(pd.DataFrame(data=Xtest_pca, index=Xtestindex))
Xtrain_lda = reduce_mem_usage(pd.DataFrame(data=Xtrain_lda, index=Xtrainindex))
Xtest_lda = reduce_mem_usage(pd.DataFrame(data=Xtrain_lda, index=Xtrainindex))
pickle.dump(Xtrain_pca, open('./Xtrain_pca_2.data', 'wb'))
pickle.dump(Xtest_pca, open('./Xtest_pca_2.data', 'wb'))
pickle.dump(Xtrain_lda, open('./Xtrain_lda_2.data', 'wb'))
pickle.dump(Xtest_lda, open('./Xtest_lda_2.data', 'wb'))

Mem. usage decreased to 13.04 Mb (50.0% reduction)
Mem. usage decreased to  5.80 Mb (50.0% reduction)
Mem. usage decreased to 10.87 Mb (37.5% reduction)
Mem. usage decreased to 10.87 Mb (0.0% reduction)
