In [1]:
import os
import pandas as pd
import numpy as np
import pickle

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:
files = ['../input/ieee-fraud-detection/test_identity.csv', 
         '../input/ieee-fraud-detection/test_transaction.csv',
         '../input/ieee-fraud-detection/train_identity.csv',
         '../input/ieee-fraud-detection/train_transaction.csv',
         '../input/ieee-fraud-detection/sample_submission.csv']

def load_data(file):
    return reduce_mem_usage(pd.read_csv(file))

In [4]:
test_identity = load_data(files[0])
test_transaction = load_data(files[1])
train_identity = load_data(files[2])
train_transaction = load_data(files[3])

Mem. usage decreased to 25.44 Mb (42.7% reduction)
Mem. usage decreased to 472.59 Mb (68.9% reduction)
Mem. usage decreased to 25.86 Mb (42.7% reduction)
Mem. usage decreased to 542.35 Mb (69.4% reduction)


In [5]:
test_identity = test_identity.set_index(['TransactionID'])
test_transaction = test_transaction.set_index(['TransactionID'])
train_identity = train_identity.set_index(['TransactionID'])
train_transaction = train_transaction.set_index(['TransactionID'])
print(train_identity.shape)
print(train_transaction.shape)
print(test_identity.shape)
print(test_transaction.shape)

(144233, 40)
(590540, 393)
(141907, 40)
(506691, 392)


In [6]:
def check_null_values(df):
    for column in df.columns:
        print('Column Name:', column, '\nHas null values:',df[column].isna().any(), '\nAmount of null values:', df[column].isna().sum())
        print('Values - Count')
        print(df[column].value_counts())
        print()

In [7]:
def replace_null_values(df):
    for column in df.columns:
        if(df[column].isna().any()):
            print(column, df[column].isna().sum())
            def replace():
                global i
                values = df[column].value_counts().index
                counts = df[column].value_counts().values
                counts = counts/counts.sum()
                yield np.random.choice(values,replace=False, p=counts)
            new_value = replace()
            df[column].fillna(value=next(new_value), inplace=True)
    for column in df.columns:
        print(column, df[column].isna().sum())
        if(df[column].isna().any()):
            print('Column:', column, 'has null values')
        else:
            print('No Null values in column')

In [8]:
df_train = train_transaction.join(train_identity, on='TransactionID',how='left', sort=True)
df_test = test_transaction.join(test_identity, on='TransactionID', how='left', sort=True)

In [9]:
print(df_train.shape)
print(df_test.shape)

(590540, 433)
(506691, 432)


In [10]:
del(train_identity)
del(train_transaction)
del(test_identity)
del(test_transaction)

In [11]:
df_train['isFraud'].fillna(value=0.0, inplace=True)
df_test['isFraud'] = -1

In [12]:
print(df_train.shape)
print(df_test.shape)

(590540, 433)
(506691, 433)


In [13]:
df_X = pd.concat([df_train, df_test], axis=0)
print(df_X.shape)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


(1097231, 433)


In [14]:
df_X['id_30'].unique()

array([nan, 'Android 7.0', 'iOS 11.1.2', 'Mac OS X 10_11_6', 'Windows 10',
       'Android', 'Linux', 'iOS 11.0.3', 'Mac OS X 10_7_5',
       'Mac OS X 10_12_6', 'Mac OS X 10_13_1', 'iOS 11.1.0',
       'Mac OS X 10_9_5', 'Windows 7', 'Windows 8.1', 'Mac', 'iOS 10.3.3',
       'Mac OS X 10.12', 'Mac OS X 10_10_5', 'Mac OS X 10_11_5',
       'iOS 9.3.5', 'Android 5.1.1', 'Android 7.1.1', 'Android 6.0',
       'iOS 10.3.1', 'Mac OS X 10.9', 'iOS 11.1.1', 'Windows Vista',
       'iOS 10.3.2', 'iOS 11.0.2', 'Mac OS X 10.11', 'Android 8.0.0',
       'iOS 10.2.0', 'iOS 10.2.1', 'iOS 11.0.0', 'Mac OS X 10.10',
       'Mac OS X 10_12_3', 'Mac OS X 10_12', 'Android 6.0.1', 'iOS',
       'Mac OS X 10.13', 'Mac OS X 10_12_5', 'Mac OS X 10_8_5',
       'iOS 11.0.1', 'iOS 10.0.2', 'Android 5.0.2', 'Windows XP',
       'iOS 11.2.0', 'Mac OS X 10.6', 'Windows 8', 'Mac OS X 10_6_8',
       'Mac OS X 10_11_4', 'Mac OS X 10_12_1', 'iOS 10.1.1',
       'Mac OS X 10_11_3', 'Mac OS X 10_12_4', 'Mac OS X 10

In [15]:
df_X['id_30'].replace(to_replace=r'(^Windows).*$', value='Windows', regex=True, inplace=True)
df_X['id_30'].replace(to_replace=r'(^Android).*$', value='Android', regex=True, inplace=True)
df_X['id_30'].replace(to_replace=r'(^iOS).*$', value='iOS', regex=True, inplace=True)
df_X['id_30'].replace(to_replace=r'(^Mac).*$', value='Mac', regex=True, inplace=True)
df_X['id_30'].unique()

array([nan, 'Android', 'iOS', 'Mac', 'Windows', 'Linux', 'func', 'other'],
      dtype=object)

In [16]:
df_X['id_31'].unique()

array([nan, 'samsung browser 6.2', 'mobile safari 11.0', 'chrome 62.0',
       'chrome 62.0 for android', 'edge 15.0', 'mobile safari generic',
       'chrome 49.0', 'chrome 61.0', 'edge 16.0', 'safari generic',
       'edge 14.0', 'chrome 56.0 for android', 'firefox 57.0',
       'chrome 54.0 for android', 'mobile safari uiwebview', 'chrome',
       'chrome 62.0 for ios', 'firefox', 'chrome 60.0 for android',
       'mobile safari 10.0', 'chrome 61.0 for android',
       'ie 11.0 for desktop', 'ie 11.0 for tablet', 'mobile safari 9.0',
       'chrome generic', 'other', 'chrome 59.0 for android',
       'firefox 56.0', 'android webview 4.0', 'chrome 55.0', 'opera 49.0',
       'ie', 'chrome 55.0 for android', 'firefox 52.0',
       'chrome 57.0 for android', 'chrome 56.0',
       'chrome 46.0 for android', 'chrome 58.0', 'firefox 48.0',
       'chrome 59.0', 'samsung browser 4.0', 'edge 13.0',
       'chrome 53.0 for android', 'chrome 58.0 for android',
       'chrome 60.0', 'mobile sa

In [17]:
df_X['id_31'].replace(to_replace=r'^(chrome).*$', value='chrome', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^([s|S]amsung).*$', value='samsung', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(safari).*$', value='safari', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^.*\s(safari).*$', value='safari', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(edge).*$', value='edge', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(ie).*$', value='ie', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(firefox).*$', value='firefox', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(Mozilla).*$', value='firefox', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(android).*$', value='android', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(opera).*$', value='opera', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(Generic).*$', value='android', regex=True, inplace=True)
df_X['id_31'].replace(to_replace=r'^(google).*$', value='google', regex=True, inplace=True)
df_X['id_31'].unique()

array([nan, 'samsung', 'safari', 'chrome', 'edge', 'firefox', 'ie',
       'other', 'android', 'opera', 'mobile', 'aol', 'silk', 'waterfox',
       'Nokia/Lumia', 'puffin', 'Microsoft/Windows', 'cyberfox',
       'ZTE/Blade', 'palemoon', 'maxthon', 'line', 'LG/K-200', 'iron',
       'BLU/Dash', 'seamonkey', 'M4Tel/M4', 'comodo', 'Lanix/Ilium',
       'chromium', 'Inco/Minion', 'Cherry', 'icedragon', 'google',
       'facebook', 'rim', 'uc', 'blackberry'], dtype=object)

In [18]:
df_X['id_33'].unique()

array([nan, '2220x1080', '1334x750', '1280x800', '1366x768', '1920x1080',
       '1680x1050', '1136x640', '5120x2880', '2880x1800', '1920x1200',
       '2560x1600', '2048x1536', '1024x768', '1280x720', '2560x1440',
       '2208x1242', '2001x1125', '1440x900', '1600x900', '2672x1440',
       '1280x1024', '960x540', '2732x2048', '2436x1125', '2048x1152',
       '2960x1440', '1024x600', '855x480', '4096x2304', '2160x1440',
       '2562x1442', '801x480', '2736x1824', '3441x1440', '2880x1620',
       '3840x2160', '1638x922', '1280x768', '1360x768', '1280x960',
       '3440x1440', '1152x720', '1280x1025', '3360x2100', '2304x1296',
       '1152x864', '3200x1800', '2112x1188', '2224x1668', '2400x1350',
       '2000x1125', '1600x1000', '2560x1080', '1728x972', '3000x2000',
       '1024x640', '3840x2400', '2304x1440', '1280x600', '1400x1050',
       '1600x1200', '3201x1800', '1356x900', '1344x756', '1624x1080',
       '1536x864', '1800x1125', '1920x1281', '2961x1442', '1366x1024',
       '1344x8

In [19]:
def mod(x):
    if type(x) == type(''):
        x = x.split(sep='x')
        x = int(x[0])*int(x[1])
        return x
    else:
        return np.nan

In [20]:
df_X['id_33'] = df_X['id_33'].map(mod)
df_X['id_33'].value_counts()

2073600.0     33742
1049088.0     15046
1000500.0     11550
2742336.0      9113
1296000.0      8127
              ...  
4986816.0         1
1329505.0         1
4855501.0         1
33177600.0        1
9222241.0         1
Name: id_33, Length: 455, dtype: int64

In [21]:
df_X['P_emaildomain'].replace(to_replace=r'(\.\w+)+$', value='', regex=True, inplace=True)
df_X['P_emaildomain'].unique()

array([nan, 'gmail', 'outlook', 'yahoo', 'mail', 'anonymous', 'hotmail',
       'verizon', 'aol', 'me', 'comcast', 'optonline', 'cox', 'charter',
       'rocketmail', 'prodigy', 'embarqmail', 'icloud', 'live', 'att',
       'juno', 'ymail', 'sbcglobal', 'bellsouth', 'msn', 'q',
       'centurylink', 'servicios-ta', 'earthlink', 'cfl', 'roadrunner',
       'netzero', 'gmx', 'suddenlink', 'frontiernet', 'windstream',
       'frontier', 'mac', 'aim', 'web', 'twc', 'cableone', 'sc', 'ptd',
       'protonmail', 'scranton'], dtype=object)

In [22]:
df_X['R_emaildomain'].replace(to_replace=r'(\.\w+)+$', value='', regex=True, inplace=True)
df_X['R_emaildomain'].unique()

array([nan, 'gmail', 'hotmail', 'outlook', 'anonymous', 'charter',
       'prodigy', 'comcast', 'live', 'icloud', 'yahoo', 'aol', 'juno',
       'att', 'verizon', 'bellsouth', 'servicios-ta', 'ymail', 'msn',
       'optonline', 'gmx', 'aim', 'mail', 'me', 'cox', 'earthlink',
       'embarqmail', 'web', 'sbcglobal', 'scranton', 'mac', 'twc',
       'roadrunner', 'frontiernet', 'q', 'windstream', 'suddenlink',
       'ptd', 'frontier', 'cfl', 'netzero', 'rocketmail', 'centurylink',
       'protonmail', 'cableone', 'sc'], dtype=object)

In [23]:
df_X['DeviceInfo'].value_counts()

Windows           92710
iOS Device        38502
MacOS             23722
Trident/7.0       12330
rv:11.0            2650
                  ...  
SM-G531F              1
Azumi_IRO_A5_Q        1
LG-US998              1
G25524K               1
HT0703K16             1
Name: DeviceInfo, Length: 2799, dtype: int64

In [24]:
df_X.drop(columns=['DeviceInfo'], inplace=True)

In [25]:
check_null_values(df_X)

Column Name: C1 
Has null values: True 
Amount of null values: 3
Values - Count
1.0       586742
2.0       197520
3.0        96569
4.0        53794
5.0        32851
           ...  
1393.0         1
3734.0         1
1389.0         1
1388.0         1
1237.0         1
Name: C1, Length: 1964, dtype: int64

Column Name: C10 
Has null values: True 
Amount of null values: 3
Values - Count
0.0       824420
1.0       192574
2.0        32690
3.0        13286
4.0         6838
           ...  
1205.0         1
1203.0         1
1886.0         1
279.0          1
1839.0         1
Name: C10, Length: 1278, dtype: int64

Column Name: C11 
Has null values: True 
Amount of null values: 3
Values - Count
1.0       731792
2.0       168780
3.0        61642
4.0        28478
5.0        16366
           ...  
1003.0         1
1000.0         1
999.0          1
998.0          1
511.0          1
Name: C11, Length: 1677, dtype: int64

Column Name: C12 
Has null values: True 
Amount of null values: 3
Values - Count


In [26]:
replace_null_values(df_X)

C1 3
C10 3
C11 3
C12 3
C13 4748
C14 3
C2 3
C3 3
C4 3
C5 3
C6 3
C7 3
C8 3
C9 3
D1 7300
D10 88567
D11 455805
D12 963260
D13 911895
D14 919850
D15 101182
D2 515566
D3 466020
D4 245773
D5 534216
D6 899261
D7 998181
D8 947967
D9 947967
DeviceType 819490
M1 447739
M2 447739
M3 447739
M4 519189
M5 660114
M6 328299
M7 581283
M8 581256
M9 581256
P_emaildomain 163648
R_emaildomain 824070
V1 455805
V10 455805
V100 314
V101 314
V102 314
V103 314
V104 314
V105 314
V106 314
V107 314
V108 314
V109 314
V11 455805
V110 314
V111 314
V112 314
V113 314
V114 314
V115 314
V116 314
V117 314
V118 314
V119 314
V12 88662
V120 314
V121 314
V122 314
V123 314
V124 314
V125 314
V126 314
V127 314
V128 314
V129 314
V13 88662
V130 314
V131 314
V132 314
V133 314
V134 314
V135 314
V136 314
V137 314
V138 939501
V139 939501
V14 88662
V140 939501
V141 939501
V142 939501
V143 939225
V144 939225
V145 939225
V146 939501
V147 939501
V148 939501
V149 939501
V15 88662
V150 939225
V151 939225
V152 939225
V153 939501
V154 939501
V

In [27]:
not_fraud = df_X[df_X['isFraud'] == 0]
fraud = df_X[df_X['isFraud'] == 1]
test = df_X[df_X['isFraud'] == -1]
train = df_X[df_X['isFraud'] != -1]
pickle.dump(train, open('./train.data', 'wb'))
pickle.dump(test, open('./test.data', 'wb'))
print(not_fraud.shape)
print(fraud.shape)

(569877, 432)
(20663, 432)


In [28]:
from sklearn.utils import resample
fraud_upsampled = resample(fraud,
                          replace=True,
                          n_samples=len(not_fraud),
                          )
print(fraud_upsampled.shape)

(569877, 432)


In [29]:
fraud_upsampled.head()

Unnamed: 0_level_0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3221734,10.0,6.0,5.0,5.0,1.0,1.0,20.0,0.0,6.0,0.0,...,Android,safari,32.0,2073600.0,match_status:2,F,F,F,F,1
3108832,2.0,0.0,1.0,0.0,2.0,2.0,1.0,0.0,0.0,0.0,...,Android,chrome,32.0,2073600.0,match_status:2,F,F,T,F,1
3288087,1.0,0.0,1.0,0.0,7.0,1.0,1.0,0.0,0.0,1.0,...,Android,chrome,32.0,2073600.0,match_status:2,F,F,T,F,1
3073514,1.0,1.0,1.0,0.0,1.0,1.0,2.0,0.0,2.0,0.0,...,Android,other,32.0,410400.0,match_status:2,T,F,T,F,1
3140407,17.0,7.0,4.0,4.0,2.0,2.0,34.0,0.0,4.0,0.0,...,Android,chrome,32.0,2073600.0,match_status:2,F,F,T,T,1


In [30]:
df_X = pd.concat([not_fraud, fraud_upsampled, test])
df_X['isFraud'].value_counts()

 1    569877
 0    569877
-1    506691
Name: isFraud, dtype: int64

In [31]:
del(fraud)
del(not_fraud)
del(test)

In [32]:
cat_cols = []
for i in range(1,7):
    cat_cols.append('card'+str(i))
for i in range(1,3):
    cat_cols.append('addr'+str(i))
for i in range(1, 10):
    cat_cols.append('M'+str(i))
for i in range(12, 39):
    cat_cols.append('id_'+str(i))
cat_cols.append('ProductCD')
cat_cols.append('DeviceType')
cat_cols.append('P_emaildomain') 
cat_cols.append('R_emaildomain')
display(cat_cols)

['card1',
 'card2',
 'card3',
 'card4',
 'card5',
 'card6',
 'addr1',
 'addr2',
 'M1',
 'M2',
 'M3',
 'M4',
 'M5',
 'M6',
 'M7',
 'M8',
 'M9',
 'id_12',
 'id_13',
 'id_14',
 'id_15',
 'id_16',
 'id_17',
 'id_18',
 'id_19',
 'id_20',
 'id_21',
 'id_22',
 'id_23',
 'id_24',
 'id_25',
 'id_26',
 'id_27',
 'id_28',
 'id_29',
 'id_30',
 'id_31',
 'id_32',
 'id_33',
 'id_34',
 'id_35',
 'id_36',
 'id_37',
 'id_38',
 'ProductCD',
 'DeviceType',
 'P_emaildomain',
 'R_emaildomain']

In [34]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cat_cols:
    df_X[col] = le.fit_transform(df_X[col])
print(df_X.shape)

(1646445, 432)


In [35]:
df_X.head()

Unnamed: 0_level_0,C1,C10,C11,C12,C13,C14,C2,C3,C4,C5,...,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,isFraud
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2987000,1.0,0.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,12,4,227,3,0,0,1,0,0
2987001,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,12,4,227,3,0,0,1,0,0
2987002,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,12,4,227,3,0,0,1,0,0
2987003,2.0,0.0,1.0,0.0,25.0,1.0,5.0,0.0,0.0,0.0,...,0,12,4,227,3,0,0,1,0,0
2987004,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0,32,4,256,3,1,0,1,1,0


In [36]:
pickle.dump(cat_cols, open('cat_cols.data', 'wb'))
del(cat_cols)

In [37]:
df_train = df_X[df_X['isFraud'] != -1]
df_test = df_X[df_X['isFraud'] == -1]
del(df_X)

In [39]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)
pickle.dump(df_train, open('df_train.data', 'wb'))
pickle.dump(df_test, open('df_test.data', 'wb'))
print(df_train.shape)
print(df_test.shape)

Mem. usage decreased to 1005.43 Mb (0.0% reduction)
Mem. usage decreased to 454.71 Mb (0.0% reduction)
(1139754, 432)
(506691, 432)


In [40]:
Ytrain = df_train['isFraud']
Xtrain = df_train.drop(columns=['isFraud'])
Xtest = df_test.drop(columns=['isFraud'])
print(Xtrain.shape)
print(Ytrain.shape)
print(Xtest.shape)
pickle.dump(Ytrain, open('./Ytrain.data', 'wb'))

(1139754, 431)
(1139754,)
(506691, 431)


In [41]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [42]:
scaler.fit(Xtrain)
pickle.dump(scaler, open('scaler.data', 'wb'))
Xtraincols = Xtrain.columns
Xtrainindex = Xtrain.index
Xtestcols = Xtest.columns
Xtestindex = Xtest.index

In [43]:
Xtrain = pd.DataFrame(data=scaler.transform(Xtrain), index=Xtrainindex, columns=Xtraincols)

In [44]:
Xtest = pd.DataFrame(data=scaler.transform(Xtest), index=Xtestindex, columns=Xtestcols)

In [45]:
for column in Xtrain.columns:
    print(Xtrain[column].dtype)
for column in Xtest.columns:
    print(Xtest[column].dtype)

float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64
float64


In [46]:
Xtrain = reduce_mem_usage(Xtrain)

Mem. usage decreased to 945.65 Mb (74.8% reduction)


In [47]:
Xtest = reduce_mem_usage(Xtest)

Mem. usage decreased to 420.40 Mb (74.8% reduction)


In [48]:
for column in Xtrain.columns:
    print(Xtrain[column].dtype)
for column in Xtest.columns:
    print(Xtest[column].dtype)

float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16
float16


In [49]:
pickle.dump(Xtrain, open('./Xtrain.data', 'wb'))
pickle.dump(Xtest, open('./Xtest.data', 'wb'))

In [50]:
from sklearn.decomposition import PCA

  return f(*args, **kwds)


In [51]:
pca = PCA(n_components=100)

In [52]:
pca.fit(Xtrain)

PCA(copy=True, iterated_power='auto', n_components=100, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [57]:
Xtrain_pca = pca.transform(Xtrain)
Xtest_pca = pca.transform(Xtest)
display(Xtrain_pca.shape)
display(Xtest_pca.shape)

(1139754, 100)

(506691, 100)

In [58]:
Xtrain_pca = reduce_mem_usage(pd.DataFrame(data=Xtrain_pca, index=Xtrainindex))
Xtest_pca = reduce_mem_usage(pd.DataFrame(data=Xtest_pca, index=Xtestindex))
pickle.dump(Xtrain_pca, open('./Xtrain_pca_100.data', 'wb'))
pickle.dump(Xtest_pca, open('./Xtest_pca_100.data', 'wb'))

Mem. usage decreased to 226.09 Mb (74.3% reduction)
Mem. usage decreased to 100.51 Mb (74.3% reduction)
