In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from statistics import mode 
# Input data files are available in the "./input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# os.listdir raises FileNotFoundError when ./input is missing, so a wrong working
# directory surfaces here rather than in the middle of the CSV loads.
print(os.listdir("./input"))

['Data_Dictionary.xlsx', 'new_merchant_transactions.csv', 'test.csv', 'merchants.csv', 'historical_transactions.csv', 'all.zip', 'train.csv', 'sample_submission.csv']


In [2]:
# Read every raw table up front. The files are read in the order listed, and the
# generator variable does not leak into the notebook namespace.
train_df, test_df, merchants_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input/train.csv",
        "./input/test.csv",
        "./input/merchants.csv",
    )
)

# The two transaction logs are by far the largest files, so they are read last.
new_merchant_transactions_df, historical_transactions_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./input/new_merchant_transactions.csv",
        "./input/historical_transactions.csv",
    )
)

In [3]:
def _first_fitting_dtype(low, high, candidates, limits):
    """Return the first dtype in ``candidates`` whose range covers [low, high].

    ``limits`` is ``np.iinfo`` or ``np.finfo``. Returns None when nothing fits;
    a NaN bound never compares as fitting, so it also yields None.
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if bounds.min <= low and high <= bounds.max:
            return candidate
    return None


def memory_optimized_df(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Per column:
      * object / string columns are converted to ``category``;
      * integer columns (signed or unsigned) are cast to the smallest of
        int8 / int16 / int32 that holds their min and max;
      * float columns are cast to float16 or float32 when their range fits;
      * every other dtype (bool, datetime, category, nullable extension types,
        ...) is left untouched.

    A column is only re-cast when the new dtype is strictly smaller than the
    current one, so calling the function again on its own output is a no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target. The choice is made on value *range* only and
        float16 keeps roughly three significant decimal digits, so values are
        rounded. Pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        series = df[col]
        col_type = series.dtype

        if col_type == object or str(col_type) in ('string', 'str'):
            df[col] = series.astype('category')
            continue

        # Extension dtypes (category, nullable ints, tz-aware datetimes, ...) have no
        # plain numeric range to measure; leave them alone instead of crashing.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind in 'iu':
            if len(series) == 0:
                continue  # no values, hence no min/max to size the dtype from
            # Plain Python ints keep the bound comparison independent of the
            # column's own (possibly unsigned) dtype.
            target = _first_fitting_dtype(
                int(series.min()), int(series.max()),
                (np.int8, np.int16, np.int32), np.iinfo,
            )
        elif col_type.kind == 'f':
            target = _first_fitting_dtype(
                series.min(), series.max(), float_candidates, np.finfo,
            )
        else:
            # bool, datetime64, timedelta64, complex: not numeric ranges to shrink.
            continue

        # Never grow a column (e.g. uint8 -> int16) and skip same-size casts.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = series.astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a zero-sized frame does not raise ZeroDivisionError.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [4]:
# Seed the submission frame with the test-set card ids in their original order.
# `.values` passes the raw array, so the new frame gets its own default index
# instead of inheriting test_df's.
submission_df = pd.DataFrame({'card_id': test_df["card_id"].values})

In [5]:
# Shrink the three big tables; each call prints its own before/after report,
# so the call order below fixes the order of the printed output.
merchants_df_ = memory_optimized_df(merchants_df)
new_merchant_transactions_df_ = memory_optimized_df(new_merchant_transactions_df)
historical_transactions_df_ = memory_optimized_df(historical_transactions_df)

# Retire the un-suffixed names so later cells can only reach the optimised frames.
del merchants_df, new_merchant_transactions_df, historical_transactions_df

Memory usage of dataframe is 56.18 MB
Memory usage after optimization is: 32.66 MB
Decreased by 41.9%
Memory usage of dataframe is 209.67 MB
Memory usage after optimization is: 169.08 MB
Decreased by 19.4%
Memory usage of dataframe is 3109.54 MB
Memory usage after optimization is: 1622.97 MB
Decreased by 47.8%


In [9]:
# Fill missing category_2 / category_3 values with the notebook's chosen defaults
# (1.0 and 'A').
#
# The filled column is assigned back rather than using `fillna(..., inplace=True)`
# on `df[col]`: that form is chained assignment, which acts on a temporary column
# object and leaves the frame unchanged once pandas Copy-on-Write is in effect.
new_merchant_transactions_df_['category_2'] = new_merchant_transactions_df_['category_2'].fillna(1.0)
new_merchant_transactions_df_['category_3'] = new_merchant_transactions_df_['category_3'].fillna('A')

historical_transactions_df_['category_2'] = historical_transactions_df_['category_2'].fillna(1.0)
historical_transactions_df_['category_3'] = historical_transactions_df_['category_3'].fillna('A')

In [10]:
# Drop every transaction row that still has a missing value after the fills above.
# Both frames are rebound to their cleaned versions; the historical result used to be
# stored under a garbled name, which left historical_transactions_df_ uncleaned and
# kept an unused copy of the largest table in memory.
new_merchant_transactions_df_ = new_merchant_transactions_df_.dropna()
historical_transactions_df_ = historical_transactions_df_.dropna()

In [11]:
# Peek at the first rows of the training table (DataFrame.head defaults to five rows).
train_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [12]:
# Same preview for the test table, to compare its columns with the training table.
test_df.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3
0,2017-04,C_ID_0ab67a22ab,3,3,1
1,2017-01,C_ID_130fd0cbdd,2,3,0
2,2017-08,C_ID_b709037bc5,5,1,1
3,2017-12,C_ID_d27d835a9f,2,1,0
4,2015-12,C_ID_2b5e3df5c2,5,1,1


In [14]:
# Parse the year-month strings (e.g. '2017-06') into datetimes; pandas fills in the
# first day of the month. Re-running the cell on the parsed column is harmless.
train_df['first_active_month'] = pd.to_datetime(train_df['first_active_month'])

In [15]:
# Apply the same year-month -> datetime parsing to the test table so both tables
# hold first_active_month in the same dtype.
test_df['first_active_month'] = pd.to_datetime(test_df['first_active_month'])

In [16]:
def to_binary(df):
    """Encode the 'Y'/'N' columns ``authorized_flag`` and ``category_1`` as 1/0.

    The frame is modified in place and also returned.

    * 'Y' becomes 1 and 'N' becomes 0; any other value (including missing ones)
      becomes NaN.
    * The result is always a plain numeric column: int8 when every value was
      mapped, float when NaN had to be kept. Calling ``.map`` directly on a
      categorical column would return another categorical instead of numbers,
      so the column is viewed as plain objects first.
    * A column that is already numeric is skipped. This makes the function safe
      to run twice; mapping 1/0 through the 'Y'/'N' table again would replace
      every value with NaN.
    """
    flag_codes = {'Y': 1, 'N': 0}
    for col in ['authorized_flag', 'category_1']:
        column = df[col]
        if pd.api.types.is_numeric_dtype(column.dtype):
            continue  # already encoded by an earlier run
        encoded = column.astype(object).map(flag_codes)
        if not encoded.isna().any():
            # Fully mapped: two distinct values fit comfortably in int8.
            encoded = encoded.astype(np.int8)
        df[col] = encoded
    return df

In [18]:
# Encode the Y/N flag columns of both transaction tables. The right-hand side is
# evaluated left to right, so the historical table is still processed first.
historical_transactions_df_, new_merchant_transactions_df_ = (
    to_binary(historical_transactions_df_),
    to_binary(new_merchant_transactions_df_),
)

In [20]:
# Replace the category_3 labels with integer codes.
#
# The codes come from ONE category list shared by both tables. Encoding each table
# separately numbers only the labels that table happens to contain, so the same label
# could get different codes in the two tables, which are stacked together further down.
# When both tables hold the same label set the codes are the same as before.
category_3_levels = sorted(
    pd.concat([
        pd.Series(historical_transactions_df_['category_3'].unique()),
        pd.Series(new_merchant_transactions_df_['category_3'].unique()),
    ]).dropna().unique()
)
category_3_dtype = pd.api.types.CategoricalDtype(categories=category_3_levels)

historical_transactions_df_['category_3'] = historical_transactions_df_['category_3'].astype(category_3_dtype).cat.codes
new_merchant_transactions_df_['category_3'] = new_merchant_transactions_df_['category_3'].astype(category_3_dtype).cat.codes


In [None]:
# Stack historical and new-merchant transactions into one table, historical rows first.
# DataFrame.append was removed in pandas 2.0; pd.concat with its defaults does the same
# row-wise stacking and, like append, keeps each table's original index labels.
all_transactions = pd.concat([historical_transactions_df_, new_merchant_transactions_df_])