In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
def reduce_memory_usage(df):
    """Reduce memory usage of a DataFrame by downcasting numerical columns.

    Columns of dtype int16/int32/int64 are downcast with
    ``pd.to_numeric(..., downcast="integer")``, columns of dtype
    float16/float32/float64 with ``downcast="float"``, and ``object`` columns
    are converted to ``category`` when fewer than half of their values are
    distinct. Memory usage before and after is printed.

    Note: columns are reassigned on ``df`` itself, so the caller's frame is
    modified; the same object is returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize. An empty frame is accepted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns downcast where possible.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage before optimization: {start_mem:.2f} MB")

    # Every column has the same length, so compute it once for the
    # unique-ratio check below.
    num_total_values = len(df)

    for col in df.columns:
        col_type = df[col].dtype

        # Convert integers
        if col_type in ["int16", "int32", "int64"]:
            df[col] = pd.to_numeric(df[col], downcast="integer")

        # Convert floats
        elif col_type in ["float16", "float32", "float64"]:
            df[col] = pd.to_numeric(df[col], downcast="float")

        # Convert object types to category if unique values are low.
        # With zero rows the ratio is undefined (0 / 0), so the column is left as-is.
        elif col_type == "object" and num_total_values > 0:
            num_unique_values = df[col].nunique()
            if num_unique_values / num_total_values < 0.5:  # Threshold to convert to category
                df[col] = df[col].astype("category")

    end_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage after optimization: {end_mem:.2f} MB")

    # Guard the percentage against a zero starting size.
    reduction_pct = (1 - end_mem / start_mem) * 100 if start_mem > 0 else 0.0
    print(f"Reduced by {reduction_pct:.2f}%")

    return df

In [3]:
# Read each raw CSV and downcast it straight away; reduce_memory_usage prints
# the before/after footprint for every file in the order they are loaded.
merchants = reduce_memory_usage(
    pd.read_csv("../../raw-data/merchants.csv")
)

historical_transactions = reduce_memory_usage(
    pd.read_csv("../../raw-data/historical_transactions.csv")
)

new_merchant_transactions = reduce_memory_usage(
    pd.read_csv("../../raw-data/new_merchant_transactions.csv")
)

train = reduce_memory_usage(
    pd.read_csv("../../raw-data/train.csv")
)

Memory usage before optimization: 127.68 MB
Memory usage after optimization: 43.73 MB
Reduced by 65.75%
Memory usage before optimization: 11375.84 MB
Memory usage after optimization: 2832.28 MB
Reduced by 75.10%
Memory usage before optimization: 765.80 MB
Memory usage after optimization: 227.35 MB
Reduced by 70.31%
Memory usage before optimization: 29.27 MB
Memory usage after optimization: 13.87 MB
Reduced by 52.61%


In [4]:
# Show the column names and dtypes of every loaded frame, one section per frame.
frames_by_name = {
    "merchants": merchants,
    "historical_transactions": historical_transactions,
    "new_merchant_transactions": new_merchant_transactions,
    "train": train,
}
for df_name, df in frames_by_name.items():
    print(f"DataFrame: {df_name}")
    print(df.dtypes)
    # Visual separator between frames.
    print("\n" + "="*50 + "\n")

DataFrame: merchants
merchant_id                      object
merchant_group_id                 int32
merchant_category_id              int16
subsector_id                       int8
numerical_1                     float32
numerical_2                     float32
category_1                     category
most_recent_sales_range        category
most_recent_purchases_range    category
avg_sales_lag3                  float64
avg_purchases_lag3              float64
active_months_lag3                 int8
avg_sales_lag6                  float64
avg_purchases_lag6              float32
active_months_lag6                 int8
avg_sales_lag12                 float64
avg_purchases_lag12             float64
active_months_lag12                int8
category_4                     category
city_id                           int16
state_id                           int8
category_2                      float32
dtype: object


DataFrame: historical_transactions
authorized_flag         category
card_id        

In [5]:
# Stack the historical and new-merchant transactions into one frame with a
# fresh 0..n-1 index.
transactions = pd.concat(
    [historical_transactions, new_merchant_transactions],
    axis=0,
    ignore_index=True,
)

# Report the shape of each input, then of the combined result.
print(f"Size of historical_transactions: {historical_transactions.shape}")
print(f"Size of new_merchant_transactions: {new_merchant_transactions.shape}")
print(f"Size of combined transactions DataFrame: {transactions.shape}")


Size of historical_transactions: (29112361, 14)
Size of new_merchant_transactions: (1963031, 14)
Size of combined transactions DataFrame: (31075392, 14)


In [6]:
# Select every row except the last one; the notebook's truncated repr then
# shows the first and last few rows of that selection.
transactions.iloc[:-1]

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,2017-06-25 15:33:07,1.0,16,37
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,2017-07-15 12:10:45,1.0,16,16
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,2017-08-09 22:04:29,1.0,16,37
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,2017-09-02 10:06:26,1.0,16,34
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,2017-03-10 01:14:19,1.0,16,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31075386,Y,C_ID_d3ad1091dc,51,N,0,A,560,M_ID_feaccfbea2,2,-0.743527,2017-12-20 13:48:02,1.0,16,34
31075387,Y,C_ID_1320dee851,142,N,0,A,309,M_ID_7754b67f3b,2,-0.701828,2018-04-06 14:36:52,3.0,19,21
31075388,Y,C_ID_f112aa3381,158,N,0,A,560,M_ID_da063195b7,2,-0.694390,2018-03-07 13:19:18,1.0,15,34
31075389,Y,C_ID_bd97b86450,69,N,1,B,278,M_ID_9a9ccb6544,1,-0.621031,2018-03-05 12:04:56,1.0,9,37


In [7]:
# Step 1: Keep a single row per merchant_id (the first one encountered) so the
# join below cannot multiply transaction rows.
merchants_unique = merchants.drop_duplicates(subset=["merchant_id"], keep="first")

# Step 2: Left-join merchant attributes onto every transaction; columns present
# in both frames receive pandas' default _x / _y suffixes.
transactions_merged = pd.merge(
    transactions,
    merchants_unique,
    how="left",
    on="merchant_id",
)

# Step 3: Report the merged shape and preview the first rows.
print(f"Size of transactions_merged: {transactions_merged.shape}")
display(transactions_merged.head())


Size of transactions_merged: (31075392, 35)


Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,1.14,1.114136,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,...,1.06,1.058605,6.0,1.05,1.062087,12.0,Y,88.0,16.0,1.0
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,0.98,0.967058,6.0,0.97,0.956668,12.0,Y,88.0,16.0,1.0
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,0.88,0.897406,6.0,0.86,0.864394,12.0,Y,88.0,16.0,1.0
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,1.14,1.114136,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0


In [8]:
# Render every row except the last one; the truncated repr shows both ends.
display(transactions_merged.iloc[:-1])

Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-8,-0.703331,...,1.14,1.114136,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
1,Y,C_ID_4e6213e9bc,88,N,0,A,367,M_ID_86ec983688,-7,-0.733128,...,1.06,1.058605,6.0,1.05,1.062087,12.0,Y,88.0,16.0,1.0
2,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_979ed661fc,-6,-0.720386,...,0.98,0.967058,6.0,0.97,0.956668,12.0,Y,88.0,16.0,1.0
3,Y,C_ID_4e6213e9bc,88,N,0,A,560,M_ID_e6d5ae8ea6,-5,-0.735352,...,0.88,0.897406,6.0,0.86,0.864394,12.0,Y,88.0,16.0,1.0
4,Y,C_ID_4e6213e9bc,88,N,0,A,80,M_ID_e020e9b302,-11,-0.722865,...,1.14,1.114136,6.0,1.19,1.156844,12.0,Y,88.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31075386,Y,C_ID_d3ad1091dc,51,N,0,A,560,M_ID_feaccfbea2,2,-0.743527,...,0.86,0.842120,6.0,0.68,0.654762,12.0,N,51.0,16.0,1.0
31075387,Y,C_ID_1320dee851,142,N,0,A,309,M_ID_7754b67f3b,2,-0.701828,...,1.39,1.185430,6.0,1.45,1.172737,12.0,N,142.0,19.0,3.0
31075388,Y,C_ID_f112aa3381,158,N,0,A,560,M_ID_da063195b7,2,-0.694390,...,0.90,0.885124,6.0,0.88,0.875207,12.0,Y,158.0,15.0,1.0
31075389,Y,C_ID_bd97b86450,69,N,1,B,278,M_ID_9a9ccb6544,1,-0.621031,...,1.01,1.042102,6.0,0.99,1.028358,12.0,Y,69.0,9.0,1.0


In [9]:
# Steps 1-2: Left-join the train columns onto each transaction by card_id, then
# discard rows whose target is NaN (card_ids that have no match in train).
final_df = (
    transactions_merged
    .merge(train, on="card_id", how="left")
    .dropna(subset=['target'])
)

# Step 3: Put card_id first and target last, leaving the rest in their current order.
middle_cols = [name for name in final_df.columns if name not in ('card_id', 'target')]
cols = ['card_id', *middle_cols, 'target']
final_df = final_df[cols]

# Step 4: Report the resulting shape and preview the first rows.
print(f"Size of final_df after dropping unmatched card_ids: {final_df.shape}")
display(final_df.head())

Size of final_df after dropping unmatched card_ids: (19249694, 40)


Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,first_active_month,feature_1,feature_2,feature_3,target
400,C_ID_5037ff576e,N,322,N,1,B,278,M_ID_b61c7d1be0,-3,-0.59526,...,12.0,N,-1.0,11.0,3.0,2017-01,5.0,1.0,1.0,-2.352713
401,C_ID_5037ff576e,Y,138,N,1,B,307,M_ID_fe69229f24,-4,1.189469,...,12.0,N,-1.0,15.0,1.0,2017-01,5.0,1.0,1.0,-2.352713
402,C_ID_5037ff576e,Y,138,N,1,B,705,M_ID_efc106141c,-9,-0.640069,...,12.0,N,-1.0,15.0,1.0,2017-01,5.0,1.0,1.0,-2.352713
403,C_ID_5037ff576e,Y,226,N,1,B,307,M_ID_708022307c,-4,-0.652256,...,12.0,N,-1.0,16.0,1.0,2017-01,5.0,1.0,1.0,-2.352713
404,C_ID_5037ff576e,Y,330,N,1,B,705,M_ID_393b4b8cec,-9,-0.67421,...,12.0,N,-1.0,17.0,3.0,2017-01,5.0,1.0,1.0,-2.352713


In [10]:
# Number of distinct (non-missing) cards that remain after the joins.
final_df["card_id"].nunique(dropna=True)

201917

In [11]:
# Downcast the merged frame again: reduce_memory_usage re-evaluates each
# column's dtype on the joined data and prints the before/after footprint.
final_df = reduce_memory_usage(final_df)

Memory usage before optimization: 7890.80 MB
Memory usage after optimization: 4031.27 MB
Reduced by 48.91%


In [12]:
# List every column of final_df on a single line.
print(list(final_df.columns))

['card_id', 'authorized_flag', 'city_id_x', 'category_1_x', 'installments', 'category_3', 'merchant_category_id_x', 'merchant_id', 'month_lag', 'purchase_amount', 'purchase_date', 'category_2_x', 'state_id_x', 'subsector_id_x', 'merchant_group_id', 'merchant_category_id_y', 'subsector_id_y', 'numerical_1', 'numerical_2', 'category_1_y', 'most_recent_sales_range', 'most_recent_purchases_range', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12', 'category_4', 'city_id_y', 'state_id_y', 'category_2_y', 'first_active_month', 'feature_1', 'feature_2', 'feature_3', 'target']


In [13]:
from datetime import datetime

# Step 1: Drop 'first_active_month' ('card_id' is kept as the row identifier).
# Nothing derived from this column is retained in the output, so it is dropped
# directly instead of first being converted row by row into a months-elapsed
# value that would be discarded (and that would vary with the run date).
final_df = final_df.drop(columns=['first_active_month'], errors='ignore')

# Step 2: Encode all categorical values into integers, excluding 'card_id'.
# NOTE: `.cat.codes` represents missing values as -1, so NaNs in these columns
# are no longer reported by isnull() after this step.
categorical_cols = [
    col
    for col in final_df.select_dtypes(include=['category', 'object']).columns
    if col != 'card_id'  # Exclude card_id
]

for col in categorical_cols:
    final_df[col] = final_df[col].astype('category').cat.codes

# Step 3: Print the new dataframe with all columns (but not all rows)
print("Updated final_df shape:", final_df.shape)
display(final_df.head())

Updated final_df shape: (19249694, 39)


Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
400,C_ID_5037ff576e,0,322,0,1,1,278,223191,-3,-0.59526,...,0.91968,12.0,0,-1.0,11.0,3.0,5.0,1.0,1.0,-2.352713
401,C_ID_5037ff576e,1,138,0,1,1,307,311572,-4,1.189469,...,0.947394,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
402,C_ID_5037ff576e,1,138,0,1,1,705,293697,-9,-0.640069,...,1.021108,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
403,C_ID_5037ff576e,1,226,0,1,1,307,137680,-4,-0.652256,...,1.014457,12.0,0,-1.0,16.0,1.0,5.0,1.0,1.0,-2.352713
404,C_ID_5037ff576e,1,330,0,1,1,705,69949,-9,-0.67421,...,1.026715,12.0,0,-1.0,17.0,3.0,5.0,1.0,1.0,-2.352713


In [15]:
# Per-column count of missing values over the whole frame.
print("Missing values in dataset:")
missing_per_column = final_df.isna().sum()
print(missing_per_column)

# Per-column count of +/-inf; np.isinf only applies to numeric data, so the
# frame is narrowed to its numeric columns first.
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
inf_per_column = np.isinf(numeric_df).sum()
print(inf_per_column)

Missing values in dataset:
card_id                              0
authorized_flag                      0
city_id_x                            0
category_1_x                         0
installments                         0
category_3                           0
merchant_category_id_x               0
merchant_id                          0
month_lag                            0
purchase_amount                      0
purchase_date                        0
category_2_x                   1706443
state_id_x                           0
subsector_id_x                       0
merchant_group_id               102297
merchant_category_id_y          102297
subsector_id_y                  102297
numerical_1                     102297
numerical_2                     102297
category_1_y                         0
most_recent_sales_range              0
most_recent_purchases_range          0
avg_sales_lag3                  108140
avg_purchases_lag3              102297
active_months_lag3              10229

In [18]:
# Preview the first five rows of the encoded frame.
display(final_df.head(n=5))

Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
400,C_ID_5037ff576e,0,322,0,1,1,278,223191,-3,-0.59526,...,0.91968,12.0,0,-1.0,11.0,3.0,5.0,1.0,1.0,-2.352713
401,C_ID_5037ff576e,1,138,0,1,1,307,311572,-4,1.189469,...,0.947394,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
402,C_ID_5037ff576e,1,138,0,1,1,705,293697,-9,-0.640069,...,1.021108,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
403,C_ID_5037ff576e,1,226,0,1,1,307,137680,-4,-0.652256,...,1.014457,12.0,0,-1.0,16.0,1.0,5.0,1.0,1.0,-2.352713
404,C_ID_5037ff576e,1,330,0,1,1,705,69949,-9,-0.67421,...,1.026715,12.0,0,-1.0,17.0,3.0,5.0,1.0,1.0,-2.352713


In [20]:
# Treat +/-inf as missing so they are imputed together with the NaNs below.
final_df = final_df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

# Numeric columns: fill each column's gaps with that column's own median.
num_cols = final_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    column_median = final_df[col].median()
    final_df[col] = final_df[col].fillna(column_median)

# Object / categorical columns: fill gaps with the literal label "missing".
cat_cols = final_df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    is_categorical = isinstance(final_df[col].dtype, pd.CategoricalDtype)
    # For categorical dtype, register "missing" as a category (if absent)
    # before it is used as the fill value.
    if is_categorical and "missing" not in final_df[col].cat.categories:
        final_df[col] = final_df[col].cat.add_categories("missing")
    final_df[col] = final_df[col].fillna("missing")

In [21]:
# Re-run the missing-value count now that imputation has been applied.
print("Missing values in dataset:")
missing_per_column = final_df.isna().sum()
print(missing_per_column)

# Re-run the +/-inf count on the numeric columns only (np.isinf needs numeric data).
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
inf_per_column = np.isinf(numeric_df).sum()
print(inf_per_column)

Missing values in dataset:
card_id                        0
authorized_flag                0
city_id_x                      0
category_1_x                   0
installments                   0
category_3                     0
merchant_category_id_x         0
merchant_id                    0
month_lag                      0
purchase_amount                0
purchase_date                  0
category_2_x                   0
state_id_x                     0
subsector_id_x                 0
merchant_group_id              0
merchant_category_id_y         0
subsector_id_y                 0
numerical_1                    0
numerical_2                    0
category_1_y                   0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_la

In [22]:
# Preview the first five rows after imputation.
display(final_df.head(n=5))

Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
400,C_ID_5037ff576e,0,322,0,1,1,278,223191,-3,-0.59526,...,0.91968,12.0,0,-1.0,11.0,3.0,5.0,1.0,1.0,-2.352713
401,C_ID_5037ff576e,1,138,0,1,1,307,311572,-4,1.189469,...,0.947394,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
402,C_ID_5037ff576e,1,138,0,1,1,705,293697,-9,-0.640069,...,1.021108,12.0,0,-1.0,15.0,1.0,5.0,1.0,1.0,-2.352713
403,C_ID_5037ff576e,1,226,0,1,1,307,137680,-4,-0.652256,...,1.014457,12.0,0,-1.0,16.0,1.0,5.0,1.0,1.0,-2.352713
404,C_ID_5037ff576e,1,330,0,1,1,705,69949,-9,-0.67421,...,1.026715,12.0,0,-1.0,17.0,3.0,5.0,1.0,1.0,-2.352713


In [23]:
# Save the cleaned DataFrame as a Parquet file using pyarrow.
from pathlib import Path

output_file = "../../datasets/final_dataset.parquet"

# Create the destination folder first: without it the write would fail only
# after the whole pipeline has already run.
Path(output_file).parent.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the file.
final_df.to_parquet(output_file, engine="pyarrow", index=False)