In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [None]:
def reduce_memory_usage(df, category_threshold=0.5):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified in place (columns are reassigned on ``df``) and the
    same object is returned, so ``df = reduce_memory_usage(df)`` and a bare
    ``reduce_memory_usage(df)`` have the same effect.

    Rules applied per column:

    * signed integer columns (int8/16/32/64) are cast to the narrowest signed
      integer type whose range contains the column's min and max;
    * float64 columns are cast to float32, unless a finite value would
      overflow float32 (and turn into +/-inf); float16/float32 columns are
      left untouched because casting them to float32 cannot save memory;
    * object columns are cast to ``category`` when the ratio of distinct
      values to rows is below ``category_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Mutated in place.
    category_threshold : float, default 0.5
        Maximum unique/total ratio for which an object column is converted
        to ``category``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Notes
    -----
    float64 -> float32 keeps roughly 7 significant decimal digits; values
    needing more precision are rounded.
    """
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage before optimization: {start_mem:.2f} MB")

    # Candidate integer types, narrowest first; int64 is the implicit fallback.
    int_candidates = (np.int8, np.int16, np.int32)
    float32_max = np.finfo(np.float32).max

    for col in df.columns:
        col_type = df[col].dtype

        # Convert integers to the narrowest signed type that holds [min, max].
        if col_type in ["int8", "int16", "int32", "int64"]:
            min_val = df[col].min()
            max_val = df[col].max()
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # For an empty column min/max are NaN, every comparison is
                # False, and the column keeps its current dtype.
                if limits.min <= min_val and max_val <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

        # Convert float64 to float32, which uses half the memory.
        elif col_type == "float64":
            values = df[col].to_numpy()
            finite = values[np.isfinite(values)]
            # Skip the cast when a finite value would overflow to inf.
            if finite.size == 0 or np.abs(finite).max() <= float32_max:
                df[col] = df[col].astype('float32')

        # Convert object columns to category when they have few distinct values.
        elif col_type == "object":
            num_total_values = len(df[col])
            # Guard against 0/0 on an empty frame.
            if num_total_values > 0:
                num_unique_values = df[col].nunique()
                if num_unique_values / num_total_values < category_threshold:
                    df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    if start_mem > 0:
        print(f"Reduced by {(1 - end_mem / start_mem) * 100:.2f}%")
    else:
        # Nothing was allocated to begin with, so there is nothing to reduce.
        print("Reduced by 0.00%")

    return df


In [4]:
# Load the data
merchants = pd.read_csv("../merchants.csv")
historical_transactions = pd.read_csv("../historical_transactions.csv")
new_merchant_transactions = pd.read_csv("../new_merchant_transactions.csv")
train = pd.read_csv("../train.csv")

# Work on a random subset of every table. The seed is fixed so that
# Restart Kernel -> Run All draws the same rows (and prints the same numbers).
SAMPLE_FRAC = 0.05
RANDOM_STATE = 42

# NOTE(review): each table is sampled independently, so most sampled
# transactions will not find their merchant or card in the other samples —
# confirm this is acceptable for the downstream joins.
merchants = merchants.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
historical_transactions = historical_transactions.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
new_merchant_transactions = new_merchant_transactions.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)
train = train.sample(frac=SAMPLE_FRAC, random_state=RANDOM_STATE)

# Reduce memory usage
# NOTE(review): merchants is not downcast here, unlike the other three
# frames — confirm whether that is intentional.
historical_transactions = reduce_memory_usage(historical_transactions)
new_merchant_transactions = reduce_memory_usage(new_merchant_transactions)
train = reduce_memory_usage(train)



Memory usage before optimization: 646.41 MB
Memory usage after optimization: 198.40 MB
Reduced by 69.31%
Memory usage before optimization: 43.50 MB
Memory usage after optimization: 23.07 MB
Reduced by 46.97%
Memory usage before optimization: 1.69 MB
Memory usage after optimization: 0.85 MB
Reduced by 49.63%


In [None]:
# Report the column dtypes of every loaded frame, one labelled section per frame
frames_by_name = {
    "merchants": merchants,
    "historical_transactions": historical_transactions,
    "new_merchant_transactions": new_merchant_transactions,
    "train": train,
}
for df_name, df in frames_by_name.items():
    print(f"DataFrame: {df_name}")
    print(df.dtypes)
    print("\n" + "="*50 + "\n")

DataFrame: merchants
merchant_id                     object
merchant_group_id                int64
merchant_category_id             int64
subsector_id                     int64
numerical_1                    float64
numerical_2                    float64
category_1                      object
most_recent_sales_range         object
most_recent_purchases_range     object
avg_sales_lag3                 float64
avg_purchases_lag3             float64
active_months_lag3               int64
avg_sales_lag6                 float64
avg_purchases_lag6             float64
active_months_lag6               int64
avg_sales_lag12                float64
avg_purchases_lag12            float64
active_months_lag12              int64
category_4                      object
city_id                          int64
state_id                         int64
category_2                     float64
dtype: object


DataFrame: historical_transactions
authorized_flag         category
card_id                 category
city

In [6]:
# Report (rows, columns) of the two transaction tables before stacking them
print(f"Size of historical_transactions: {historical_transactions.shape}")
print(f"Size of new_merchant_transactions: {new_merchant_transactions.shape}")

# Stack the two tables row-wise and renumber the index from 0
transactions = pd.concat(
    (historical_transactions, new_merchant_transactions),
    axis=0,
    ignore_index=True,
)

# Report (rows, columns) of the stacked result
print(f"Size of combined transactions DataFrame: {transactions.shape}")


Size of historical_transactions: (1455618, 14)
Size of new_merchant_transactions: (98152, 14)
Size of combined transactions DataFrame: (1553770, 14)


In [7]:
# Preview the combined frame: head(-1) returns every row except the last one,
# so the truncated rendering shows both the beginning and the end of the data.
transactions.head(-1)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,Y,C_ID_158bab0566,218,N,0,A,309,M_ID_78a8f4ceb4,-11,-0.696719,2017-03-20 18:23:35,1.0,12,21
1,Y,C_ID_6020f5445c,25,N,0,A,307,M_ID_382315d1a6,-5,-0.716855,2017-09-27 19:11:23,3.0,7,19
2,N,C_ID_b21cb9e8ec,158,N,0,A,511,M_ID_1ac6bbc867,-4,-0.680791,2017-04-02 21:47:11,1.0,15,7
3,N,C_ID_2c22d50b1d,289,N,1,B,108,M_ID_dfec67ce3a,-11,-0.692812,2017-03-10 13:52:10,5.0,5,34
4,Y,C_ID_f8cad57f86,344,N,0,A,80,M_ID_b47ee75a4e,-9,-0.726306,2017-05-19 00:44:09,2.0,18,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553764,Y,C_ID_43554255ac,69,N,0,A,879,M_ID_00a6ca8a8a,2,-0.100768,2018-04-14 12:40:44,1.0,9,29
1553765,Y,C_ID_3a9beba4cc,291,N,1,B,783,M_ID_781b030a6f,2,-0.536537,2018-02-17 12:06:25,1.0,9,19
1553766,Y,C_ID_50484838cb,57,N,1,B,703,M_ID_8472118804,1,-0.694315,2018-01-06 11:45:25,5.0,5,29
1553767,Y,C_ID_f3552cca35,333,N,1,B,278,M_ID_c16910813a,2,-0.676283,2018-04-27 12:21:20,5.0,21,37


In [8]:
# Step 1: Keep a single row per merchant_id — the first one encountered —
# by masking out every later repetition of an already-seen id.
is_repeated_merchant = merchants.duplicated(subset="merchant_id", keep="first")
merchants_unique = merchants[~is_repeated_merchant]

In [9]:
# Step 2: Left-join the de-duplicated merchant attributes onto every transaction
# by merchant_id; transactions without a matching merchant keep NaN in the
# merchant columns. Column names present in both frames get _x / _y suffixes.
transactions_merged = pd.merge(
    transactions,
    merchants_unique,
    how="left",
    on="merchant_id",
)

In [10]:
# Step 3: Print the (rows, columns) of the merged DataFrame.
# The row count printed here can be compared with the combined transactions
# size reported earlier to check that the left join did not add rows.
print(f"Size of transactions_merged: {transactions_merged.shape}")

Size of transactions_merged: (1553770, 35)


In [11]:
# Display the first few rows (head() with its default row count) to inspect
# the merchant columns that the join attached to each transaction.
display(transactions_merged.head())

Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,Y,C_ID_158bab0566,218,N,0,A,309,M_ID_78a8f4ceb4,-11,-0.696719,...,,,,,,,,,,
1,Y,C_ID_6020f5445c,25,N,0,A,307,M_ID_382315d1a6,-5,-0.716855,...,,,,,,,,,,
2,N,C_ID_b21cb9e8ec,158,N,0,A,511,M_ID_1ac6bbc867,-4,-0.680791,...,,,,,,,,,,
3,N,C_ID_2c22d50b1d,289,N,1,B,108,M_ID_dfec67ce3a,-11,-0.692812,...,,,,,,,,,,
4,Y,C_ID_f8cad57f86,344,N,0,A,80,M_ID_b47ee75a4e,-9,-0.726306,...,,,,,,,,,,


In [12]:
# Show every row except the last (head(-1)); the truncated rendering exposes
# both ends of the merged frame.
display(transactions_merged.head(-1))

Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,Y,C_ID_158bab0566,218,N,0,A,309,M_ID_78a8f4ceb4,-11,-0.696719,...,,,,,,,,,,
1,Y,C_ID_6020f5445c,25,N,0,A,307,M_ID_382315d1a6,-5,-0.716855,...,,,,,,,,,,
2,N,C_ID_b21cb9e8ec,158,N,0,A,511,M_ID_1ac6bbc867,-4,-0.680791,...,,,,,,,,,,
3,N,C_ID_2c22d50b1d,289,N,1,B,108,M_ID_dfec67ce3a,-11,-0.692812,...,,,,,,,,,,
4,Y,C_ID_f8cad57f86,344,N,0,A,80,M_ID_b47ee75a4e,-9,-0.726306,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553764,Y,C_ID_43554255ac,69,N,0,A,879,M_ID_00a6ca8a8a,2,-0.100768,...,,,,,,,,,,
1553765,Y,C_ID_3a9beba4cc,291,N,1,B,783,M_ID_781b030a6f,2,-0.536537,...,,,,,,,,,,
1553766,Y,C_ID_50484838cb,57,N,1,B,703,M_ID_8472118804,1,-0.694315,...,,,,,,,,,,
1553767,Y,C_ID_f3552cca35,333,N,1,B,278,M_ID_c16910813a,2,-0.676283,...,,,,,,,,,,


In [13]:
# Step 1: Left-join the train table onto the merged transactions by card_id,
# which attaches every train column (including target) to each transaction
final_df = pd.merge(transactions_merged, train, how="left", on="card_id")

# Step 2: Keep only the rows that received a target; card_ids absent from
# train come out of the left join with target = NaN and are discarded here
has_target = final_df['target'].notna()
final_df = final_df[has_target]

# Step 3: Reorder columns so card_id comes first and target last, leaving the
# remaining columns in their current relative order
edge_cols = ('card_id', 'target')
middle_cols = [name for name in final_df.columns if name not in edge_cols]
cols = ['card_id', *middle_cols, 'target']
final_df = final_df[cols]

# Step 4: Report the size of the resulting frame
print(f"Size of final_df after dropping unmatched card_ids: {final_df.shape}")

# Show the first few rows
display(final_df.head())

Size of final_df after dropping unmatched card_ids: (48095, 40)


Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,first_active_month,feature_1,feature_2,feature_3,target
38,C_ID_7cfb53771e,N,-1,Y,1,B,755,M_ID_445742726b,-6,-0.716855,...,,,,,,2017-02,2.0,1.0,0.0,2.38405
114,C_ID_8f6e73a2a9,Y,344,N,0,A,387,M_ID_7d14c36a58,-8,-0.731881,...,,,,,,2017-04,3.0,2.0,1.0,-1.13102
262,C_ID_f29a005fba,Y,-1,Y,1,B,839,M_ID_e5374dabc0,-1,-0.722745,...,,,,,,2017-01,1.0,1.0,0.0,0.499893
271,C_ID_9b1f626be5,N,69,N,1,B,792,M_ID_6726c3223e,-6,-0.296112,...,,,,,,2017-05,2.0,1.0,0.0,1.14853
286,C_ID_c32ea06010,Y,183,N,1,B,307,M_ID_df8f18386b,-4,-0.701828,...,,,,,,2017-03,4.0,1.0,0.0,-0.400508


In [14]:
# Number of distinct card_id values among the rows kept in final_df
final_df["card_id"].nunique()

8801

In [15]:
# Downcast final_df's columns with reduce_memory_usage (defined near the top
# of this notebook); the call prints memory usage before and after.
final_df = reduce_memory_usage(final_df)

Memory usage before optimization: 27.63 MB
Memory usage after optimization: 11.90 MB
Reduced by 56.94%


In [16]:
# Print all column names in final_df as a plain list, so none are hidden by
# the "..." truncation used when the frame itself is displayed
print(final_df.columns.tolist())

['card_id', 'authorized_flag', 'city_id_x', 'category_1_x', 'installments', 'category_3', 'merchant_category_id_x', 'merchant_id', 'month_lag', 'purchase_amount', 'purchase_date', 'category_2_x', 'state_id_x', 'subsector_id_x', 'merchant_group_id', 'merchant_category_id_y', 'subsector_id_y', 'numerical_1', 'numerical_2', 'category_1_y', 'most_recent_sales_range', 'most_recent_purchases_range', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12', 'category_4', 'city_id_y', 'state_id_y', 'category_2_y', 'first_active_month', 'feature_1', 'feature_2', 'feature_3', 'target']


In [17]:
from datetime import datetime  # import kept so the name stays available to any later cell

# Step 1: Drop 'first_active_month' (card_id is kept as the row identifier).
# NOTE(review): an earlier revision converted this column to "months elapsed
# since datetime.today()" immediately before dropping it, so the converted
# value never reached the output and depended on the day the cell was run.
# If an elapsed-months feature is wanted, store it under a new column name and
# measure it against a fixed reference date so reruns are reproducible.
final_df = final_df.drop(columns=['first_active_month'], errors='ignore')

# Step 2: Encode all categorical/object columns as integer codes, excluding 'card_id'.
# Codes follow the sorted order of each column's distinct values; missing
# values are encoded as -1 (they will no longer show up as NaN afterwards).
# NOTE(review): any remaining string column is encoded too (presumably this
# includes purchase_date) — confirm integer codes are what is wanted there.
categorical_cols = final_df.select_dtypes(include=['category', 'object']).columns
categorical_cols = [col for col in categorical_cols if col != 'card_id']  # Exclude card_id

for col in categorical_cols:
    final_df[col] = final_df[col].astype('category').cat.codes

# Step 3: Report the new shape and preview the first rows
print("Updated final_df shape:", final_df.shape)
display(final_df.head())

Updated final_df shape: (48095, 39)


Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
38,C_ID_7cfb53771e,0,-1,1,1,1,755,5999,-6,-0.716855,...,,,-1,,,,2.0,1.0,0.0,2.38405
114,C_ID_8f6e73a2a9,1,344,0,0,0,387,10961,-8,-0.731881,...,,,-1,,,,3.0,2.0,1.0,-1.13102
262,C_ID_f29a005fba,1,-1,1,1,1,839,20064,-1,-0.722745,...,,,-1,,,,1.0,1.0,0.0,0.499893
271,C_ID_9b1f626be5,0,69,0,1,1,792,9022,-6,-0.296112,...,,,-1,,,,2.0,1.0,0.0,1.14853
286,C_ID_c32ea06010,1,183,0,1,1,307,19574,-4,-0.701828,...,,,-1,,,,4.0,1.0,0.0,-0.400508


In [18]:
# Count NaN/None entries in every column
print("Missing values in dataset:")
missing_per_column = final_df.isna().sum()
print(missing_per_column)

# Count +/-inf entries; only numeric columns can hold them
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
inf_per_column = np.isinf(numeric_df).sum()
print(inf_per_column)

Missing values in dataset:
card_id                            0
authorized_flag                    0
city_id_x                          0
category_1_x                       0
installments                       0
category_3                         0
merchant_category_id_x             0
merchant_id                        0
month_lag                          0
purchase_amount                    0
purchase_date                      0
category_2_x                    4117
state_id_x                         0
subsector_id_x                     0
merchant_group_id              46009
merchant_category_id_y         46009
subsector_id_y                 46009
numerical_1                    46009
numerical_2                    46009
category_1_y                       0
most_recent_sales_range            0
most_recent_purchases_range        0
avg_sales_lag3                 46009
avg_purchases_lag3             46009
active_months_lag3             46009
avg_sales_lag6                 46009
avg_purchas

In [19]:
# Preview the encoded frame before missing values are filled
display(final_df.head())

Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
38,C_ID_7cfb53771e,0,-1,1,1,1,755,5999,-6,-0.716855,...,,,-1,,,,2.0,1.0,0.0,2.38405
114,C_ID_8f6e73a2a9,1,344,0,0,0,387,10961,-8,-0.731881,...,,,-1,,,,3.0,2.0,1.0,-1.13102
262,C_ID_f29a005fba,1,-1,1,1,1,839,20064,-1,-0.722745,...,,,-1,,,,1.0,1.0,0.0,0.499893
271,C_ID_9b1f626be5,0,69,0,1,1,792,9022,-6,-0.296112,...,,,-1,,,,2.0,1.0,0.0,1.14853
286,C_ID_c32ea06010,1,183,0,1,1,307,19574,-4,-0.701828,...,,,-1,,,,4.0,1.0,0.0,-0.400508


In [20]:
# Replace infinite values with NaN in final_df so they are imputed like any other gap
final_df = final_df.replace([np.inf, -np.inf], np.nan)

# For numerical feature columns, fill missing values with the column median.
# 'target' is the label and is deliberately excluded: imputing it would
# fabricate training labels instead of exposing the gap.
# NOTE(review): identifier-like numeric columns are median-filled as well —
# confirm that is meaningful for them.
num_cols = final_df.select_dtypes(include=[np.number]).columns
impute_cols = [col for col in num_cols if col != 'target']
for col in impute_cols:
    # Only compute a median for columns that actually have something to fill.
    if final_df[col].isna().any():
        final_df[col] = final_df[col].fillna(final_df[col].median())

# For categorical columns, add "missing" to the categories (if not present) and fill missing values with "missing".
# Columns that were already turned into integer codes are numeric at this
# point and are therefore handled by the median step above, not here.
cat_cols = final_df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    # A categorical column rejects values outside its categories, so the
    # placeholder has to be registered before fillna can use it.
    if isinstance(final_df[col].dtype, pd.CategoricalDtype):
        if "missing" not in final_df[col].cat.categories:
            final_df[col] = final_df[col].cat.add_categories("missing")
    final_df[col] = final_df[col].fillna("missing")

In [21]:
# Re-count NaN/None entries per column after imputation
print("Missing values in dataset:")
missing_per_column = final_df.isna().sum()
print(missing_per_column)

# Re-count +/-inf entries; only numeric columns can hold them
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
inf_per_column = np.isinf(numeric_df).sum()
print(inf_per_column)

Missing values in dataset:
card_id                        0
authorized_flag                0
city_id_x                      0
category_1_x                   0
installments                   0
category_3                     0
merchant_category_id_x         0
merchant_id                    0
month_lag                      0
purchase_amount                0
purchase_date                  0
category_2_x                   0
state_id_x                     0
subsector_id_x                 0
merchant_group_id              0
merchant_category_id_y         0
subsector_id_y                 0
numerical_1                    0
numerical_2                    0
category_1_y                   0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_lag6             0
active_months_lag6             0
avg_sales_lag12                0
avg_purchases_la

In [22]:
# Preview the frame after imputation; previously-NaN cells now hold the fill values
display(final_df.head())

Unnamed: 0,card_id,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y,feature_1,feature_2,feature_3,target
38,C_ID_7cfb53771e,0,-1,1,1,1,755,5999,-6,-0.716855,...,1.045006,12.0,-1,69.0,9.0,1.0,2.0,1.0,0.0,2.38405
114,C_ID_8f6e73a2a9,1,344,0,0,0,387,10961,-8,-0.731881,...,1.045006,12.0,-1,69.0,9.0,1.0,3.0,2.0,1.0,-1.13102
262,C_ID_f29a005fba,1,-1,1,1,1,839,20064,-1,-0.722745,...,1.045006,12.0,-1,69.0,9.0,1.0,1.0,1.0,0.0,0.499893
271,C_ID_9b1f626be5,0,69,0,1,1,792,9022,-6,-0.296112,...,1.045006,12.0,-1,69.0,9.0,1.0,2.0,1.0,0.0,1.14853
286,C_ID_c32ea06010,1,183,0,1,1,307,19574,-4,-0.701828,...,1.045006,12.0,-1,69.0,9.0,1.0,4.0,1.0,0.0,-0.400508


In [24]:
# Save the cleaned DataFrame in Parquet format, explicitly through the pyarrow
# engine (previously the engine was left to pandas' automatic selection).
# NOTE(review): the target path ends in ".csv" although the content is
# Parquet. The name is left unchanged so existing readers of this path keep
# working; it must be read back with pd.read_parquet, not pd.read_csv.
# Consider renaming it to ".parquet" together with every consumer.
final_df.to_parquet("../final_dataset.csv", engine="pyarrow", index=False)