In [2]:
import pandas as pd
import numpy as np

In [3]:
def reduce_memory_usage(df):
    """Reduce memory usage of a DataFrame by downcasting numerical columns."""
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage before optimization: {start_mem:.2f} MB")

    for col in df.columns:
        col_type = df[col].dtype

        # Convert integers to more memory-efficient types based on min/max value
        if col_type in ["int8", "int16", "int32", "int64"]:
            min_val = df[col].min()
            max_val = df[col].max()
            if min_val >= 0:
                if max_val <= 2**7 - 1:  # Can fit in int8
                    df[col] = df[col].astype('int8')
                elif max_val <= 2**15 - 1:  # Can fit in int16
                    df[col] = df[col].astype('int16')
                elif max_val <= 2**31 - 1:  # Can fit in int32
                    df[col] = df[col].astype('int32')
                else:
                    df[col] = df[col].astype('int64')
            else:  # Handle negative values
                if min_val >= -2**7 and max_val <= 2**7 - 1:
                    df[col] = df[col].astype('int8')
                elif min_val >= -2**15 and max_val <= 2**15 - 1:
                    df[col] = df[col].astype('int16')
                elif min_val >= -2**31 and max_val <= 2**31 - 1:
                    df[col] = df[col].astype('int32')
                else:
                    df[col] = df[col].astype('int64')

        # Convert floats to float32, which uses less memory than float64
        elif col_type in ["float16", "float32", "float64"]:
            df[col] = df[col].astype('float32')

        # Convert object types to category if unique values are low (threshold can be adjusted)
        elif col_type == "object":
            num_unique_values = df[col].nunique()
            num_total_values = len(df[col])
            if num_unique_values / num_total_values < 0.5:  # Threshold to convert to category
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    print(f"Reduced by {(1 - end_mem / start_mem) * 100:.2f}%")
    
    return df


In [4]:
# Load the data
merchants = pd.read_csv("../merchants.csv")
historical_transactions = pd.read_csv("../historical_transactions.csv")
new_merchant_transactions = pd.read_csv("../new_merchant_transactions.csv")
test = pd.read_csv("../test.csv")

merchants = merchants.sample(frac=0.05)
historical_transactions = historical_transactions.sample(frac=0.05)
new_merchant_transactions = new_merchant_transactions.sample(frac=0.05)
test = test.sample(frac=0.05)

# Reduce memory usage
historical_transactions = reduce_memory_usage(historical_transactions)
new_merchant_transactions = reduce_memory_usage(new_merchant_transactions)
test = reduce_memory_usage(test)



Memory usage before optimization: 646.41 MB
Memory usage after optimization: 198.39 MB
Reduced by 69.31%
Memory usage before optimization: 43.50 MB
Memory usage after optimization: 23.07 MB
Reduced by 46.96%
Memory usage before optimization: 0.99 MB
Memory usage after optimization: 0.50 MB
Reduced by 49.41%


In [5]:
# Print column names and data types for each dataframe
for df_name, df in zip(["merchants", "historical_transactions", "new_merchant_transactions", "test"], 
                        [merchants, historical_transactions, new_merchant_transactions, test]):
    print(f"DataFrame: {df_name}")
    print(df.dtypes)
    print("\n" + "="*50 + "\n")

DataFrame: merchants
merchant_id                     object
merchant_group_id                int64
merchant_category_id             int64
subsector_id                     int64
numerical_1                    float64
numerical_2                    float64
category_1                      object
most_recent_sales_range         object
most_recent_purchases_range     object
avg_sales_lag3                 float64
avg_purchases_lag3             float64
active_months_lag3               int64
avg_sales_lag6                 float64
avg_purchases_lag6             float64
active_months_lag6               int64
avg_sales_lag12                float64
avg_purchases_lag12            float64
active_months_lag12              int64
category_4                      object
city_id                          int64
state_id                         int64
category_2                     float64
dtype: object


DataFrame: historical_transactions
authorized_flag         category
card_id                 category
city

In [6]:
# Print the size of both dataframes
print(f"Size of historical_transactions: {historical_transactions.shape}")
print(f"Size of new_merchant_transactions: {new_merchant_transactions.shape}")

# Combine both datasets
transactions = pd.concat([historical_transactions, new_merchant_transactions], ignore_index=True)

# Print the size of the combined dataframe
print(f"Size of combined transactions DataFrame: {transactions.shape}")

Size of historical_transactions: (1455618, 14)
Size of new_merchant_transactions: (98152, 14)
Size of combined transactions DataFrame: (1553770, 14)


In [7]:
transactions.head(-1)

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id
0,N,C_ID_0fa2a1c1dd,88,N,1,B,560,M_ID_6cb0602e94,-2,-0.736014,2017-12-17 11:17:06,1.0,16,34
1,Y,C_ID_cee90812c6,17,N,0,A,80,M_ID_03b05c31f0,-5,-0.742550,2017-09-27 22:49:46,4.0,22,37
2,Y,C_ID_09ee9b6e62,160,N,0,A,360,M_ID_ca1efe5c61,-3,-0.266059,2017-01-25 17:08:35,5.0,21,34
3,Y,C_ID_b8870a0499,-1,Y,1,B,561,M_ID_ec24d672a3,0,-0.604156,2018-02-07 11:48:35,,-1,7
4,Y,C_ID_70dc081fef,296,N,0,A,278,M_ID_aba3daabef,-12,-0.741648,2017-02-09 15:17:51,1.0,15,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553764,Y,C_ID_a7aa977fd1,69,N,0,A,884,M_ID_0f3b28292a,1,-0.593637,2017-06-14 12:37:44,1.0,9,27
1553765,Y,C_ID_1d524915f5,69,N,0,A,650,M_ID_9748fcc3c9,1,-0.701828,2018-03-29 11:01:19,1.0,9,29
1553766,Y,C_ID_0dc3e7b15c,88,N,0,A,80,M_ID_559f49dd6b,1,-0.662759,2017-04-29 13:45:28,1.0,16,37
1553767,Y,C_ID_c27ec085a9,69,N,0,A,80,M_ID_1299227213,1,-0.689326,2017-04-29 12:30:47,1.0,9,37


In [8]:
# Step 1: Remove duplicate merchant_id from merchants, keeping only the first occurrence
merchants_unique = merchants.drop_duplicates(subset="merchant_id", keep="first")

# Step 2: Merge transactions with merchants on merchant_id
transactions_merged = transactions.merge(merchants_unique, on="merchant_id", how="left")

# Step 3: Print the shape of the merged DataFrame
print(f"Size of transactions_merged: {transactions_merged.shape}")

# Display the first few rows
display(transactions_merged.head())

Size of transactions_merged: (1553770, 35)


Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,N,C_ID_0fa2a1c1dd,88,N,1,B,560,M_ID_6cb0602e94,-2,-0.736014,...,,,,,,,,,,
1,Y,C_ID_cee90812c6,17,N,0,A,80,M_ID_03b05c31f0,-5,-0.74255,...,,,,,,,,,,
2,Y,C_ID_09ee9b6e62,160,N,0,A,360,M_ID_ca1efe5c61,-3,-0.266059,...,,,,,,,,,,
3,Y,C_ID_b8870a0499,-1,Y,1,B,561,M_ID_ec24d672a3,0,-0.604156,...,,,,,,,,,,
4,Y,C_ID_70dc081fef,296,N,0,A,278,M_ID_aba3daabef,-12,-0.741648,...,,,,,,,,,,


In [9]:
display(transactions_merged.head(-1))

Unnamed: 0,authorized_flag,card_id,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,merchant_id,month_lag,purchase_amount,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,N,C_ID_0fa2a1c1dd,88,N,1,B,560,M_ID_6cb0602e94,-2,-0.736014,...,,,,,,,,,,
1,Y,C_ID_cee90812c6,17,N,0,A,80,M_ID_03b05c31f0,-5,-0.742550,...,,,,,,,,,,
2,Y,C_ID_09ee9b6e62,160,N,0,A,360,M_ID_ca1efe5c61,-3,-0.266059,...,,,,,,,,,,
3,Y,C_ID_b8870a0499,-1,Y,1,B,561,M_ID_ec24d672a3,0,-0.604156,...,,,,,,,,,,
4,Y,C_ID_70dc081fef,296,N,0,A,278,M_ID_aba3daabef,-12,-0.741648,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1553764,Y,C_ID_a7aa977fd1,69,N,0,A,884,M_ID_0f3b28292a,1,-0.593637,...,,,,,,,,,,
1553765,Y,C_ID_1d524915f5,69,N,0,A,650,M_ID_9748fcc3c9,1,-0.701828,...,,,,,,,,,,
1553766,Y,C_ID_0dc3e7b15c,88,N,0,A,80,M_ID_559f49dd6b,1,-0.662759,...,,,,,,,,,,
1553767,Y,C_ID_c27ec085a9,69,N,0,A,80,M_ID_1299227213,1,-0.689326,...,,,,,,,,,,


In [10]:
# Use test as the base and perform an inner join with transactions_merged on "card_id"
final_df = test.merge(transactions_merged, on="card_id", how="inner")

# Reorder columns so that card_id is first
cols = ['card_id'] + [col for col in final_df.columns if col != 'card_id']
final_df = final_df[cols]

# Print the size of the final DataFrame and display the first few rows
print(f"Size of final_df after inner joining with test: {final_df.shape}")
display(final_df.head())

Size of final_df after inner joining with test: (29706, 39)


Unnamed: 0,card_id,first_active_month,feature_1,feature_2,feature_3,authorized_flag,city_id_x,category_1_x,installments,category_3,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,C_ID_a8c9d4d2b1,2017-05,2,1,0,Y,-1,Y,4,C,...,,,,,,,,,,
1,C_ID_5cc67e30b7,2015-11,4,2,0,Y,19,N,1,B,...,,,,,,,,,,
2,C_ID_5cc67e30b7,2015-11,4,2,0,Y,19,N,1,B,...,,,,,,,,,,
3,C_ID_5cc67e30b7,2015-11,4,2,0,Y,19,N,1,B,...,,,,,,,,,,
4,C_ID_9aa9b08732,2017-11,2,1,0,Y,344,N,-1,,...,,,,,,,,,,


In [11]:
final_df["card_id"].nunique()

5363

In [12]:
final_df = reduce_memory_usage(final_df)

Memory usage before optimization: 16.14 MB
Memory usage after optimization: 7.47 MB
Reduced by 53.74%


In [13]:
# Print all column names in final_df
print(final_df.head())

           card_id first_active_month  feature_1  feature_2  feature_3  \
0  C_ID_a8c9d4d2b1            2017-05          2          1          0   
1  C_ID_5cc67e30b7            2015-11          4          2          0   
2  C_ID_5cc67e30b7            2015-11          4          2          0   
3  C_ID_5cc67e30b7            2015-11          4          2          0   
4  C_ID_9aa9b08732            2017-11          2          1          0   

  authorized_flag  city_id_x category_1_x  installments category_3  ...  \
0               Y         -1            Y             4          C  ...   
1               Y         19            N             1          B  ...   
2               Y         19            N             1          B  ...   
3               Y         19            N             1          B  ...   
4               Y        344            N            -1        NaN  ...   

   avg_sales_lag6 avg_purchases_lag6  active_months_lag6  avg_sales_lag12  \
0             NaN          

In [14]:
from datetime import datetime

# Step 1: Convert 'first_active_month' to number of months from today
if 'first_active_month' in final_df.columns:
    final_df['first_active_month'] = pd.to_datetime(final_df['first_active_month'])
    today = datetime.today()
    final_df['first_active_month'] = final_df['first_active_month'].apply(lambda x: (today.year - x.year) * 12 + (today.month - x.month))

# Step 2: Drop 'first_active_month' and 'card_id'
final_df = final_df.drop(columns=['first_active_month'], errors='ignore')

# Step 3: Encode all categorical values into integers, excluding 'card_id'
categorical_cols = final_df.select_dtypes(include=['category', 'object']).columns
categorical_cols = [col for col in categorical_cols if col != 'card_id']  # Exclude card_id

for col in categorical_cols:
    final_df[col] = final_df[col].astype('category').cat.codes

# Step 4: Print the new dataframe with all columns (but not all rows)
print("Updated final_df shape:", final_df.shape)
display(final_df.head())

Updated final_df shape: (29706, 38)


Unnamed: 0,card_id,feature_1,feature_2,feature_3,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,C_ID_a8c9d4d2b1,2,1,0,1,-1,1,4,2,879,...,,,,,,,-1,,,
1,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,68,...,,,,,,,-1,,,
2,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,683,...,,,,,,,-1,,,
3,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,108,...,,,,,,,-1,,,
4,C_ID_9aa9b08732,2,1,0,1,344,0,-1,-1,34,...,,,,,,,-1,,,


In [15]:
# Check for missing values across all columns
print("Missing values in dataset:")
print(final_df.isnull().sum())

# Check for infinite values in numeric columns only
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
print(np.isinf(numeric_df).sum())

Missing values in dataset:
card_id                            0
feature_1                          0
feature_2                          0
feature_3                          0
authorized_flag                    0
city_id_x                          0
category_1_x                       0
installments                       0
category_3                         0
merchant_category_id_x             0
merchant_id                        0
month_lag                          0
purchase_amount                    0
purchase_date                      0
category_2_x                    2627
state_id_x                         0
subsector_id_x                     0
merchant_group_id              28375
merchant_category_id_y         28375
subsector_id_y                 28375
numerical_1                    28375
numerical_2                    28375
category_1_y                       0
most_recent_sales_range            0
most_recent_purchases_range        0
avg_sales_lag3                 28375
avg_purchas

In [16]:
display(final_df.head())

Unnamed: 0,card_id,feature_1,feature_2,feature_3,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,C_ID_a8c9d4d2b1,2,1,0,1,-1,1,4,2,879,...,,,,,,,-1,,,
1,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,68,...,,,,,,,-1,,,
2,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,683,...,,,,,,,-1,,,
3,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,108,...,,,,,,,-1,,,
4,C_ID_9aa9b08732,2,1,0,1,344,0,-1,-1,34,...,,,,,,,-1,,,


In [17]:
# Replace infinite values with NaN in final_df
final_df = final_df.replace([np.inf, -np.inf], np.nan)

# For numerical columns, fill missing values with the median.
num_cols = final_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    final_df[col] = final_df[col].fillna(final_df[col].median())

# For categorical columns, add "missing" to the categories (if not present) and fill missing values with "missing".
cat_cols = final_df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    # Check if column's dtype is categorical using isinstance
    if isinstance(final_df[col].dtype, pd.CategoricalDtype):
        if "missing" not in final_df[col].cat.categories:
            final_df[col] = final_df[col].cat.add_categories("missing")
    final_df[col] = final_df[col].fillna("missing")

In [18]:
# Check for missing values across all columns
print("Missing values in dataset:")
print(final_df.isnull().sum())

# Check for infinite values in numeric columns only
numeric_df = final_df.select_dtypes(include=[np.number])
print("\nInfinite values in numeric columns:")
print(np.isinf(numeric_df).sum())

Missing values in dataset:
card_id                        0
feature_1                      0
feature_2                      0
feature_3                      0
authorized_flag                0
city_id_x                      0
category_1_x                   0
installments                   0
category_3                     0
merchant_category_id_x         0
merchant_id                    0
month_lag                      0
purchase_amount                0
purchase_date                  0
category_2_x                   0
state_id_x                     0
subsector_id_x                 0
merchant_group_id              0
merchant_category_id_y         0
subsector_id_y                 0
numerical_1                    0
numerical_2                    0
category_1_y                   0
most_recent_sales_range        0
most_recent_purchases_range    0
avg_sales_lag3                 0
avg_purchases_lag3             0
active_months_lag3             0
avg_sales_lag6                 0
avg_purchases_la

In [19]:
display(final_df.head())

Unnamed: 0,card_id,feature_1,feature_2,feature_3,authorized_flag,city_id_x,category_1_x,installments,category_3,merchant_category_id_x,...,avg_sales_lag6,avg_purchases_lag6,active_months_lag6,avg_sales_lag12,avg_purchases_lag12,active_months_lag12,category_4,city_id_y,state_id_y,category_2_y
0,C_ID_a8c9d4d2b1,2,1,0,1,-1,1,4,2,879,...,1.03,1.040545,6.0,1.05,1.058338,12.0,-1,82.0,9.0,1.0
1,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,68,...,1.03,1.040545,6.0,1.05,1.058338,12.0,-1,82.0,9.0,1.0
2,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,683,...,1.03,1.040545,6.0,1.05,1.058338,12.0,-1,82.0,9.0,1.0
3,C_ID_5cc67e30b7,4,2,0,1,19,0,1,1,108,...,1.03,1.040545,6.0,1.05,1.058338,12.0,-1,82.0,9.0,1.0
4,C_ID_9aa9b08732,2,1,0,1,344,0,-1,-1,34,...,1.03,1.040545,6.0,1.05,1.058338,12.0,-1,82.0,9.0,1.0


In [None]:
# Save the cleaned DataFrame as a Parquet file using pyarrow.
final_df.to_parquet("../final_test.csv", index=False)