In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [2]:
def reduce_memory_usage(df):
    """Reduce memory usage of a DataFrame by downcasting numerical columns.

    The columns of ``df`` are reassigned on the frame that was passed in (no
    copy is made) and that same object is returned, so the caller's frame is
    modified even if the return value is ignored.

    Per-column treatment:

    * ``int16`` / ``int32`` / ``int64``: ``pd.to_numeric(downcast="integer")``.
    * ``float16`` / ``float32`` / ``float64``: ``pd.to_numeric(downcast="float")``.
      NOTE: downcasting ``float64`` can reduce precision.
    * ``object``: converted to ``category`` when fewer than half of the values
      are distinct. Columns of an empty frame are left untouched.
    * Any other dtype is left untouched.

    The memory footprint (in MB) before and after, and the relative
    reduction, are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after its columns were converted.
    """
    bytes_per_mb = 1024**2
    start_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage before optimization: {start_mem:.2f} MB")

    num_total_values = len(df)

    for col in df.columns:
        col_type = df[col].dtype

        # Convert integers
        if col_type in ["int16", "int32", "int64"]:
            df[col] = pd.to_numeric(df[col], downcast="integer")

        # Convert floats
        elif col_type in ["float16", "float32", "float64"]:
            df[col] = pd.to_numeric(df[col], downcast="float")

        # Convert object types to category if unique values are low
        elif col_type == "object":
            # With zero rows the unique/total ratio would be 0 / 0, so an
            # empty column is skipped instead of raising ZeroDivisionError.
            if num_total_values > 0:
                num_unique_values = df[col].nunique()
                if num_unique_values / num_total_values < 0.5:  # Threshold to convert to category
                    df[col] = df[col].astype("category")

    end_mem = df.memory_usage(deep=True).sum() / bytes_per_mb
    print(f"Memory usage after optimization: {end_mem:.2f} MB")
    if start_mem > 0:
        print(f"Reduced by {(1 - end_mem / start_mem) * 100:.2f}%")
    else:
        # Nothing to reduce; avoids printing "nan%" for a zero-byte frame.
        print("Reduced by 0.00%")

    return df

In [3]:
# Read each raw CSV and pass it straight through reduce_memory_usage,
# which prints the before/after memory footprint for that table.
merchants = reduce_memory_usage(
    pd.read_csv("../../raw-data/merchants.csv")
)

historical_transactions = reduce_memory_usage(
    pd.read_csv("../../raw-data/historical_transactions.csv")
)

new_merchant_transactions = reduce_memory_usage(
    pd.read_csv("../../raw-data/new_merchant_transactions.csv")
)

train = reduce_memory_usage(
    pd.read_csv("../../raw-data/train.csv")
)

Memory usage before optimization: 127.68 MB
Memory usage after optimization: 43.73 MB
Reduced by 65.75%


KeyboardInterrupt: 

In [None]:
# Show the dtype of every column, one loaded dataframe at a time
for df_name, df in (
    ("merchants", merchants),
    ("historical_transactions", historical_transactions),
    ("new_merchant_transactions", new_merchant_transactions),
    ("train", train),
):
    print(f"DataFrame: {df_name}")
    print(df.dtypes)
    # Visual separator between two dataframes
    print("\n" + "=" * 50 + "\n")

In [None]:
# Report (rows, columns) of each transaction table before stacking them
print(f"Size of historical_transactions: {historical_transactions.shape}")
print(f"Size of new_merchant_transactions: {new_merchant_transactions.shape}")

# Stack both tables row-wise; ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating the index labels of the inputs.
transactions = pd.concat(
    objs=[historical_transactions, new_merchant_transactions],
    axis=0,
    ignore_index=True,
)

# Report (rows, columns) of the stacked table
print(f"Size of combined transactions DataFrame: {transactions.shape}")


In [None]:
# Every row except the last one (the meaning of head(-1)); how many of those
# rows are actually rendered depends on the pandas display options.
transactions.iloc[:-1]

In [None]:
# Step 1: Keep a single merchants row per merchant_id (the first one seen),
# so that the left join below matches each transaction to at most one row.
merchants_unique = merchants.drop_duplicates(subset=["merchant_id"], keep="first")

# Step 2: Left join on merchant_id - every transaction row is kept, with the
# merchant columns attached where a matching merchant exists.
transactions_merged = pd.merge(
    transactions,
    merchants_unique,
    how="left",
    on="merchant_id",
)

# Step 3: Report (rows, columns) of the joined table
print(f"Size of transactions_merged: {transactions_merged.shape}")

# Preview the first rows
display(transactions_merged.head())


In [None]:
# Every row except the last one (the meaning of head(-1)); how many of those
# rows are actually rendered depends on the pandas display options.
display(transactions_merged.iloc[:-1])

In [None]:
# Step 1: Left join the train table onto the transactions by card_id.
# Transactions whose card_id is not in train get NaN in the train columns.
final_df = pd.merge(transactions_merged, train, how="left", on="card_id")

# Step 2: Discard every row that has no target value
final_df = final_df.dropna(subset=["target"])

# Step 3: Put card_id in the first position and target in the last one,
# leaving the remaining columns in their current order.
cols = ["card_id"]
cols.extend(col for col in final_df.columns if col not in ("card_id", "target"))
cols.append("target")
final_df = final_df[cols]

# Step 4: Report (rows, columns) of the result
print(f"Size of final_df after dropping unmatched card_ids: {final_df.shape}")

# Preview the first rows
display(final_df.head())

In [None]:
# Number of distinct (non-missing) card_id values left in final_df
final_df["card_id"].nunique(dropna=True)

In [None]:
# Downcast the merged frame as well; the helper prints the before/after size
final_df = final_df.pipe(reduce_memory_usage)

In [None]:
# List every column name of final_df on one line
print(list(final_df.columns))

In [None]:
from datetime import datetime

# Step 1: Derive the number of calendar months between 'first_active_month'
# and today. The value is stored in its own column: writing it back into
# 'first_active_month' meant Step 2 dropped the freshly computed feature.
# NOTE: the result depends on the date the cell is run.
if 'first_active_month' in final_df.columns:
    first_active = pd.to_datetime(final_df['first_active_month'])
    today = datetime.today()
    # Vectorised .dt access instead of a per-row apply; rows with a missing
    # date end up as NaN.
    final_df['months_since_first_active'] = (
        (today.year - first_active.dt.year) * 12
        + (today.month - first_active.dt.month)
    )

# Step 2: Drop the raw 'first_active_month' column ('card_id' is kept);
# errors='ignore' makes this a no-op when the column is already gone.
final_df = final_df.drop(columns=['first_active_month'], errors='ignore')

# Step 3: Encode all categorical values into integers, excluding 'card_id'
categorical_cols = [
    col
    for col in final_df.select_dtypes(include=['category', 'object']).columns
    if col != 'card_id'
]

for col in categorical_cols:
    # .cat.codes assigns -1 to missing values, so after this step these
    # columns no longer contain NaN.
    final_df[col] = final_df[col].astype('category').cat.codes

# Step 4: Print the new dataframe with all columns (but not all rows)
print("Updated final_df shape:", final_df.shape)
display(final_df.head())

In [None]:
# Per-column count of missing entries
print("Missing values in dataset:")
print(final_df.isna().sum())

# Per-column count of +/-inf entries; only numeric columns are inspected
# because np.isinf is applied to the numeric subset of the frame.
numeric_df = final_df.select_dtypes(include=[np.number])
inf_counts = np.isinf(numeric_df).sum()
print("\nInfinite values in numeric columns:")
print(inf_counts)

In [None]:
# Preview the first five rows of final_df
display(final_df.head(n=5))

In [None]:
# Turn +/-inf into NaN so the imputation below treats them as missing
final_df = final_df.replace([np.inf, -np.inf], np.nan)

# Numeric columns: fill the gaps of each column with that column's median
num_cols = final_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    column_median = final_df[col].median()
    final_df[col] = final_df[col].fillna(column_median)

# Object / categorical columns: fill the gaps with the label "missing".
# A categorical column only accepts values that are registered categories,
# so "missing" is added to its categories first when it is not there yet.
cat_cols = final_df.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    values = final_df[col]
    is_categorical = isinstance(values.dtype, pd.CategoricalDtype)
    if is_categorical and "missing" not in values.cat.categories:
        values = values.cat.add_categories("missing")
    final_df[col] = values.fillna("missing")

In [None]:
# Per-column count of missing entries
print("Missing values in dataset:")
print(final_df.isna().sum())

# Per-column count of +/-inf entries; only numeric columns are inspected
# because np.isinf is applied to the numeric subset of the frame.
numeric_df = final_df.select_dtypes(include=[np.number])
inf_counts = np.isinf(numeric_df).sum()
print("\nInfinite values in numeric columns:")
print(inf_counts)

In [None]:
# Preview the first five rows of final_df
display(final_df.head(n=5))

In [None]:
# Save the cleaned DataFrame as a Parquet file (pandas picks the engine).
# The file previously carried a ".csv" extension although its content is
# Parquet, which breaks any reader that trusts the extension; it now ends in
# ".parquet". NOTE(review): update downstream readers of the old path.
# index=False leaves the DataFrame index out of the file.
final_df.to_parquet("../../datasets/final_dataset.parquet", index=False)