# Analysis

## 1. Setup

In [1]:
import utils.plot as plot

import pandas as pd
import numpy as np

from scipy.stats import f_oneway

import seaborn as sns

In [2]:
# Use seaborn's "whitegrid" style as the plotting theme for the notebook.
sns.set_theme(style="whitegrid")

In [3]:
# Relative paths of the train / test CSV files.
TRAIN_PATH = "data/fraudTrain.csv"
TEST_PATH = "data/fraudTest.csv"

# Load both splits with pandas' default CSV parsing options.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

## 2. Data Overview

In [None]:
# Print the (rows, columns) shape of each split, one per line.
print(
    "Shapes:",
    f"  Train: {train_df.shape}",
    f"  Test: {test_df.shape}",
    sep="\n",
)

In [None]:
# Percentage of training rows per `is_fraud` value; the label 1 is reported as
# the positive class.
class_distribution_pct = train_df["is_fraud"].value_counts(normalize=True).mul(100)
print(f"Positive Class Distribution: {class_distribution_pct.loc[1]:.2f}%")

In [None]:
# Print column names, non-null counts and dtypes for both splits.
train_df.info()
test_df.info()

In [None]:
# Show the first ten training rows.
train_df.head(10)

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
train_df.describe()

## 3. Data cleaning

### 3.1 Drop unnecessary columns

In [None]:
# Remove the "Unnamed: 0" column from train_df in place; the popped Series is
# kept and shown as the cell output.
del_col_unnamed = train_df.pop("Unnamed: 0")
del_col_unnamed

In [None]:
# Remove the "unix_time" column from train_df in place; the popped Series is
# kept and shown as the cell output.
del_col_unix_time = train_df.pop("unix_time")
del_col_unix_time

In [None]:
# Remove the "trans_num" column from train_df in place; the popped Series is
# kept and shown as the cell output.
del_col_trans_num = train_df.pop("trans_num")
del_col_trans_num

In [None]:
# Remove the "first" and "last" columns from train_df in place, then show the
# two popped Series side by side.
del_col_first = train_df.pop("first")
del_col_last = train_df.pop("last")
pd.concat([del_col_first, del_col_last], axis=1)

### 3.2 Change types

In [None]:
# Columns stored with pandas' "category" dtype.
categories = [
    "merchant",
    "category",
    "street",
    "city",
    "state",
    "job",
    "cc_num",
    "gender",
]
# Cast one column at a time, replacing each column in train_df.
for column in categories:
    train_df[column] = train_df[column].astype("category")
train_df[categories]

In [None]:
# Parse the two date-like columns into datetime values. `dob` uses an explicit
# year-month-day format; the transaction timestamp's format is left for pandas
# to infer.
train_df["trans_date_trans_time"] = pd.to_datetime(train_df["trans_date_trans_time"])
train_df["dob"] = pd.to_datetime(train_df["dob"], format="%Y-%m-%d")
# Preview the converted columns.
train_df[["trans_date_trans_time", "dob"]].head()

In [None]:
# Preview the frame after the dtype conversions.
train_df.head()

### 3.3 Check for duplicates

In [None]:
# Number of rows that exactly repeat an earlier row (first occurrences are not
# counted).
int(train_df.duplicated().sum())

## 4. Univariate Analysis

In [None]:
# Column names grouped by dtype. The card number is left out of the categorical
# features and the target `is_fraud` is left out of the numeric ones.
cat_features = train_df.select_dtypes("category").columns.drop("cc_num")
dt_features = train_df.select_dtypes("datetime").columns
num_features = train_df.select_dtypes("number").columns.drop("is_fraud")

cat_features, dt_features, num_features

In [None]:
# Plot the numeric features with the project's `utils.plot` helper
# (its exact chart type is defined outside this notebook).
plot.num_features(num_features, train_df)

## 5. Bivariate Analysis

In [None]:
# Bivariate plots for the numeric features via the project's `utils.plot` helper.
# NOTE(review): `num_features[1:]` leaves out the first numeric column, and the
# meaning of the trailing `3` is defined by the helper — confirm both are intended.
plot.bi_num_features(num_features[1:], train_df, 3)

In [None]:
# Bivariate plots for the categorical features via the project's `utils.plot` helper.
plot.bi_cat_features(cat_features, train_df)

## 6. Feature Engineering

In [None]:
# Preview the frame before new features are added.
train_df.head()

### 6.1 Date and time

In [214]:
def encode_cyclic_features(df: pd.DataFrame, col: str, period: int):
    """Return a copy of ``df`` with sine/cosine encodings of a periodic column.

    The values of ``col`` are mapped to an angle ``2 * pi * value / period``;
    the sine and cosine of that angle are stored in ``"<col>_sin"`` and
    ``"<col>_cos"``. The input frame is not modified.

    Args:
        df: Frame containing the column to encode.
        col: Name of the numeric column to encode.
        period: Length of one full cycle (e.g. 24 for hours of the day).

    Returns:
        A new DataFrame with the two extra columns appended.
    """
    angle = 2 * np.pi * df[col] / period
    encoded = {f"{col}_sin": np.sin(angle), f"{col}_cos": np.cos(angle)}
    return df.assign(**encoded)

#### 6.1.1 Basic

In [215]:
# Calendar components extracted from the transaction timestamp.
train_df["hour"] = train_df["trans_date_trans_time"].dt.hour  # type: ignore
train_df["day_of_week"] = train_df["trans_date_trans_time"].dt.dayofweek  # type: ignore
train_df["month"] = train_df["trans_date_trans_time"].dt.month  # type: ignore

In [216]:
# Add sine/cosine encodings for each calendar component, using the length of
# its cycle as the period.
for feature, cycle_length in (("hour", 24), ("day_of_week", 7), ("month", 12)):
    train_df = encode_cyclic_features(train_df, feature, cycle_length)

#### 6.1.2 Periods

In [217]:
# Weekend: day_of_week of 5 or higher. Night: hour from 0 to 5 inclusive.
train_df["is_weekend"] = train_df["day_of_week"].ge(5)
train_df["is_night"] = train_df["hour"].between(0, 5)

#### 6.1.3 Time since ...

In [218]:
# Seconds elapsed since the same card's previous transaction (NaN for a card's
# first transaction).
# - Rows are ordered chronologically first (stable sort, ties keep their
#   original order) so diff() always compares a transaction with the one just
#   before it; the assignment re-aligns the result to train_df by index.
# - total_seconds() is used because `.dt.seconds` only returns the sub-day
#   component of a Timedelta and drops whole days for gaps of 24 hours or more.
train_df["time_since_last_trans_cc"] = (
    train_df.sort_values("trans_date_trans_time", kind="stable")
    .groupby("cc_num", observed=True)["trans_date_trans_time"]
    .diff()
    .dt.total_seconds()  # type: ignore
)

In [219]:
# A missing gap means the card has no earlier transaction (diff() yields NaN for
# the first row of each group). Record that as a flag before the gap itself is
# replaced by 0.
train_df["first_trans_cc"] = train_df["time_since_last_trans_cc"].isna()
train_df["time_since_last_trans_cc"] = train_df["time_since_last_trans_cc"].fillna(0)

In [None]:
# Show the two new columns ordered by card and then by transaction time.
train_df.sort_values(["cc_num", "trans_date_trans_time"])[
    ["time_since_last_trans_cc", "first_trans_cc"]
]

In [221]:
# Seconds elapsed since the same merchant's previous transaction (NaN for a
# merchant's first transaction).
# - Rows are ordered chronologically first (stable sort, ties keep their
#   original order) so diff() always compares a transaction with the one just
#   before it; the assignment re-aligns the result to train_df by index.
# - total_seconds() is used because `.dt.seconds` only returns the sub-day
#   component of a Timedelta and drops whole days for gaps of 24 hours or more.
train_df["time_since_last_trans_merchant"] = (
    train_df.sort_values("trans_date_trans_time", kind="stable")
    .groupby("merchant", observed=True)["trans_date_trans_time"]
    .diff()
    .dt.total_seconds()  # type: ignore
)

In [222]:
# A missing gap means the merchant has no earlier transaction (diff() yields NaN
# for the first row of each group). Record that as a flag before the gap itself
# is replaced by 0.
train_df["first_trans_merchant"] = train_df["time_since_last_trans_merchant"].isna()
train_df["time_since_last_trans_merchant"] = train_df[
    "time_since_last_trans_merchant"
].fillna(0)

In [None]:
# Show the two new columns ordered by merchant and then by transaction time.
train_df.sort_values(["merchant", "trans_date_trans_time"])[
    ["time_since_last_trans_merchant", "first_trans_merchant"]
]

#### 6.1.4 Frequency

In [224]:
def rolling_frequency(df, window, unit):
    """Count each card's transactions inside a trailing time window.

    For every ``cc_num`` group, a rolling window of ``f"{window}{unit}"`` over
    ``trans_date_trans_time`` is evaluated and the number of rows in the window
    is written back at the row's own index label.

    Args:
        df: Frame with ``cc_num`` and ``trans_date_trans_time`` columns. The
            timestamps must already be in chronological order within each
            card, since pandas' time-based rolling rejects a non-monotonic
            ``on`` column.
        window: Size of the window, in ``unit``.
        unit: pandas offset alias for the window size (e.g. ``"min"``, ``"h"``).

    Returns:
        A Series indexed like ``df`` holding the per-row counts.
    """
    offset = f"{window}{unit}"
    frequencies = pd.Series(0, index=df.index)

    per_card = df.groupby("cc_num", sort=False, observed=False)
    for _, card_rows in per_card:
        roller = card_rows.rolling(offset, on="trans_date_trans_time")
        frequencies.loc[card_rows.index] = roller["cc_num"].count()

    return frequencies

In [225]:
# Put rows in chronological order before computing time-based rolling counts.
train_df.sort_values("trans_date_trans_time", inplace=True)

# Column suffix -> (window size, pandas offset alias).
for suffix, (size, alias) in {
    "1m": (1, "min"),
    "5m": (5, "min"),
    "1h": (1, "h"),
    "1d": (1, "D"),
    "7d": (7, "D"),
}.items():
    train_df[f"frequency_{suffix}"] = rolling_frequency(train_df, size, alias)

In [None]:
# Show the five rolling-frequency columns.
train_df[
    ["frequency_1m", "frequency_5m", "frequency_1h", "frequency_1d", "frequency_7d"]
]

### 6.2 Is new ...

In [227]:
# Flag the first row, in current row order, of every (card, street), (card, city)
# and (card, state) combination.
for place in ("street", "city", "state"):
    repeated = train_df.duplicated(["cc_num", place], keep="first")
    train_df[f"is_new_{place}"] = ~repeated

In [None]:
# Show the card number next to the three "is new" flags.
train_df[["cc_num", "is_new_street", "is_new_city", "is_new_state"]]

### 6.3 Merchant

In [229]:
# Category name with a trailing "_pos" or "_net" suffix removed, stored as a
# categorical column.
train_df["category_base"] = (
    train_df["category"].str.replace(r"_(pos|net)$", "", regex=True).astype("category")
)

In [None]:
# Compare the original category with its suffix-free form.
train_df[["category", "category_base"]]

In [None]:
# Derive the channel from the category suffix: "_net" -> "net", "_pos" -> "pos",
# anything else stays "other".
train_df["channel"] = "other"
for suffix in ("net", "pos"):
    has_suffix = train_df["category"].str.endswith(f"_{suffix}")
    train_df.loc[has_suffix, "channel"] = suffix

train_df["channel"] = train_df["channel"].astype("category")
train_df[["category", "channel"]]

### 6.4 Personality

#### 6.4.1 Age

In [232]:
# Age at transaction time, expressed in seconds (transaction timestamp minus
# date of birth).
train_df["age_at_trans"] = (train_df["trans_date_trans_time"] - train_df["dob"]).dt.total_seconds()  # type: ignore

In [None]:
# Show the new age column.
train_df["age_at_trans"]

#### 6.4.2 Location

In [234]:
# Per-card "home" profile: most frequent city and state, and median coordinates.
#
# Series.mode returns every tied value, so after the inner index level is
# dropped a card with a tie would appear more than once in the index, and a
# later `Series.map(home_city)` lookup fails on a non-unique index. Only the
# first mode per card is therefore kept.
home_city = (
    train_df.groupby("cc_num", observed=False)["city"]
    .apply(pd.Series.mode)
    .reset_index(level=1, drop=True)
)
home_city = home_city[~home_city.index.duplicated(keep="first")]

home_state = (
    train_df.groupby("cc_num", observed=False)["state"]
    .apply(pd.Series.mode)
    .reset_index(level=1, drop=True)
)
home_state = home_state[~home_state.index.duplicated(keep="first")]

home_long = train_df.groupby("cc_num", observed=False)["long"].median()
home_lat = train_df.groupby("cc_num", observed=False)["lat"].median()

In [236]:
# Whether the row's city / state equals the card's most frequent one.
train_df["is_home_city"] = train_df["city"] == train_df["cc_num"].map(home_city)
train_df["is_home_state"] = train_df["state"] == train_df["cc_num"].map(home_state)

# Signed difference between the merchant coordinate and the card's median
# coordinate (a raw coordinate difference, not a geographic distance).
train_df["dist_to_home_long"] = train_df["merch_long"] - train_df["cc_num"].map(
    home_long
)
train_df["dist_to_home_lat"] = train_df["merch_lat"] - train_df["cc_num"].map(home_lat)

In [None]:
# Show the location columns next to the derived home features.
train_df[
    [
        "city",
        "state",
        "is_home_city",
        "is_home_state",
        "dist_to_home_long",
        "dist_to_home_lat",
    ]
]

#### 6.4.3 Usual

In [238]:
# Most frequent base category per card.
favourite_category = (
    train_df.groupby("cc_num", observed=False)["category_base"]
    .apply(pd.Series.mode)
    .reset_index(level=1, drop=True)
)
# mode() returns every tied value; keep only the first entry per card so the
# index is unique.
favourite_category = favourite_category[
    ~favourite_category.index.duplicated(keep="first")
]

# Median transaction amount per card.
median_amt = train_df.groupby("cc_num", observed=False)["amt"].median()

In [239]:
# Whether the row's base category equals the card's most frequent one.
train_df["is_fav_category"] = train_df["category_base"] == train_df["cc_num"].map(
    favourite_category
)
# Attach the card-level median amount to every row of that card.
train_df["median_amt"] = train_df["cc_num"].map(median_amt)

In [None]:
# Show the inputs next to the derived "usual behaviour" features.
train_df[["category_base", "is_fav_category", "amt", "median_amt"]]

## 7. Feature Importance Assessment

In [None]:
# Preview the frame with all engineered features.
train_df.head()

In [242]:
# Cast every boolean column to int so it is treated as a numeric column.
bool_cols = train_df.select_dtypes(include="bool").columns
train_df[bool_cols] = train_df[bool_cols].astype(int)

In [248]:
# Convert date of birth to whole seconds since the Unix epoch so it becomes a
# numeric column. Dividing the offset from the epoch by a one-second Timedelta
# gives the same result whatever the column's datetime64 resolution is, whereas
# casting to int64 and dividing by 10**9 is only correct for nanosecond data.
train_df["dob"] = (train_df["dob"] - pd.Timestamp("1970-01-01")) // pd.Timedelta(
    seconds=1
)

In [52]:
# Absolute pairwise correlation between all numeric columns. corr() is applied
# exactly once, to the data itself: applying it again to the resulting matrix
# would compare the columns' correlation profiles instead of the features.
correlations = train_df.select_dtypes("number").corr().abs()

# Order rows and columns by strength of association with the target so the most
# relevant features come first.
top_correlations = correlations["is_fraud"].sort_values(ascending=False)
correlations = correlations.reindex(
    index=top_correlations.index, columns=top_correlations.index
)

In [None]:
# Show every feature's entry in the `is_fraud` column of the matrix.
correlations["is_fraud"]

In [None]:
# Draw the reordered matrix with the project's `utils.plot` helper.
plot.correlation_matrix(correlations)

In [252]:
# Drop the two home-location flags in place.
# NOTE(review): presumably a decision based on the correlation results — confirm.
train_df.drop(["is_home_city", "is_home_state"], axis=1, inplace=True)

In [None]:
# Preview the frame after the drop.
train_df.head()

In [254]:
def single_anova_analysis(df: pd.DataFrame, col: str):
    """Run a one-way ANOVA of ``is_fraud`` across the levels of ``col``.

    The rows are split into one sample per distinct value of ``col`` and the
    ``is_fraud`` values of those samples are passed to ``scipy.stats.f_oneway``.

    Rows whose ``col`` value is missing are left out, and unused categorical
    levels produce no sample, so the test never receives an empty group.

    Args:
        df: Frame containing ``col`` and an ``is_fraud`` column.
        col: Name of the grouping column.

    Returns:
        A ``(f_stat, p_value)`` tuple as returned by ``f_oneway``.
    """
    # A single groupby pass builds all samples at once instead of scanning the
    # whole frame with a boolean mask for every level.
    grouped = df.groupby(col, observed=True, sort=False)["is_fraud"]
    groups = [values for _, values in grouped]
    f_stat, p_value = f_oneway(*groups)
    return f_stat, p_value


def multi_anova_analysis(df: pd.DataFrame, cols: list):
    """Run ``single_anova_analysis`` for several columns and tabulate the output.

    Args:
        df: Frame passed through to ``single_anova_analysis``.
        cols: Names of the grouping columns to test, one ANOVA per name.

    Returns:
        A DataFrame indexed by ``feature`` with ``f_stat`` and ``p_value``
        columns, one row per entry of ``cols`` in the given order.
    """

    def _summarise(name):
        # One table row for a single grouping column.
        f_stat, p_value = single_anova_analysis(df, name)
        return {"feature": name, "f_stat": f_stat, "p_value": p_value}

    table = pd.DataFrame([_summarise(name) for name in cols])
    return table.set_index("feature")

In [None]:
# One-way ANOVA of `is_fraud` against every categorical column.
anova_results = multi_anova_analysis(
    train_df, train_df.select_dtypes("category").columns.tolist()
)
anova_results