<a href="https://colab.research.google.com/github/agodavarthy/fraudpolicing/blob/main/Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from google.colab import drive
import math
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder
import sys
import time

In [2]:
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/IK_ProjectUp/FraudDetection/May2/data/"

#train_df = pd.read_csv(file_path+"smaller_fraud.csv", index_col=0)
train_df = pd.read_csv(file_path+"fraudTrain.csv", index_col=0)
test_df = pd.read_csv(file_path+"fraudTest.csv", index_col=0)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
train_df.shape

(1296675, 22)

In [4]:
test_df.shape

(555719, 22)

In [5]:
df = pd.concat([train_df, test_df], axis=0)

In [6]:
df.shape

(1852394, 22)

In [7]:
def transform_datetime(df):
    if 'trans_date_trans_time' in df.columns and 'dob' in df.columns:
      df['trans_dt'] = pd.to_datetime(df['trans_date_trans_time'])
      df['trans_year'] = df['trans_dt'].dt.year
      df['trans_month'] = df['trans_dt'].dt.month
      df['trans_day'] = df['trans_dt'].dt.day
      df['trans_dayofweek'] = df['trans_dt'].dt.dayofweek
      df['trans_hour'] = df['trans_dt'].dt.hour
      df['dob_dt'] = pd.to_datetime(df['dob'])
    return df

In [8]:
def compute_transaction_freq(df, timestamp_col, entity_col, window='1H'):
    """
    Computes the transaction rate for each entity within a specified time window.

    Args:
        df (pd.DataFrame): The input DataFrame with transaction data.
        timestamp_col (str): The name of the timestamp column.
        entity_col (str): The name of the column that identifies the entity.
        window (str): The rolling time window for calculating the rate.

    Returns:
        pd.DataFrame: The input DataFrame with an added transaction rate column.
    """
    # Create a working copy
    result_df = df.copy()

    # Ensure timestamp is datetime type
    result_df[timestamp_col] = pd.to_datetime(result_df[timestamp_col])

    # Sort by entity and timestamp (critical for rolling window operations)
    result_df = result_df.sort_values(by=[entity_col, timestamp_col])

    # Name of the new rate column
    rate_col_name = f'transaction_rate_{window}'

    # Create a unique row identifier to ensure perfect alignment later
    result_df['_temp_row_id'] = range(len(result_df))

    # APPROACH: Process each entity group separately with explicit time-based calculations
    all_results = []

    for entity, group in result_df.groupby(entity_col):
        #print("Hey Jude")
        # Create a copy of this entity's data
        entity_df = group.copy()
        #print("grouped entity = ")
        #print(entity_df[['cc_num', 'trans_dt']])
        # Create a dummy column for counting
        entity_df['_count'] = 1
        #print("grouped entity(after count) = ")
        #print(entity_df[['cc_num', 'trans_dt', '_count']])
        #print("------------------")
        #sys.exit(1)
        # Set timestamp as index for time-based operations
        entity_df.set_index(timestamp_col, inplace=True)

        # Calculate rolling count within the window
        # For each row, this counts how many transactions happened in the preceding window
        entity_df[rate_col_name] = entity_df['_count'].rolling(window=window, closed='left').sum().fillna(0)

        # Reset index to get timestamp back as a column
        entity_df.reset_index(inplace=True)

        # Keep this group's results
        all_results.append(entity_df)

    # Combine all entity results
    combined_results = pd.concat(all_results, ignore_index=False)
    #print("Combined results = ", combined_results)
    # Sort by our temporary row ID to restore original order
    combined_results = combined_results.sort_values('_temp_row_id')

    # Clean up temporary columns
    combined_results.drop(columns=['_count', '_temp_row_id'], inplace=True)

    return combined_results

In [9]:
def compute_avg_transaction_amount(df, timestamp_col, entity_col, amount_col, window='1H'):
    """
    Computes the average transaction amount for each entity within a specified time window.

    Args:
        df (pd.DataFrame): The input DataFrame with transaction data.
        timestamp_col (str): The name of the timestamp column.
        entity_col (str): The name of the column that identifies the entity.
        amount_col (str): The name of the column containing transaction amounts.
        window (str): The rolling time window for calculating the average.
                      Examples: '1H' (1 hour), '1D' (1 day), '7D' (7 days), '30min'.

    Returns:
        pd.DataFrame: The input DataFrame with an added column representing the
                      average transaction amount per the specified time window.
                      The new column will be named 'avg_amount_' + window.
    """
    # Create a working copy
    result_df = df.copy()

    # Ensure timestamp is datetime type
    result_df[timestamp_col] = pd.to_datetime(result_df[timestamp_col])

    # Sort by entity and timestamp (critical for rolling window operations)
    result_df = result_df.sort_values(by=[entity_col, timestamp_col])

    # Name of the new rate column
    avg_col_name = f'avg_amount_{window}'

    # Create a unique row identifier to ensure perfect alignment later
    result_df['_temp_row_id'] = range(len(result_df))

    # Process each entity group separately
    all_results = []

    for entity, group in result_df.groupby(entity_col):
        # Create a copy of this entity's data
        entity_df = group.copy()

        # Set timestamp as index for time-based operations
        entity_df.set_index(timestamp_col, inplace=True)
        #print("entity_df[amount_col] = ", entity_df[[entity_col, amount_col]])
        """
        # Calculate rolling sum of amounts within the window
        rolling_sum = entity_df[amount_col].rolling(window=window, closed='left').sum()

        # Calculate rolling count of transactions within the window
        rolling_count = entity_df[amount_col].rolling(window=window, closed='left').count()

        # Calculate average amount (handling division by zero)
        # When count is 0, we'll get NaN which we fill with 0 (no transactions means avg amount is 0)
        entity_df[avg_col_name] = (rolling_sum / rolling_count).fillna(0)
        """

        rolling_mean = entity_df[amount_col].rolling(window=window, closed='left').mean().fillna(0)
        entity_df[avg_col_name] = rolling_mean
        # Reset index to get timestamp back as a column
        entity_df.reset_index(inplace=True)

        # Keep this group's results
        all_results.append(entity_df)

    # Combine all entity results
    combined_results = pd.concat(all_results, ignore_index=False)

    # Sort by our temporary row ID to restore original order
    combined_results = combined_results.sort_values('_temp_row_id')

    # Clean up temporary column
    combined_results.drop(columns=['_temp_row_id'], inplace=True)

    return combined_results

In [10]:
def compute_avg_transaction_interval(df, timestamp_col, entity_col, window='30D'):
    """
    Computes the average time interval between transactions for each entity within a specified time window.

    Args:
        df (pd.DataFrame): The input DataFrame with transaction data.
        timestamp_col (str): The name of the timestamp column.
        entity_col (str): The name of the column that identifies the entity (e.g., cc_num).
        window (str): The rolling time window for calculating the average interval.
                      Examples: '30D' (30 days), '1M' (1 month), '90D' (90 days).

    Returns:
        pd.DataFrame: The input DataFrame with an added column representing the
                      average transaction interval in hours within the specified time window.
                      The new column will be named 'avg_interval_hours_' + window.
    """
    # Create a working copy
    result_df = df.copy()

    # Ensure timestamp is datetime type
    result_df[timestamp_col] = pd.to_datetime(result_df[timestamp_col])

    # Sort by entity and timestamp (critical for calculating intervals)
    result_df = result_df.sort_values(by=[entity_col, timestamp_col])

    # Name of the new column
    interval_col_name = f'avg_interval_hours_{window}'

    # Create a unique row identifier to ensure perfect alignment later
    result_df['_temp_row_id'] = range(len(result_df))

    # Process each entity group separately
    all_results = []

    for entity, group in result_df.groupby(entity_col):
        # Create a copy of this entity's data
        entity_df = group.copy()

        # Calculate the time difference between consecutive transactions for this entity
        entity_df['_time_diff'] = entity_df[timestamp_col].diff()

        # Convert time differences to hours
        entity_df['_interval_hours'] = entity_df['_time_diff'].dt.total_seconds() / 3600

        # Set timestamp as index for time-based operations
        entity_df.set_index(timestamp_col, inplace=True)

        # For each transaction, calculate the average interval over the past window
        # We'll use a rolling window to look back at previous intervals

        # First, handle the first transaction for each entity (no interval)
        entity_df.loc[entity_df['_interval_hours'].isna(), '_interval_hours'] = 0

        # Calculate rolling mean of intervals within the specified window
        entity_df[interval_col_name] = entity_df['_interval_hours'].rolling(window=window, closed='left').mean().fillna(0)

        # Reset index to get timestamp back as a column
        entity_df.reset_index(inplace=True)

        # Keep this group's results
        all_results.append(entity_df)

    # Combine all entity results
    combined_results = pd.concat(all_results, ignore_index=False)

    # Sort by our temporary row ID to restore original order
    combined_results = combined_results.sort_values('_temp_row_id')

    # Clean up temporary columns
    combined_results.drop(columns=['_temp_row_id', '_time_diff', '_interval_hours'], inplace=True)

    return combined_results

In [11]:
from xgboost import XGBClassifier

In [12]:
def compute_age(df, curr_dt, dob_dt):
  df['cust_age'] = ((df[curr_dt] - df[dob_dt]).dt.days //365.25).astype(int)
  return df

In [13]:
def getModel(model_nm):
  if model_nm == "logistic":
    model = LogisticRegression(max_iter=200, random_state=42)
    return model
  elif model_nm == "xgb":
    model = XGBClassifier(random_state=42, eval_metric='logloss', use_label_encoder=False)
    """
    model = XGBClassifier(
      objective='binary:logistic',
      num_class="2",
      learning_rate=0.01,
      n_estimators=3
    )
    """
    return model
  else:
    print("Sorry....")

  return model

In [14]:
# Define a function to calculate the Haversine distance between two points
def calculate_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the Haversine distance in kilometers between two points
    specified by latitude and longitude.

    Args:
      lat1, lon1: Latitude and longitude of the first point.
      lat2, lon2: Latitude and longitude of the second point.

    Returns:
      Distance in kilometers.
    """
    # Earth radius in kilometers
    R = 6371

    # Convert latitude and longitude from degrees to radians
    phi1 = math.radians(lat1)
    phi2 = math.radians(lat2)
    delta_phi = math.radians(lat2 - lat1)
    delta_lambda = math.radians(lon2 - lon1)

    # Apply the Haversine formula
    a = math.sin(delta_phi / 2.0)**2 + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lambda / 2.0)**2
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    distance = R * c
    return distance

In [15]:
def data_transformation(data):

  df = data.copy()

  df = transform_datetime(df)
  df = compute_transaction_freq(df, "trans_dt", "cc_num", window='30D')

  df = compute_avg_transaction_amount(df, "trans_dt", "cc_num", 'amt', window='30D')

  df = compute_avg_transaction_interval(df, "trans_dt", "cc_num", window='30D')
  df = compute_age(df, 'trans_dt', 'dob_dt')

  df['distance'] = df.apply(lambda row: calculate_distance(row['lat'], row['long'], row['merch_lat'], row['merch_long']), axis=1)

  print("Data Columns -----")
  print(df.columns)

  label_encoder = LabelEncoder()

  categorical_cols = ['merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']
  label_encoders = [label_encoder.fit(df[col]) for col in categorical_cols]

  for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))

  all_categories = pd.unique(pd.concat([df['category']]))
  category_to_id = {category: i + 1 for i, category in enumerate(all_categories)}
  df['category'] = df['category'].map(category_to_id)

  print("Data transformation complete")
  return df

In [16]:
def handle_categorical_data(df):

  label_encoder = LabelEncoder()

  categorical_cols = ['merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'job']
  label_encoders = [label_encoder.fit(df[col]) for col in categorical_cols]

  for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col].astype(str))

  all_categories = pd.unique(pd.concat([df['category']]))
  category_to_id = {category: i + 1 for i, category in enumerate(all_categories)}
  df['category'] = df['category'].map(category_to_id)

  print("Data transformation complete")
  return df

In [17]:
def split_data(df, split_date):
  df = df.sort_values(by=['trans_date_trans_time','cc_num'])

  df = df.dropna()

  df_train = df[df['trans_date_trans_time'] < split_date]
  df_test = df[df['trans_date_trans_time'] >= split_date]


  y_train = df_train['is_fraud']
  x_train = df_train.drop(columns="is_fraud")

  y_test = df_test['is_fraud']
  x_test = df_test.drop(columns="is_fraud")

  x_train = x_train.drop(["dob", "cc_num", "dob_dt", "trans_num", "trans_dt", "trans_date_trans_time"], axis=1)
  x_test = x_test.drop(["dob", "cc_num", "dob_dt","trans_num", "trans_dt", "trans_date_trans_time"], axis=1)

  return x_train, y_train, x_test, y_test

In [18]:
def normalize_numeric_data(x_train, x_test, scaler_type):
    numerical_features = x_train.select_dtypes(include=['number']).columns.tolist()
      # Initialize the scaler
    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'minmax':
        scaler = MinMaxScaler()
    elif scaler_type == 'robust':
        scaler = RobustScaler()
    else:
        raise ValueError(f"Invalid scaler_type: {scaler_type}.  Must be 'standard', 'minmax', or 'robust'.")
    # Apply scaling to the numerical features in the training set
    x_train_scaled = x_train.copy() # Create a copy to avoid modifying the original DataFrame
    x_train_scaled[numerical_features] = scaler.fit_transform(x_train[numerical_features])

    # Apply the same scaling to the numerical features in the testing set
    x_test_scaled = x_test.copy() # Create a copy
    x_test_scaled[numerical_features] = scaler.transform(x_test[numerical_features])

    return x_train_scaled, x_test_scaled, numerical_features, scaler

In [19]:
def training_phase(model, x_train, y_train):
  #logreg = LogisticRegression(random_state=42)
  print("Starting training...")
  print("Shape of x_train:", x_train.shape)


  print("Shape of y_train:", y_train.shape)
  print("Shape of y_train:", y_train.dtype)

  print("Shape of y_test:", y_test.shape)
  print("Shape of y_test:", y_test.dtype)

  model.fit(x_train, y_train)
  print("Training complete")
  return model

In [20]:
def testing_phase(model, x_test, y_test):
  y_pred = model.predict(x_test)
  return y_pred

In [21]:
def evaluation(y_pred, y_test):
  print("got predictions:", y_pred)
  print("True predictions:", y_test)
  # Evaluate the model
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred)
  recall = recall_score(y_test, y_pred)
  f1 = f1_score(y_test, y_pred)
  report = classification_report(y_test, y_pred)

  return accuracy, precision, recall, f1, report

# Baseline performace

In [22]:
df.columns

Index(['trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [57]:
df = df.drop(['distance'], axis=1)

In [56]:
df.columns

Index(['trans_dt', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'trans_year', 'trans_month',
       'trans_day', 'trans_dayofweek', 'trans_hour', 'dob_dt', 'distance'],
      dtype='object')

In [23]:
df = data_transformation(df)
df.columns
df = handle_categorical_data(df)
split_date = "2020-06-01 00:00:01"
x_train, y_train, x_test, y_test = split_data(df, split_date)

Data Columns -----
Index(['trans_dt', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'trans_year', 'trans_month',
       'trans_day', 'trans_dayofweek', 'trans_hour', 'dob_dt',
       'transaction_rate_30D', 'avg_amount_30D', 'avg_interval_hours_30D',
       'cust_age', 'distance'],
      dtype='object')
Data transformation complete
Data transformation complete


In [26]:
scaler_type = "standard"
x_train_scaled, x_test_scaled, numerical_features, scaler = normalize_numeric_data(x_train, x_test, scaler_type)

In [27]:
model = getModel('xgb')
model = training_phase(model, x_train_scaled, y_train)
y_pred = testing_phase(model, x_test_scaled, y_test)

Starting training...
Shape of x_train: (1238928, 27)
Shape of y_train: (1238928,)
Shape of y_train: int64
Shape of y_test: (613466,)
Shape of y_test: int64


Parameters: { "use_label_encoder" } are not used.



Training complete


In [28]:
df.columns

Index(['trans_dt', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud', 'trans_year', 'trans_month',
       'trans_day', 'trans_dayofweek', 'trans_hour', 'dob_dt',
       'transaction_rate_30D', 'avg_amount_30D', 'avg_interval_hours_30D',
       'cust_age', 'distance'],
      dtype='object')

In [29]:
accuracy, precision, recall, f1, report = evaluation(y_pred, y_test)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", report)

got predictions: [0 0 0 ... 0 0 0]
True predictions: 2438    0
511     0
964     0
1001    0
997     0
       ..
2194    0
3660    0
3650    0
2920    0
3650    0
Name: is_fraud, Length: 613466, dtype: int64
Accuracy: 0.9987790684406308
Precision: 0.9635584137191854
Recall: 0.7252924566357403
F1 Score: 0.8276179516685845
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    610987
           1       0.96      0.73      0.83      2479

    accuracy                           1.00    613466
   macro avg       0.98      0.86      0.91    613466
weighted avg       1.00      1.00      1.00    613466



In [None]:
df.shape

(10000, 22)

# New feature: compute_transaction_rate

In [None]:
def compute_transaction_rate(df, timestamp_col, entity_col, window='1H'):
    """
    Computes the transaction rate for each entity within a specified time window.

    Args:
        df (pd.DataFrame): The input DataFrame with transaction data.
        timestamp_col (str): The name of the timestamp column.
        entity_col (str): The name of the column that identifies the entity
                          (e.g., user_id, account_id, card_id).
        window (str): The rolling time window for calculating the rate.
                      Examples: '1H' (1 hour), '1D' (1 day), '7D' (7 days), '30min'.

    Returns:
        pd.DataFrame: The input DataFrame with an added column representing the
                      transaction rate per the specified window. The new column
                      will be named 'transaction_rate_' + window.
    """

    # Ensure the timestamp column is in datetime format
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])  # Convert to datetime
    # Sort the DataFrame by the entity and then by timestamp
    df = df.sort_values(by=[entity_col, timestamp_col])
    # Calculate the rolling count of transactions within the specified time window
    rate_col_name = f'transaction_rate_{window}'
    # Apply rolling window and calculate mean within each group
    df[rate_col_name] = (
        df.groupby(entity_col)['amt']
        .rolling(window=window, closed='left')
        .mean()
        .reset_index(level=0, drop=True)  # Remove the group index
    )

    return df

In [None]:
new_train_df = compute_transaction_rate(train_df, 'trans_date_trans_time', 'cc_num', window=3)

# Run pipeline again

In [None]:
accuracy, precision, recall, f1, report = normalize_dataset_and_train_model(new_train_df)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Classification Report:\n", report)

Data transformation complete
Training complete
Accuracy: 0.9976744186046511
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       429
           1       0.00      0.00      0.00         1

    accuracy                           1.00       430
   macro avg       0.50      0.50      0.50       430
weighted avg       1.00      1.00      1.00       430



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
