In [1]:
# Model training - base model
# tracking with mlflow
# Import the specific regression models from scikit-learn
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
import os
from mlflow.models import infer_signature
import pandas as pd
#from urlib.parse import urlparse
import mlflow
#from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from dotenv import load_dotenv
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector, make_column_transformer
import numpy as np
import mlflow.sklearn


In [None]:
# Load key/value pairs from a .env file (when python-dotenv finds one) into
# the process environment; the cell output is the call's return value.
load_dotenv()

In [None]:
def warn_on_missing_env(names, env=None):
    """Report which of ``names`` are unset or empty in ``env``.

    Parameters
    ----------
    names : iterable of str
        Environment variable names to look up.
    env : mapping, optional
        Mapping to inspect; defaults to ``os.environ``.

    Returns
    -------
    list of str
        The missing names, in the order they were given. A ``UserWarning``
        listing them is emitted when the list is non-empty.
    """
    import warnings  # local import so this cell does not depend on an edit to the imports cell

    source = os.environ if env is None else env
    missing = [name for name in names if not source.get(name)]
    if missing:
        warnings.warn(
            "Environment variables not set: " + ", ".join(missing)
            + ". Add them to the .env file or export them before tracking runs.",
            stacklevel=2,
        )
    return missing


# Settings the MLflow client reads from the process environment.
MLFLOW_ENV_VARS = (
    "MLFLOW_TRACKING_URI",
    "MLFLOW_TRACKING_USERNAME",
    "MLFLOW_TRACKING_PASSWORD",
)

# Values that are present are already in os.environ, so copying them back via
# `os.environ[k] = os.getenv(k)` changes nothing, and for an unset variable it
# raises TypeError because os.environ only accepts strings. Check for gaps
# explicitly instead so the problem is named.
missing_mlflow_vars = warn_on_missing_env(MLFLOW_ENV_VARS)

In [3]:
# Read the preprocessed transactions into `df` and preview the first five rows.
df = pd.read_csv(
    filepath_or_buffer="../data/processed/preprocessed_transactions.csv",
)
df.head(n=5)

Unnamed: 0,transaction_id,customer_id,timestamp,amount,merchant_category,merchant_id,device_id,location,is_fraud,hour_of_day,day_of_week,is_night,is_weekend,amount_scaled
0,159943,4575,2023-01-01 00:10:00,35747.94,7,7906,5,2,0,0,6,1,1,-0.580246
1,887,25,2023-01-01 00:10:00,41011.8,7,3162,4,0,1,0,6,1,1,-0.457207
2,2381,68,2023-01-01 00:21:00,97614.54,2,8815,2,5,0,0,6,1,1,0.865847
3,121485,3455,2023-01-01 00:36:00,58350.27,5,8060,0,3,0,0,6,1,1,-0.051931
4,88703,2519,2023-01-01 00:51:00,93539.79,7,5597,0,4,0,0,6,1,1,0.770603


In [6]:
# Read the raw bank transactions into `df_2` and preview the first five rows.
df_2 = pd.read_csv(
    filepath_or_buffer="../data/raw/bank_transactions.csv",
)
df_2.head(n=5)

Unnamed: 0,transaction_id,customer_id,timestamp,amount,merchant_category,merchant_id,device_id,location,is_fraud
0,159943,4575,2023-01-01 00:10:00,35747.94,Restaurants,M8906,iPhone_12,Ibadan,0
1,887,25,2023-01-01 00:10:00,41011.8,Restaurants,M4162,Windows_PC,Abuja,1
2,2381,68,2023-01-01 00:21:00,97614.54,Fashion,M9815,Samsung_S21,Lagos,0
3,121485,3455,2023-01-01 00:36:00,58350.27,Health,M9060,Infinix_Hot,Kaduna,0
4,88703,2519,2023-01-01 00:51:00,93539.79,Restaurants,M6597,Infinix_Hot,Kano,0


In [7]:
# Column names of the raw transactions frame (df_2, read from bank_transactions.csv).
df_2.columns

Index(['transaction_id', 'customer_id', 'timestamp', 'amount',
       'merchant_category', 'merchant_id', 'device_id', 'location',
       'is_fraud'],
      dtype='object')

In [4]:
# Column names of the preprocessed transactions frame (df).
df.columns

Index(['transaction_id', 'customer_id', 'timestamp', 'amount',
       'merchant_category', 'merchant_id', 'device_id', 'location', 'is_fraud',
       'hour_of_day', 'day_of_week', 'is_night', 'is_weekend',
       'amount_scaled'],
      dtype='object')

In [5]:
# Per-column dtypes of the preprocessed transactions frame (df).
df.dtypes

transaction_id         int64
customer_id            int64
timestamp             object
amount               float64
merchant_category      int64
merchant_id            int64
device_id              int64
location               int64
is_fraud               int64
hour_of_day            int64
day_of_week            int64
is_night               int64
is_weekend             int64
amount_scaled        float64
dtype: object

In [None]:
# Define features: candidate model inputs, grouped by intended treatment.
numeric_features = [
    'amount', 'amount_scaled', 'customer_avg_amount',
    'customer_std_amount', 'transaction_frequency',
    'distance_from_home', 'transaction_hour'
]

categorical_features = [
    'device_type', 'location', 'merchant_category', 'channel'
]

# Several of these names are absent from the df.columns listing produced
# earlier in the notebook, so indexing df with either list would raise
# KeyError. List the absent names as this cell's output so the gap is visible.
# NOTE(review): either engineer these columns before they are used or replace
# them with columns that exist in df — confirm which was intended.
missing_features = [
    name
    for name in numeric_features + categorical_features
    if name not in df.columns
]
missing_features

In [None]:
# Preprocessing pipeline
#
# Two numeric treatments, each applied to a disjoint set of columns:
#   * log_pipeline - median impute -> log1p -> standardise, for the skewed
#                    raw `amount` column only;
#   * num_pipeline - median impute -> standardise, for every other numeric
#                    column.
#
# The log step must not see every numeric column: the data preview shows
# zeros (e.g. hour_of_day) and negatives (amount_scaled), for which np.log
# yields -inf (rejected by StandardScaler) or NaN (passed through to the
# model). Selecting the same columns in both pipelines also emitted every
# feature twice.

# Regexes consumed by make_column_selector (matched against column names).
LOG_COLUMNS_PATTERN = r"^amount$"        # exactly the raw amount column
OTHER_COLUMNS_PATTERN = r"^(?!amount$)"  # any column name except `amount`

# Plain numeric treatment.
num_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)

# Log treatment for skewed features. log1p keeps a zero amount finite.
# NOTE(review): assumes amounts are non-negative — confirm against the raw data.
log_pipeline = make_pipeline(
    SimpleImputer(strategy="median"),
    FunctionTransformer(np.log1p, feature_names_out="one-to-one"),
    StandardScaler(),
)

# Build the column transformer. A selector that matches no column (e.g.
# `amount` was dropped upstream) simply contributes no output columns.
# NOTE(review): in df the target `is_fraud` and the id columns are numeric
# too; presumably they are dropped from X before fitting — verify.
preprocessing = make_column_transformer(
    (
        num_pipeline,
        make_column_selector(pattern=OTHER_COLUMNS_PATTERN, dtype_include=np.number),
    ),
    (
        log_pipeline,
        make_column_selector(pattern=LOG_COLUMNS_PATTERN, dtype_include=np.number),
    ),
)