In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import polars as pl
import polars.selectors as cs
import altair as alt
import plotly.express as px
import plotly.graph_objects as go
import great_tables as tg
import datetime as dt
import numpy as np
from decouple import Config, RepositoryEnv

In [3]:
df_path = r"F:\Datasets\CSV datasets\credit_card_transactions.csv"

In [4]:
df = pl.read_csv(df_path, try_parse_dates=True, infer_schema=True, batch_size=50_000)

In [5]:
df = df.rename({
    'Unnamed: 0': 'user_id',
    'cc_num': 'credit_card_number',
    'merchant': 'merchant_name',
    'category': 'transaction_category',
    'amt': 'transaction_amount',
    'first': 'first_name',
    'last': 'last_name',
    'street': 'street_address',
    'zip': 'zip_code',
    'lat': 'latitude',
    'long': 'longitude',
    'city_pop': 'city_population',
    'dob': 'date_of_birth',
    'trans_num': 'transaction_number',
    'unix_time': 'unix_timestamp',
    'merch_lat': 'merchant_latitude',
    'merch_long': 'merchant_longitude'
})

In [6]:
df.null_count()

user_id,trans_date_trans_time,credit_card_number,merchant_name,transaction_category,transaction_amount,first_name,last_name,gender,street_address,city,state,zip_code,latitude,longitude,city_population,job,date_of_birth,transaction_number,unix_timestamp,merchant_latitude,merchant_longitude,is_fraud,merch_zipcode
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,195973


In [7]:
df['merch_zipcode'].drop_nulls().count()

1100702

In [8]:
df['merch_zipcode'].null_count()

195973

In [9]:
df = df.with_columns(
    pl.col('merch_zipcode').fill_null(-1)
)

In [10]:
df

user_id,trans_date_trans_time,credit_card_number,merchant_name,transaction_category,transaction_amount,first_name,last_name,gender,street_address,city,state,zip_code,latitude,longitude,city_population,job,date_of_birth,transaction_number,unix_timestamp,merchant_latitude,merchant_longitude,is_fraud,merch_zipcode
i64,datetime[μs],i64,str,str,f64,str,str,str,str,str,str,i64,f64,f64,i64,str,date,str,i64,f64,f64,i64,i64
0,2019-01-01 00:00:18,2703186189652095,"""fraud_Rippin, Kub and Mann""","""misc_net""",4.97,"""Jennifer""","""Banks""","""F""","""561 Perry Cove""","""Moravian Falls""","""NC""",28654,36.0788,-81.1781,3495,"""Psychologist, counselling""",1988-03-09,"""0b242abb623afc578575680df30655…",1325376018,36.011293,-82.048315,0,28705
1,2019-01-01 00:00:44,630423337322,"""fraud_Heller, Gutmann and Ziem…","""grocery_pos""",107.23,"""Stephanie""","""Gill""","""F""","""43039 Riley Greens Suite 393""","""Orient""","""WA""",99160,48.8878,-118.2105,149,"""Special educational needs teac…",1978-06-21,"""1f76529f8574734946361c461b024d…",1325376044,49.159047,-118.186462,0,-1
2,2019-01-01 00:00:51,38859492057661,"""fraud_Lind-Buckridge""","""entertainment""",220.11,"""Edward""","""Sanchez""","""M""","""594 White Dale Suite 530""","""Malad City""","""ID""",83252,42.1808,-112.262,4154,"""Nature conservation officer""",1962-01-19,"""a1a22d70485983eac12b5b88dad1cf…",1325376051,43.150704,-112.154481,0,83236
3,2019-01-01 00:01:16,3534093764340240,"""fraud_Kutch, Hermiston and Far…","""gas_transport""",45.0,"""Jeremy""","""White""","""M""","""9443 Cynthia Court Apt. 038""","""Boulder""","""MT""",59632,46.2306,-112.1138,1939,"""Patent attorney""",1967-01-12,"""6b849c168bdad6f867558c3793159a…",1325376076,47.034331,-112.561071,0,-1
4,2019-01-01 00:03:06,375534208663984,"""fraud_Keeling-Crist""","""misc_pos""",41.96,"""Tyler""","""Garcia""","""M""","""408 Bradley Rest""","""Doe Hill""","""VA""",24433,38.4207,-79.4629,99,"""Dance movement psychotherapist""",1986-03-28,"""a41d7549acf90789359a9aa5346dcb…",1325376186,38.674999,-78.632459,0,22844
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1296670,2020-06-21 12:12:08,30263540414123,"""fraud_Reichel Inc""","""entertainment""",15.56,"""Erik""","""Patterson""","""M""","""162 Jessica Row Apt. 072""","""Hatch""","""UT""",84735,37.7175,-112.4777,258,"""Geoscientist""",1961-11-24,"""440b587732da4dc1a6395aba5fb416…",1371816728,36.841266,-111.690765,0,-1
1296671,2020-06-21 12:12:19,6011149206456997,"""fraud_Abernathy and Sons""","""food_dining""",51.7,"""Jeffrey""","""White""","""M""","""8617 Holmes Terrace Suite 651""","""Tuscarora""","""MD""",21790,39.2667,-77.5101,100,"""Production assistant, televisi…",1979-12-11,"""278000d2e0d2277d1de2f890067dcc…",1371816739,38.906881,-78.246528,0,22630
1296672,2020-06-21 12:12:32,3514865930894695,"""fraud_Stiedemann Ltd""","""food_dining""",105.93,"""Christopher""","""Castaneda""","""M""","""1632 Cohen Drive Suite 639""","""High Rolls Mountain Park""","""NM""",88325,32.9396,-105.8189,899,"""Naval architect""",1967-08-30,"""483f52fe67fabef353d552c1e66297…",1371816752,33.619513,-105.130529,0,88351
1296673,2020-06-21 12:13:36,2720012583106919,"""fraud_Reinger, Weissnat and St…","""food_dining""",74.9,"""Joseph""","""Murray""","""M""","""42933 Ryan Underpass""","""Manderson""","""SD""",57756,43.3526,-102.5411,1126,"""Volunteer coordinator""",1980-08-18,"""d667cdcbadaaed3da3f4020e83591c…",1371816816,42.78894,-103.24116,0,69367


In [11]:
env_path = "F:/DataSpell/scikit-learn_training/.env"
config = Config(RepositoryEnv(env_path))
mapbox_key = config("MAPBOX_API_KEY")
px.set_mapbox_access_token(mapbox_key)

In [12]:
limited_df = df.head(50000)

fig = px.scatter_mapbox(
    limited_df,
    lat='latitude',
    lon='longitude',
    color='transaction_category',
    zoom=10,
    height=600
)

fig.write_html(r'F:\DataSpell\scikit-learn_training\credit_card_fraud_detection\map.html')

In [13]:
transaction_timestamp = df.select(
    pl.col('trans_date_trans_time').dt.year().alias('transaction_year'),
    pl.col('trans_date_trans_time').dt.month().alias('transaction_month'),
    pl.col('trans_date_trans_time').dt.day().alias('transaction_day'),
    pl.col('trans_date_trans_time').dt.hour().alias('transaction_hour'),
    pl.col('trans_date_trans_time').dt.minute().alias('transaction_minute'),
    pl.col('trans_date_trans_time').dt.second().alias('transaction_second'),
)

df.insert_column(2, transaction_timestamp['transaction_year'])
df.insert_column(3, transaction_timestamp['transaction_month'])
df.insert_column(4, transaction_timestamp['transaction_day'])
df.insert_column(5, transaction_timestamp['transaction_hour'])
df.insert_column(6, transaction_timestamp['transaction_minute'])
df.insert_column(7, transaction_timestamp['transaction_second'])

user_id,trans_date_trans_time,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_minute,transaction_second,credit_card_number,merchant_name,transaction_category,transaction_amount,first_name,last_name,gender,street_address,city,state,zip_code,latitude,longitude,city_population,job,date_of_birth,transaction_number,unix_timestamp,merchant_latitude,merchant_longitude,is_fraud,merch_zipcode
i64,datetime[μs],i32,i8,i8,i8,i8,i8,i64,str,str,f64,str,str,str,str,str,str,i64,f64,f64,i64,str,date,str,i64,f64,f64,i64,i64
0,2019-01-01 00:00:18,2019,1,1,0,0,18,2703186189652095,"""fraud_Rippin, Kub and Mann""","""misc_net""",4.97,"""Jennifer""","""Banks""","""F""","""561 Perry Cove""","""Moravian Falls""","""NC""",28654,36.0788,-81.1781,3495,"""Psychologist, counselling""",1988-03-09,"""0b242abb623afc578575680df30655…",1325376018,36.011293,-82.048315,0,28705
1,2019-01-01 00:00:44,2019,1,1,0,0,44,630423337322,"""fraud_Heller, Gutmann and Ziem…","""grocery_pos""",107.23,"""Stephanie""","""Gill""","""F""","""43039 Riley Greens Suite 393""","""Orient""","""WA""",99160,48.8878,-118.2105,149,"""Special educational needs teac…",1978-06-21,"""1f76529f8574734946361c461b024d…",1325376044,49.159047,-118.186462,0,-1
2,2019-01-01 00:00:51,2019,1,1,0,0,51,38859492057661,"""fraud_Lind-Buckridge""","""entertainment""",220.11,"""Edward""","""Sanchez""","""M""","""594 White Dale Suite 530""","""Malad City""","""ID""",83252,42.1808,-112.262,4154,"""Nature conservation officer""",1962-01-19,"""a1a22d70485983eac12b5b88dad1cf…",1325376051,43.150704,-112.154481,0,83236
3,2019-01-01 00:01:16,2019,1,1,0,1,16,3534093764340240,"""fraud_Kutch, Hermiston and Far…","""gas_transport""",45.0,"""Jeremy""","""White""","""M""","""9443 Cynthia Court Apt. 038""","""Boulder""","""MT""",59632,46.2306,-112.1138,1939,"""Patent attorney""",1967-01-12,"""6b849c168bdad6f867558c3793159a…",1325376076,47.034331,-112.561071,0,-1
4,2019-01-01 00:03:06,2019,1,1,0,3,6,375534208663984,"""fraud_Keeling-Crist""","""misc_pos""",41.96,"""Tyler""","""Garcia""","""M""","""408 Bradley Rest""","""Doe Hill""","""VA""",24433,38.4207,-79.4629,99,"""Dance movement psychotherapist""",1986-03-28,"""a41d7549acf90789359a9aa5346dcb…",1325376186,38.674999,-78.632459,0,22844
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1296670,2020-06-21 12:12:08,2020,6,21,12,12,8,30263540414123,"""fraud_Reichel Inc""","""entertainment""",15.56,"""Erik""","""Patterson""","""M""","""162 Jessica Row Apt. 072""","""Hatch""","""UT""",84735,37.7175,-112.4777,258,"""Geoscientist""",1961-11-24,"""440b587732da4dc1a6395aba5fb416…",1371816728,36.841266,-111.690765,0,-1
1296671,2020-06-21 12:12:19,2020,6,21,12,12,19,6011149206456997,"""fraud_Abernathy and Sons""","""food_dining""",51.7,"""Jeffrey""","""White""","""M""","""8617 Holmes Terrace Suite 651""","""Tuscarora""","""MD""",21790,39.2667,-77.5101,100,"""Production assistant, televisi…",1979-12-11,"""278000d2e0d2277d1de2f890067dcc…",1371816739,38.906881,-78.246528,0,22630
1296672,2020-06-21 12:12:32,2020,6,21,12,12,32,3514865930894695,"""fraud_Stiedemann Ltd""","""food_dining""",105.93,"""Christopher""","""Castaneda""","""M""","""1632 Cohen Drive Suite 639""","""High Rolls Mountain Park""","""NM""",88325,32.9396,-105.8189,899,"""Naval architect""",1967-08-30,"""483f52fe67fabef353d552c1e66297…",1371816752,33.619513,-105.130529,0,88351
1296673,2020-06-21 12:13:36,2020,6,21,12,13,36,2720012583106919,"""fraud_Reinger, Weissnat and St…","""food_dining""",74.9,"""Joseph""","""Murray""","""M""","""42933 Ryan Underpass""","""Manderson""","""SD""",57756,43.3526,-102.5411,1126,"""Volunteer coordinator""",1980-08-18,"""d667cdcbadaaed3da3f4020e83591c…",1371816816,42.78894,-103.24116,0,69367


In [14]:
{index: name for index, name in enumerate(df.columns)}

{0: 'user_id',
 1: 'trans_date_trans_time',
 2: 'transaction_year',
 3: 'transaction_month',
 4: 'transaction_day',
 5: 'transaction_hour',
 6: 'transaction_minute',
 7: 'transaction_second',
 8: 'credit_card_number',
 9: 'merchant_name',
 10: 'transaction_category',
 11: 'transaction_amount',
 12: 'first_name',
 13: 'last_name',
 14: 'gender',
 15: 'street_address',
 16: 'city',
 17: 'state',
 18: 'zip_code',
 19: 'latitude',
 20: 'longitude',
 21: 'city_population',
 22: 'job',
 23: 'date_of_birth',
 24: 'transaction_number',
 25: 'unix_timestamp',
 26: 'merchant_latitude',
 27: 'merchant_longitude',
 28: 'is_fraud',
 29: 'merch_zipcode'}

In [15]:
birth_date = df.select(
    pl.col('date_of_birth').dt.year().alias('year_of_birth'),
    pl.col('date_of_birth').dt.month().alias('month_of_birth'),
    pl.col('date_of_birth').dt.day().alias('day_of_birth')
)

df.insert_column(24, birth_date['year_of_birth'])
df.insert_column(25, birth_date['month_of_birth'])
df.insert_column(26, birth_date['day_of_birth'])

user_id,trans_date_trans_time,transaction_year,transaction_month,transaction_day,transaction_hour,transaction_minute,transaction_second,credit_card_number,merchant_name,transaction_category,transaction_amount,first_name,last_name,gender,street_address,city,state,zip_code,latitude,longitude,city_population,job,date_of_birth,year_of_birth,month_of_birth,day_of_birth,transaction_number,unix_timestamp,merchant_latitude,merchant_longitude,is_fraud,merch_zipcode
i64,datetime[μs],i32,i8,i8,i8,i8,i8,i64,str,str,f64,str,str,str,str,str,str,i64,f64,f64,i64,str,date,i32,i8,i8,str,i64,f64,f64,i64,i64
0,2019-01-01 00:00:18,2019,1,1,0,0,18,2703186189652095,"""fraud_Rippin, Kub and Mann""","""misc_net""",4.97,"""Jennifer""","""Banks""","""F""","""561 Perry Cove""","""Moravian Falls""","""NC""",28654,36.0788,-81.1781,3495,"""Psychologist, counselling""",1988-03-09,1988,3,9,"""0b242abb623afc578575680df30655…",1325376018,36.011293,-82.048315,0,28705
1,2019-01-01 00:00:44,2019,1,1,0,0,44,630423337322,"""fraud_Heller, Gutmann and Ziem…","""grocery_pos""",107.23,"""Stephanie""","""Gill""","""F""","""43039 Riley Greens Suite 393""","""Orient""","""WA""",99160,48.8878,-118.2105,149,"""Special educational needs teac…",1978-06-21,1978,6,21,"""1f76529f8574734946361c461b024d…",1325376044,49.159047,-118.186462,0,-1
2,2019-01-01 00:00:51,2019,1,1,0,0,51,38859492057661,"""fraud_Lind-Buckridge""","""entertainment""",220.11,"""Edward""","""Sanchez""","""M""","""594 White Dale Suite 530""","""Malad City""","""ID""",83252,42.1808,-112.262,4154,"""Nature conservation officer""",1962-01-19,1962,1,19,"""a1a22d70485983eac12b5b88dad1cf…",1325376051,43.150704,-112.154481,0,83236
3,2019-01-01 00:01:16,2019,1,1,0,1,16,3534093764340240,"""fraud_Kutch, Hermiston and Far…","""gas_transport""",45.0,"""Jeremy""","""White""","""M""","""9443 Cynthia Court Apt. 038""","""Boulder""","""MT""",59632,46.2306,-112.1138,1939,"""Patent attorney""",1967-01-12,1967,1,12,"""6b849c168bdad6f867558c3793159a…",1325376076,47.034331,-112.561071,0,-1
4,2019-01-01 00:03:06,2019,1,1,0,3,6,375534208663984,"""fraud_Keeling-Crist""","""misc_pos""",41.96,"""Tyler""","""Garcia""","""M""","""408 Bradley Rest""","""Doe Hill""","""VA""",24433,38.4207,-79.4629,99,"""Dance movement psychotherapist""",1986-03-28,1986,3,28,"""a41d7549acf90789359a9aa5346dcb…",1325376186,38.674999,-78.632459,0,22844
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1296670,2020-06-21 12:12:08,2020,6,21,12,12,8,30263540414123,"""fraud_Reichel Inc""","""entertainment""",15.56,"""Erik""","""Patterson""","""M""","""162 Jessica Row Apt. 072""","""Hatch""","""UT""",84735,37.7175,-112.4777,258,"""Geoscientist""",1961-11-24,1961,11,24,"""440b587732da4dc1a6395aba5fb416…",1371816728,36.841266,-111.690765,0,-1
1296671,2020-06-21 12:12:19,2020,6,21,12,12,19,6011149206456997,"""fraud_Abernathy and Sons""","""food_dining""",51.7,"""Jeffrey""","""White""","""M""","""8617 Holmes Terrace Suite 651""","""Tuscarora""","""MD""",21790,39.2667,-77.5101,100,"""Production assistant, televisi…",1979-12-11,1979,12,11,"""278000d2e0d2277d1de2f890067dcc…",1371816739,38.906881,-78.246528,0,22630
1296672,2020-06-21 12:12:32,2020,6,21,12,12,32,3514865930894695,"""fraud_Stiedemann Ltd""","""food_dining""",105.93,"""Christopher""","""Castaneda""","""M""","""1632 Cohen Drive Suite 639""","""High Rolls Mountain Park""","""NM""",88325,32.9396,-105.8189,899,"""Naval architect""",1967-08-30,1967,8,30,"""483f52fe67fabef353d552c1e66297…",1371816752,33.619513,-105.130529,0,88351
1296673,2020-06-21 12:13:36,2020,6,21,12,13,36,2720012583106919,"""fraud_Reinger, Weissnat and St…","""food_dining""",74.9,"""Joseph""","""Murray""","""M""","""42933 Ryan Underpass""","""Manderson""","""SD""",57756,43.3526,-102.5411,1126,"""Volunteer coordinator""",1980-08-18,1980,8,18,"""d667cdcbadaaed3da3f4020e83591c…",1371816816,42.78894,-103.24116,0,69367


In [16]:
X = df.select(cs.exclude(['user_id', 'unix_timestamp', 'trans_date_trans_time', 'date_of_birth', 'is_fraud', 'first_name', 'last_name']))
y = df.get_column('is_fraud')

In [21]:
# from sklearn.base import BaseEstimator, TransformerMixin
# from sklearn.pipeline import Pipeline
# from sklearn.feature_extraction import FeatureHasher
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# import category_encoders as ce

In [22]:
X.head(1)

transaction_year,transaction_month,transaction_day,transaction_hour,transaction_minute,transaction_second,credit_card_number,merchant_name,transaction_category,transaction_amount,gender,street_address,city,state,zip_code,latitude,longitude,city_population,job,year_of_birth,month_of_birth,day_of_birth,transaction_number,merchant_latitude,merchant_longitude,merch_zipcode
i32,i8,i8,i8,i8,i8,i64,str,str,f64,str,str,str,str,i64,f64,f64,i64,str,i32,i8,i8,str,f64,f64,i64
2019,1,1,0,0,18,2703186189652095,"""fraud_Rippin, Kub and Mann""","""misc_net""",4.97,"""F""","""561 Perry Cove""","""Moravian Falls""","""NC""",28654,36.0788,-81.1781,3495,"""Psychologist, counselling""",1988,3,9,"""0b242abb623afc578575680df30655…",36.011293,-82.048315,28705


In [23]:
# class MultiColumnFeatureHasher(BaseEstimator, TransformerMixin):
#
#     def __init__(self, columns, n_features=10):
#
#         self.columns = columns
#         self.n_features = n_features
#         self.hasher = FeatureHasher(n_features=self.n_features, input_type='dict')
#
#     def fit(self, X, y=None):
#         return self
#
#     def transform(self, X):
#         dict_list = []
#         for _, row in X.iterrows():
#             row_dict = {f"{col}={row[col]}": 1 for col in self.columns}
#             dict_list.append(row_dict)
#
#         return self.hasher.transform(dict_list)

In [25]:
# columns_to_encode = ['merchant_name', 'transaction_category', 'gender',
#                      'street_address', 'city', 'state', 'job', 'transaction_number']
#
# high_card_columns = ['merchant_name', 'transaction_category']
# low_card_columns = ['street_address', 'city', 'state', 'job', 'transaction_number']
#
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('hash_enc', MultiColumnFeatureHasher(columns=high_card_columns, n_features=10), high_card_columns),
#         ('onehot', OneHotEncoder(sparse_output=False, drop='first'), low_card_columns)
#     ],
#     remainder='passthrough'
# )


In [56]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler

In [57]:
categorical_columns = X.select(~cs.numeric())
numerical_columns = X.select(cs.numeric())

In [58]:
preprocesing = ColumnTransformer(
    transformers=[
        ('categorical', OrdinalEncoder(), categorical_columns),
        ('numerical', StandardScaler(), numerical_columns)
    ]
)

In [59]:
pipeline = Pipeline(steps=[('preprocessor', preprocesing)])

In [61]:
X_encoded = pipeline.fit_transform(X)

ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed