Data source: Credit Card Transactions Dataset (Kaggle) — https://www.kaggle.com/datasets/priyamchoksi/credit-card-transactions-dataset

In [18]:
import os

import altair as alt
import matplotlib as plt
import plotly.express as px
import seaborn as sns
from geopy.geocoders import Nominatim
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import column
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType

In [2]:
# Location of the transactions CSV. Set the CREDIT_CARD_TRANSACTIONS_CSV
# environment variable to point at a local copy of the dataset; when it is
# unset, the original hardcoded path is used as the fallback so existing
# setups keep working unchanged.
df_path = os.environ.get(
    "CREDIT_CARD_TRANSACTIONS_CSV",
    r"F:\Datasets\CSV datasets\credit_card_transactions.csv",
)

In [3]:
# Create the Spark session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.appName('Credit_Card_Transactions')
spark = session_builder.getOrCreate()

In [5]:
# Load the CSV at df_path: the first row supplies the column names and
# Spark infers each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(df_path)
)

In [12]:
# Count missing values per column: the result is a single row with one
# output column per input column, holding that column's number of nulls.
null_count_exprs = [
    F.sum(F.col(name).isNull().cast("int")).alias(name)
    for name in df.columns
]
df.select(*null_count_exprs).show()

+----------+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+-------------+
|Unnamed: 0|trans_date_trans_time|cc_num|merchant|category|amt|first|last|gender|street|city|state|zip|lat|long|city_pop|job|dob|trans_num|unix_time|merch_lat|merch_long|is_fraud|merch_zipcode|
+----------+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+-------------+
|         0|                    0|     0|       0|       0|  0|    0|   0|     0|     0|   0|    0|  0|  0|   0|       0|  0|  0|        0|        0|        0|         0|       0|       195973|
+----------+---------------------+------+--------+--------+---+-----+----+------+------+----+-----+---+---+----+--------+---+---+---------+---------+---------+----------+--------+-------------+



In [14]:
# Print df's column names, inferred types and nullability as a tree.
df.printSchema()

root
 |-- Unnamed: 0: integer (nullable = true)
 |-- trans_date_trans_time: timestamp (nullable = true)
 |-- cc_num: long (nullable = true)
 |-- merchant: string (nullable = true)
 |-- category: string (nullable = true)
 |-- amt: double (nullable = true)
 |-- first: string (nullable = true)
 |-- last: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- street: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- zip: integer (nullable = true)
 |-- lat: double (nullable = true)
 |-- long: double (nullable = true)
 |-- city_pop: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- dob: date (nullable = true)
 |-- trans_num: string (nullable = true)
 |-- unix_time: integer (nullable = true)
 |-- merch_lat: double (nullable = true)
 |-- merch_long: double (nullable = true)
 |-- is_fraud: integer (nullable = true)
 |-- merch_zipcode: integer (nullable = true)



In [15]:
# Preview the first 20 rows; long cell values are truncated for display.
df.show(n=20, truncate=True)

+----------+---------------------+-------------------+--------------------+-------------+------+-----------+---------+------+--------------------+--------------------+-----+-----+-------+------------------+--------+--------------------+----------+--------------------+----------+------------------+------------------+--------+-------------+
|Unnamed: 0|trans_date_trans_time|             cc_num|            merchant|     category|   amt|      first|     last|gender|              street|                city|state|  zip|    lat|              long|city_pop|                 job|       dob|           trans_num| unix_time|         merch_lat|        merch_long|is_fraud|merch_zipcode|
+----------+---------------------+-------------------+--------------------+-------------+------+-----------+---------+------+--------------------+--------------------+-----+-----+-------+------------------+--------+--------------------+----------+--------------------+----------+------------------+------------------+-

In [24]:
# Earliest and latest transaction timestamps in the dataset.
ts_col = "trans_date_trans_time"
date_range = df.select(
    F.min(ts_col).alias("min_date"),
    F.max(ts_col).alias("max_date"),
)
date_range.show()

+-------------------+-------------------+
|           min_date|           max_date|
+-------------------+-------------------+
|2019-01-01 00:00:18|2020-06-21 12:13:37|
+-------------------+-------------------+



In [28]:
# Earliest and latest transaction timestamps, plus the span between them
# expressed in seconds (difference of the two Unix timestamps).
ts = F.col("trans_date_trans_time")
first_seen = F.min(ts)
last_seen = F.max(ts)
span_seconds = F.unix_timestamp(last_seen) - F.unix_timestamp(first_seen)
df.agg(
    first_seen.alias("min_date"),
    last_seen.alias("max_date"),
    span_seconds.alias("time_diff_seconds"),
).show()

+-------------------+-------------------+-----------------+
|           min_date|           max_date|time_diff_seconds|
+-------------------+-------------------+-----------------+
|2019-01-01 00:00:18|2020-06-21 12:13:37|         46437199|
+-------------------+-------------------+-----------------+

