The following code is to create environment folder which will contain the kaggle credentials  

In [1]:
import os
os.makedirs('/root/.kaggle', exist_ok=True)

with open('/root/.kaggle/kaggle.json', 'w') as f:
    f.write('{"username":"aalharabi","key":"cc9637f8cc673ab18278a1c9d7c15834"}')

Then we will download the dataset from kaggle

In [2]:
!kaggle datasets download -d kartik2112/fraud-detection

Downloading fraud-detection.zip to /content
 97% 195M/202M [00:00<00:00, 208MB/s]
100% 202M/202M [00:01<00:00, 209MB/s]


Unzipping the dataset

In [3]:
!unzip fraud-detection

Archive:  fraud-detection.zip
  inflating: fraudTest.csv           
  inflating: fraudTrain.csv          


Installing Java

In [4]:
import os       #importing os to set environment variable
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.18" 2023-01-17
OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1)
OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu120.04.1, mixed mode, sharing)


Installing spark

In [5]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.4.0.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.0-py2.py3-none-any.whl size=311317145 sha256=8eb0efb4a11a4e774a0d7324451d8285f9c1cbe77f76f4dfb8d336942650a6bf
  Stored in directory: /root/.cache/pip/wheels/7b/1b/4b/3363a1d04368e7ff0d408e57ff57966fcdf00583774e761327
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.0


Preprocessing the data 

Since the dataset is already splited into train and test, we will combine them in one dataset then we will split them into 80% train, and 20% test

In [9]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import rand
from pyspark.sql.functions import isnan, when, count, col
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler, MinMaxScaler

def preprocessing():
  spark = SparkSession.builder.appName('MyApp') \
          .config('spark.ui.port', '4050') \
          .config("spark.driver.memory", "12g") \
          .getOrCreate()

  spark.sparkContext.setLogLevel("ERROR")

  train_csv = spark.read.csv('fraudTrain.csv', header=True)
  test_csv = spark.read.csv('fraudTest.csv', header=True)

  df = train_csv.union(test_csv)
  df = df.withColumn('amt', col('amt').cast('double'))
  df = df.withColumn('lat', col('lat').cast('double'))
  df = df.withColumn('long', col('long').cast('double'))
  df = df.withColumn('city_pop', col('city_pop').cast('double'))

  # Create a StringIndexer object
  category_indexer = StringIndexer(inputCol="category", outputCol="category_index")
  gender_indexer = StringIndexer(inputCol="gender", outputCol="gender_index")

  # Fit the StringIndexer to the data
  df = category_indexer.fit(df).transform(df)
  df = gender_indexer.fit(df).transform(df)

  # Use VectorAssembler to create the "features" column
  input_cols = ['amt', 'lat', 'long', 'city_pop', 'category_index', 'gender_index']
  assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
  df = assembler.transform(df)

  # Assuming "data" is a Spark dataframe with a column named "features"
  scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
  scalerModel = scaler.fit(df)
  df = scalerModel.transform(df)

  # Split the data into 80% train and 20% test
  train_df, test_df = df.randomSplit([0.8, 0.2], seed=42)

  # Drop rows which have NaN and null data
  train_df = train_df.dropna()
  test_df = test_df.dropna()

  # Convert the 'is_fraud' column to integer
  train_df = train_df.withColumn('label', col('is_fraud').cast('int'))
  test_df = test_df.withColumn('label', col('is_fraud').cast('int'))

  # Selecting only the relevant columns for training and testing
  selected_cols = ['label','features', 'scaledFeatures']
  train_df = train_df.select(selected_cols)
  test_df = test_df.select(selected_cols)

  return train_df, test_df

train_df, test_df = preprocessing()

Now we will build the Logistic Regression function then we will feed it with the train and test datasets

In [10]:
def logistic_regression(train_data, test_data):
  # Create a Logistic Regression model
  lr = LogisticRegression(featuresCol='features', labelCol='label')

  # Train the model on the training data
  lr_model = lr.fit(train_data)

  # Make predictions on the test data
  predictions = lr_model.transform(test_data)

  # Evaluate the model
  evaluator = BinaryClassificationEvaluator(labelCol='label')
  auc = evaluator.evaluate(predictions)

  return lr_model, predictions, auc

lr_model, lr_predictions, lr_auc = logistic_regression(train_df, test_df)

print("Area Under ROC Curve (AUC) on test data using Logistic Regression = %g" % lr_auc)

Area Under ROC Curve (AUC) on test data using Logistic Regression = 0.852991


Now we will build the Naive Bayes function then we will feed it with the train and test datasets

In [11]:
def naive_bayes(train_data, test_data):
    # Train a Naive Bayes model
    nb = NaiveBayes(smoothing=1.0, modelType="multinomial", featuresCol='scaledFeatures')
    nb_model = nb.fit(train_data)

    # Make predictions on test data
    predictions = nb_model.transform(test_data)

    # Evaluate the model using accuracy
    evaluator = BinaryClassificationEvaluator(labelCol="label")
    auc = evaluator.evaluate(predictions)

    return nb_model, predictions, auc

nb_model, nb_predictions, nb_auc = naive_bayes(train_df, test_df)

print("Area Under ROC Curve (AUC) on test data using Naive Bayes = %g" % nb_auc)

Area Under ROC Curve (AUC) on test data using Naive Bayes = 0.487739


We can conclude that Logistic Regression Fits our data more than the Naive Bayes