Fraud Detection and Prediction - Quantum
==========================================

***Quantum Models Used***
* VQC (Variational Quantum Classifier)
* 


**Author:** *Bipul Sinha*

In [None]:
! pip install --upgrade imblearn qiskit-machine-learning qiskit-aer pylatexenc

In [None]:
import warnings 
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

import time

# 1. Data Exploration

In [None]:
# read_csv_file is a project-local helper that is handed the zip archive name instead of a CSV path.
# NOTE(review): presumably it extracts the CSV and returns a pandas DataFrame — confirm in load_dataset.
from load_dataset import read_csv_file
df = read_csv_file('creditcardfraud.zip') # provide name of the zip file instead of csv file
# Transposed summary statistics: one row per column of df.
df.describe().T # Data Summary

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# Column dtypes, non-null counts and memory usage.
df.info()

In [None]:
import seaborn as sns

# Correlation matrix of df's columns, rendered as a heatmap with every cell annotated to two decimals.
correlation_matrix = df.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cbar=True, cmap='viridis')
plt.show()

## 1a. Understanding and evaluating data
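* Since the data is dominated by Class 0, the views above tell us little about the other class, so the class distribution needs to be evaluated explicitly.
* In addition, fraudulent transactions are very few in number, so the share of fraudulent data has to be raised relative to the legal transactions.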

In [None]:
import pylab

# Class balance of the target column. In this notebook Class 0 is the majority (legal) class and
# Class 1 the rare (fraudulent) one, so the printed labels list them in that order.
target_count = df['Class'].value_counts()
print('Target distribution "0" & "1" in column "Class", legal and fraudulent transactions, respectively, pieces')
print('0:', target_count[0])
print('1:', target_count[1])
# Number of Class 0 rows per single Class 1 row.
print('Imbalance degree:', '1:', round(target_count[0] / target_count[1], 2))
print(' ')
print('Target distribution "0" & "1" in column "Class", legal and fraudulent transactions, respectively, %')
print((df.groupby('Class')['Class'].count() / df['Class'].count()) * 100)

# Small figure for the pie chart; legend takes a real boolean rather than the string "true".
pylab.rcParams['figure.figsize'] = (4, 3)
target_count.plot(kind='pie', title='Distribution of target variable', legend=True);

In [None]:
# One histogram per column of df, drawn on a single large figure.
df.hist(figsize=(25,20))
plt.show()

In [None]:
import numpy as np

# Pairs of *distinct* columns whose absolute correlation exceeds the threshold.
# Only the strict upper triangle of the matrix is scanned: the diagonal is always 1.0 (which would
# plot every column against itself) and the matrix is symmetric (which would plot every pair twice).
CORRELATION_THRESHOLD = 0.7  # Adjust threshold as needed
correlations = df.corr().abs()
# Take the labels from the correlation matrix itself so positions and names cannot drift apart.
features = list(correlations.columns)
strong_mask = np.triu(correlations.to_numpy() > CORRELATION_THRESHOLD, k=1)
feature_pairs = [(features[i], features[j]) for i, j in zip(*np.nonzero(strong_mask))]

# Loop variables are deliberately not called x / y: those names hold the feature matrix and the
# target vector elsewhere in this notebook.
for x_col, y_col in feature_pairs:
    plt.figure(figsize=(3, 3))
    sns.scatterplot(x=x_col, y=y_col, hue="Class", data=df, palette="tab10")
    plt.show()


# 2. Data Cleansing
* Since Class 0 has 284315 instances and Class 1 a mere 492, we need to resolve the data imbalance. This can be done by either over-sampling or under-sampling.
* Before that, we need to find duplicate rows and remove them from the data.

## Removing Duplicates

In [None]:
# Check for fully duplicated rows. `.any()` has to be called — without the parentheses the
# expression only references the method and performs no check at all.
duplicate_mask = df.duplicated()
print("Any duplicated rows:", duplicate_mask.any())
duplicate_mask.sum()

In [None]:
# Remove duplicated rows, then confirm that the duplicate count is now zero.
df = df.drop_duplicates()
df.duplicated().sum()

# 3. Solving Data Imbalance Problem

### 3a. Columnar Imbalance
- Normalization/Standardization - We can opt for either of them for Time and Amount column. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

amount_data = df['Amount'].values

# Histogram on a density scale, so it shares its y-axis with the normal PDF drawn on top of it.
# (On the default count scale the PDF, whose values are far below 1, is squashed onto the x-axis.)
plt.figure(figsize=(10, 6))
sns.histplot(amount_data, bins=30, kde=True, stat="density", color='blue')

# Fit a normal distribution to the data and overlay its PDF across the visible x-range
mu, sigma = stats.norm.fit(amount_data)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, sigma)
plt.plot(x, p, 'k', linewidth=2)

# Add labels and title
plt.title("Amount Distribution")
plt.xlabel("Amount")
plt.ylabel("Density")

# Show plot
plt.show()

# Plot a Q-Q plot to compare against a theoretical normal distribution
plt.figure(figsize=(10, 6))
stats.probplot(amount_data, dist="norm", plot=plt)
plt.title("Q-Q plot of Amount")
plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

time_data = df['Time'].values

# Histogram of the Time column itself (not of the Amount values from the previous cell), on a
# density scale so it shares its y-axis with the fitted normal PDF drawn on top of it.
plt.figure(figsize=(10, 6))
sns.histplot(time_data, bins=30, kde=True, stat="density", color='blue')

# Fit a normal distribution to the data and overlay its PDF across the visible x-range
mu, sigma = stats.norm.fit(time_data)
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = stats.norm.pdf(x, mu, sigma)
plt.plot(x, p, 'k', linewidth=2)

# Add labels and title
plt.title("Time Distribution")
plt.xlabel("Time ")
plt.ylabel("Density")

# Show plot
plt.show()

# Plot a Q-Q plot to compare against a theoretical normal distribution
plt.figure(figsize=(8, 6))
stats.probplot(time_data, dist="norm", plot=plt)
plt.title("Q-Q plot of Time")
plt.show()


### Normalization 
*As we can see, neither Time nor Amount follows a Gaussian distribution (bell curve), so it is better to apply normalization to these fields.*

Steps:
1. Get X-axis and Y-axis data
2. Train-Test Split
3. Sampling
4. Feature Scaling

#### i) Get X-Axis and Y-Axis data

In [None]:
# Feature matrix: every column of df except the last one, as a NumPy array.
# NOTE(review): this keeps the Time column as well as the V* and Amount columns — presumably the
# last column is Class; verify against the column order of df.
x= df.iloc[:,:-1].values
x

In [None]:
# Target vector: the last column of df (treated as the Class label), as a NumPy array.
y = df.iloc[:,-1].values
y

In [None]:
# Zero padding brings the number of features up to a power of 2,
# which the 'amplitude encoding' used further down requires.
num_examples, num_features = x.shape
print(num_examples)
print(num_features)

# Qubit count n = ceil(log2(#features)); the padded feature dimension is 2**n.
log2_features = np.log2(num_features)
print("Log 2 = ", log2_features)
print("Ceil = ", np.ceil(log2_features))
n = int(np.ceil(log2_features))
dim = 2**n
print(f'(number of qubits, dimension of the Hilbert space) = {(n, dim)}')

# Append the all-zero filler columns to the right of the original features.
zeros = np.zeros((num_examples, dim-num_features))
print("Zeroes = ", len(zeros))
X = np.hstack((x, zeros))
print("X = ", X)

# From here on num_features refers to the padded width.
num_examples, num_features = X.shape
print("number of examples = ", num_examples)
print("number of features = ", num_features)

#### ii) Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. stratify=y keeps the rare positive class at the same
# proportion in both partitions; with a purely random split of such an imbalanced target the
# share of positive rows in the test set is left to chance.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("X-train", len(x_train))
print("X-test", len(x_test))
print("Y-train", len(y_train))
print("Y-test", len(y_test))

# Display names for the two target classes, passed to the reporting helper later on.
labels = ['Class 0', 'Class 1']

#### iii) Sampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Random undersampler with a fixed seed so the selected rows are reproducible.
rus = RandomUnderSampler(random_state=1)

# Balance the training split, then the test split, with the same sampler object.
x_train_resampled, y_train_resampled = rus.fit_resample(x_train, y_train)
x_test_resampled, y_test_resampled = rus.fit_resample(x_test, y_test)

# Sizes of the resampled arrays: train X / train y / test X / test y.
for resampled in (x_train_resampled, y_train_resampled, x_test_resampled, y_test_resampled):
    print(len(resampled))

In [None]:
from collections import Counter
# Per-class row counts of the resampled training labels; equal counts for Class 0 and Class 1
# mean the training data is balanced.
Counter(y_train_resampled).items()

#### iv) Feature Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the resampled training features only and then reused, unchanged,
# on the resampled test features.
scaler = MinMaxScaler()
x_train_scaled = scaler.fit_transform(x_train_resampled)
x_test_scaled = scaler.transform(x_test_resampled)

# Row counts of the scaled train and test arrays.
print(len(x_train_scaled))
print(len(x_test_scaled))

# 4. Applying Quantum Models

## Circuit Creation

In [None]:
from qiskit.circuit.library import ZZFeatureMap

# num_features is re-read from the scaled training data (the zero-padded width).
num_examples, num_features = x_train_scaled.shape
print(num_features)

# ZZ feature map with feature_dimension set to the number of features.
# NOTE(review): the VQC cell below passes `fm` (RawFeatureVector), not this `feature_map`;
# it appears to be built here for illustration only — confirm.
feature_map = ZZFeatureMap(feature_dimension=num_features, reps=1)

# Text rendering of the decomposed circuit.
feature_map.decompose().draw(output="text", style="clifford")

In [None]:
from qiskit.circuit.library import RealAmplitudes

# RealAmplitudes ansatz sized with one qubit per feature.
# NOTE(review): the VQC cell below passes `pqc`, not this `ansatz` — confirm it is still needed.
ansatz = RealAmplitudes(num_qubits=num_features, reps=3)
ansatz.decompose().draw(output="mpl", style="clifford", fold=20)

In [None]:
from matplotlib import pyplot as plt
from IPython.display import clear_output

# History of objective-function values; callback_graph appends to it on every call.
objective_func_vals = []
# Default figure size for the live training plot (and any later figure).
plt.rcParams["figure.figsize"] = (12, 6)


def callback_graph(weights, obj_func_eval):
    """Record one objective value and redraw the training curve.

    Passed to the VQC as its ``callback``.

    Args:
        weights: Current model weights; accepted for the callback signature but not used.
        obj_func_eval: Objective-function value to append to ``objective_func_vals``.
    """
    objective_func_vals.append(obj_func_eval)

    # Replace the previous cell output so the curve updates in place instead of stacking plots.
    clear_output(wait=True)

    iterations = range(len(objective_func_vals))
    plt.plot(iterations, objective_func_vals)
    plt.title("Objective function value against iteration")
    plt.xlabel("Iteration")
    plt.ylabel("Objective function value")
    plt.show()

In [None]:
from qiskit_ibm_runtime import QiskitRuntimeService
# The API token comes from a project-local helper, so it is not hardcoded in the notebook.
from apitoken import get_api_token
token = get_api_token()

# Authenticated handle to the IBM Quantum runtime service.
# NOTE(review): channel and instance values depend on the installed qiskit-ibm-runtime version
# and on the account — confirm they are still accepted.
service = QiskitRuntimeService(
    channel='ibm_quantum',
    instance='ibm-q/open/main',
    token=token
)



In [None]:
# List the backends available to this account.
service.backends()

In [None]:
# Select a specific backend by name.
# NOTE(review): `backend` is not referenced by the later cells visible in this notebook
# (the VQC is built with a local Sampler) — confirm whether it is still needed.
backend = service.backend("ibm_brisbane")

In [None]:
from qiskit import QuantumCircuit
from qiskit_aer import Aer

In [None]:
from qiskit_machine_learning.circuit.library import RawFeatureVector

print("Num features :", num_features)

# Feature map used by the VQC below, sized to the number of features.
# NOTE(review): presumably this is the amplitude encoding that the zero padding was done for — confirm.
fm = RawFeatureVector(feature_dimension=num_features) 
fm.draw() 

In [None]:
from qiskit.circuit.library import RealAmplitudes


'''
For angle encoding, num_qubits = num_features
For amplitude encoding, num_qubits = n = np.log2(num_features)
'''

# Ansatz used by the VQC below; n is the qubit count computed in the zero-padding cell.
pqc = RealAmplitudes(num_qubits=n, reps=3)
pqc.decompose().draw()

## Model 1. VQC (Variational Quantum Classifier)

In [None]:
import time
from qiskit_machine_learning.algorithms.classifiers import VQC
import gc; gc.collect()

from qiskit_algorithms.optimizers import COBYLA
from qiskit.primitives import Sampler

from qiskit import QuantumCircuit
from qiskit_aer import Aer


# COBYLA optimizer capped at 150 iterations, and a Sampler primitive created with defaults.
optimizer = COBYLA(maxiter=150)
sampler = Sampler()

# Classifier wiring: RawFeatureVector feature map + RealAmplitudes ansatz, with callback_graph
# plotting the objective value during training.
vqc = VQC(
    sampler=sampler,
    feature_map=fm, # alternative: feature_map (ZZFeatureMap)
    ansatz=pqc, # alternative: ansatz
    optimizer=optimizer,
    callback=callback_graph,
   
)

# clear objective value history so the training plot starts from an empty curve
objective_func_vals = []



In [None]:
# Sanity check: features and labels handed to fit() should have the same number of rows.
print(len(x_train_scaled))
print(len(y_train_resampled))

In [None]:
%%time
# Train on the scaled, undersampled training set; the %%time cell magic reports the duration.
vqc.fit(x_train_scaled, y_train_resampled)

In [None]:
import gc
gc.collect()

# Show only the shapes; printing the arrays themselves floods the output.
print(x_train.shape)
print(x_test.shape)

# The classifier was fitted on MinMax-scaled features, so inputs for inference must go through the
# same fitted scaler; raw features would be on a completely different scale than the training data.
# The full training split is used so the predictions line up row-for-row with y_train.
y_train_pred = vqc.predict(scaler.transform(x_train))

In [None]:
# Apply the scaler fitted at training time before predicting: the model never saw raw features.
# The full test split is used so the predictions line up row-for-row with y_test.
y_test_pred = vqc.predict(scaler.transform(x_test))

## Model 2. Neural Network Classifier 

# Analyze and Generate Reports

In [None]:
# NOTE(review): pickleshare appears to be needed only for the %store magic, which is commented
# out in the next cell — confirm whether this install is still required.
%pip install pickleshare

In [None]:
import pickle

# Metrics collected so far are read from a shared pickle file.
# NOTE(review): presumably written by the other model notebooks of this project — confirm.
# SECURITY: pickle.load can execute arbitrary code; only load a file this project produced itself.
file_name = "sharedfile"
comparison_results = {}
# The context manager closes the file handle even if unpickling fails.
with open(file_name, "rb") as shared_file:
    comparison_results.update(pickle.load(shared_file))
print(comparison_results)

### Helper Report Generation Functions

# VQC Reports

In [None]:
# Training-set report: per-class precision/recall/F1 plus the confusion matrix.
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred), "= confusion matrix for train set \n")

# Test-set report, followed by ROC AUC.
# NOTE(review): roc_auc_score receives the output of vqc.predict; if those are hard class labels
# rather than scores, the AUC is computed from a single operating point — confirm this is intended.
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred), "= confusion matrix for test set \n ")
print(round(roc_auc_score(y_test, y_test_pred),2), "= roc_auc_score for test set \n")

In [None]:
# Store the VQC metrics under the "VQC" key so the conclusion cell can compare models.
# NOTE(review): show_result is not defined in any cell visible in this notebook (the
# "Helper Report Generation Functions" section is empty) — it must be defined or imported first.
comparison_results["VQC"] =show_result(y_test, y_test_pred, labels)

# Conclusion

In [None]:
# Convert the dictionary to a Pandas DataFrame
df = pd.DataFrame(comparison_results).transpose()  # Transpose for better table view

# Print the table
print(df)

# Create a bar chart for accuracy
plt.figure(figsize=(18, 5))
plt.bar(df.index, df["accuracy"])
plt.xlabel("Model Name")
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.show()