In [86]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Enumerate every file below the Kaggle input directory, one full path per line
input_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [73]:
# Read the credit-card transactions CSV and preview the first five rows
# (a comma is already the default delimiter for pd.read_csv)
df = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")
df.head()

In [3]:
df.shape

In [4]:
df.info()

In [5]:
df.describe()

In [8]:
#check if there are null values
df.isna().sum()

## Class is the variable that indicates if there's fraud or not

In [9]:
#check class values
df.Class.value_counts()

**0= Normal**

**1=Fraud**

In [23]:
len(df.Class)

In [11]:
# Bar chart of how many transactions fall into each class
class_counts = df["Class"].value_counts()
class_counts.plot.bar(color=["salmon", "lightblue"]);

**We can see that the data is highly imbalanced, just as the dataset description states.**

In [21]:
# Share of each class among all transactions, printed as a percentage
percentage = df["Class"].value_counts() / len(df)
print(100 * percentage)

In [24]:
#checking class distribution
df.Class.plot.hist();

In [56]:
# Split the transactions into the Class == 0 (normal) and Class == 1 (fraud) subsets
class_labels = df["Class"]
normal = df.loc[class_labels == 0]
fraud = df.loc[class_labels == 1]

In [57]:
# (rows, columns) of the normal subset and of the fraud subset
normal.shape, fraud.shape

In [58]:
# Summary statistics of the transaction amount for normal transactions
normal.Amount.describe()

In [59]:
# Summary statistics of the transaction amount for fraudulent transactions
fraud.Amount.describe()

In [66]:
#visualizing the distribution of amounts with respect to the Class
f, (ax1, ax2) = plt.subplots(2, 1,
                             figsize=(10, 8),
                             sharex=True)
f.suptitle("Amount per transaction by class")
bins = 50

# pyplot helpers such as plt.ylabel / plt.yscale act only on the current
# (last-created) axes, which left the top panel on a linear, unlabelled y-axis
# while the bottom one was log-scaled. Configure both panels explicitly so the
# two histograms can be compared on the same kind of scale.
panels = (
    (ax1, fraud.Amount, 'Fraud'),
    (ax2, normal.Amount, 'normal'),
)
for ax, amounts, title in panels:
    ax.hist(amounts, bins=bins)
    ax.set_title(title)
    ax.set_ylabel("No. of Transaction")
    ax.set_yscale('log')

# The x-axis is shared, so the label and limits only need setting on the bottom panel
ax2.set_xlabel("Amount ($)")
ax2.set_xlim(0, 20000)
plt.show()

In [76]:
#comparing amounts with time and Class
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later releases removed the old names, so pick whichever this install provides.
whitegrid_style = "seaborn-v0_8-whitegrid"
if whitegrid_style not in plt.style.available:
    whitegrid_style = "seaborn-whitegrid"
plt.style.use(whitegrid_style)

# Only the first 1000 rows are plotted to keep the scatter readable
fig, ax = plt.subplots(figsize=(10, 8))
scatter = ax.scatter(x=df.Time[:1000],
                     y=df.Amount[:1000],
                     c=df.Class[:1000],
                     cmap="winter");

#customize the plot
ax.set(title="Comparison of Amount and Time to Class",
       xlabel="Time of payment",
       ylabel="Amount");

#add legend (one entry per distinct Class value used for colouring)
ax.legend(*scatter.legend_elements(), title="Class");



**This shows how imbalanced the dataset is: only one or two fraud cases appear in the first 1000 rows.**

## Finding patterns with Correlation Matrix

A correlation matrix shows how strongly each pair of variables is related to one another.

In [79]:
# Pairwise correlation between all columns of df, shown as a table
df.corr()

In [83]:
#lets visualize it with seaborns heat map
corr_matrix = df.corr()

# Draw on an explicit axes, and use seaborn's documented `linewidths` parameter:
# `linewidth` only worked by being forwarded through **kwargs next to seaborn's
# own `linewidths=0` default.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix,
            annot=True,
            linewidths=0.5,
            cmap="YlGnBu",
            ax=ax);

## Modelling

In [84]:
#copy data first
data=df.copy()

In [85]:
data.head()

### import various models for classification



In [89]:
#models from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

#model evaluations
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import plot_roc_curve

In [90]:
#split the data
X=df.drop("Class", axis=1)
y=df["Class"]

In [91]:
X

In [92]:
y


In [93]:
#Split the data into train and test
np.random.seed(42)

X_train,X_test, y_train,y_test=train_test_split(X,y, test_size=0.2)

In [95]:
# Sanity check: features and labels must have matching lengths within each partition
len(X_train), len(y_train),len(X_test), len(y_test)

In [96]:
# Candidate classifiers with default settings, keyed by the label used when reporting scores
model_classes = (
    ("Random Forest", RandomForestClassifier),
    ("Logistic Regression", LogisticRegression),
    ("KNN", KNeighborsClassifier),
)
models = {label: estimator_cls() for label, estimator_cls in model_classes}

#create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """
    Fit each model on the training data and score it on the test data.

    models: dict mapping a display name to an estimator exposing
        ``fit(X, y)`` and ``score(X, y)``; each estimator is fitted in place.
    X_train, y_train: data passed to ``fit``.
    X_test, y_test: data passed to ``score``.

    Returns a dict mapping each name to whatever the estimator's ``score``
    returned, in the same order as ``models``.
    """
    # Reseed numpy's global RNG so repeated calls start from the same state
    np.random.seed(42)

    def _evaluate(estimator):
        # Fit first, then score, one estimator at a time
        estimator.fit(X_train, y_train)
        return estimator.score(X_test, y_test)

    return {label: _evaluate(estimator) for label, estimator in models.items()}

In [97]:
# Fit every candidate model and collect its score on the test partition
model_score = fit_and_score(models, X_train, X_test, y_train, y_test)

model_score

In [98]:
# One bar per model, showing the score collected above
model_compare = pd.DataFrame(model_score, index=["accuracy"])
model_compare.transpose().plot(kind="bar");

**Random Forest scores highest. However, the default metric for classification is accuracy, which is misleading on data this imbalanced, so we also need to look at the ROC curve together with precision and recall.**

In [126]:


# Fit a depth-limited random forest (fixed seed, all CPU cores) on the
# training partition, then predict labels for the test partition
model = RandomForestClassifier(max_depth=14, random_state=13, n_jobs=-1)
model.fit(X_train, y_train)
y_preds = model.predict(X_test)

In [128]:
y_preds

In [129]:
y_test

In [131]:
# Mean accuracy of the fitted forest on the test partition
model.score(X_test,y_test)

In [130]:
# Per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_test,y_preds))

In [125]:
# Hyperparameters of a freshly constructed (default) random forest, for reference
RandomForestClassifier().get_params()

In [132]:
#plot roc curve and calculate metric
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement and reports the AUC in
# the legend. Imported here so this cell works on its own.
from sklearn.metrics import RocCurveDisplay

RocCurveDisplay.from_estimator(model, X_test, y_test);

**Area under the Curve is 98%**

# Conclusions
**We were able to achieve an area under the Receiver Operating Characteristic (ROC) curve of 98%.**