### Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Data Preprocessing

In [None]:
# Read the credit card transactions CSV into a pandas DataFrame
csv_path = 'data/creditcard.csv'
cc_dataset = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the dataset
cc_dataset.head(n=5)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
dataset_shape = cc_dataset.shape
print(dataset_shape)

# Count the missing values in each column
cc_dataset.isna().sum()

In [None]:
# Bar chart of how many transactions fall into each Class value
sns.countplot(data=cc_dataset, x='Class')

In [None]:
# Count the number of transactions per Class value
class_counts = cc_dataset['Class'].value_counts()
class_counts

 0 -> Legit transactions   
 1 -> Fraudulent transactions

The dataset is highly imbalanced.

In [None]:
# Split the transactions by label: Class 0 rows are legit, Class 1 rows are fraud
is_legit = cc_dataset['Class'] == 0
is_fraud = cc_dataset['Class'] == 1
legit = cc_dataset[is_legit]
fraud = cc_dataset[is_fraud]

In [None]:
# Show (rows, columns) for the legit subset, then the fraud subset
for subset in (legit, fraud):
    print(subset.shape)

In [None]:
# Summary statistics of the Amount column for legit transactions
legit['Amount'].describe()


In [None]:
# Summary statistics of the Amount column for fraudulent transactions
fraud['Amount'].describe()

In [None]:
# Compare the two classes by taking the mean of every column within each Class group
class_groups = cc_dataset.groupby(by='Class')
class_groups.mean()

To deal with the imbalance in the dataset, we can use a method called under-sampling: building a sample dataset that contains a similar number of legit and fraudulent transactions. 
   
Since we have 492 fraudulent transactions, we will take a sample of 492 legit transactions and join it with the fraudulent transactions to form a new dataset that we will train our model on.

In [None]:
# Under-sample the legit transactions: draw exactly as many rows as there are
# fraudulent ones, so the two classes end up the same size.
# random_state pins the draw so a fresh "Restart & Run All" selects the same rows
# (and therefore reproduces the same downstream metrics).
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [None]:
# Stack the sampled legit rows on top of the fraud rows (row-wise concatenation)
frames_to_join = [legit_sample, fraud]
new_dataset = pd.concat(frames_to_join, axis=0)

In [None]:
# Preview the first five rows of the combined dataset
# (these come from legit_sample, which was placed first in the concat)
new_dataset.head(n=5)

In [None]:
# Check how many rows of each Class the combined dataset holds
balanced_counts = new_dataset['Class'].value_counts()
balanced_counts

In [None]:
# Separate the target (the Class column) from the features (every other column)
y = new_dataset['Class']
x = new_dataset.drop(columns=['Class'])

#### Model Training and Evaluation

In [None]:
# Split into training and test data: 20% of the rows are held out for testing,
# stratified on y so both splits keep the same class proportions.
# train_test_split returns each input as (train, test), i.e.
# x_train, x_test, y_train, y_test -- unpacking in any other order silently
# swaps the splits (training on the 20% and evaluating on the 80%).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)

In [None]:
# Fit a logistic regression classifier, with default settings, on the training split
# NOTE(review): the features are not scaled anywhere before this point; if fitting
# emits a ConvergenceWarning, consider scaling them or raising max_iter -- confirm
# against the cell output.
model = LogisticRegression()
model.fit(X=x_train, y=y_train)

In [None]:
# Accuracy score on the training data.
# accuracy_score expects the true labels first and the predictions second;
# accuracy is symmetric, but keeping the documented order avoids a silent error
# if this metric is later replaced by an asymmetric one (precision, recall, ...).
x_train_prediction = model.predict(x_train)
train_data_accuracy = accuracy_score(y_train, x_train_prediction)
print('The accuracy score is: ', train_data_accuracy)

In [None]:
# Accuracy score on the held-out test data.
# accuracy_score expects the true labels first and the predictions second;
# accuracy is symmetric, but keeping the documented order avoids a silent error
# if this metric is later replaced by an asymmetric one (precision, recall, ...).
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
print('The accuracy score is: ', test_data_accuracy)