In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sns

In [None]:
# Read the raw transactions file into a DataFrame and preview the first rows.
data_df = pd.read_csv('../creditcard.csv')
data_df.head(n=5)

`.isnull()`: This is a method applied to the DataFrame that checks each element in the DataFrame and returns a DataFrame of the same shape where each element is either `True` if it's a missing value `null` or `False` if it's not.<br>
`.sum()`: This is applied to the resulting DataFrame from the previous step. It calculates the sum of True values along each column. Since `True` is treated as `1` and `False` as `0` when summing, this effectively counts the number of True values in each column.<br>
| V1 | V2 | V3 |
|----------|----------|----------|
| 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 1 | 1 |
| 0 | 1 | 1 |
| 1 | 1 | 0 |
|sum = 1|sum = 3|sum = 2|

In [None]:
# Number of missing values in each column (`isna` is the same check as `isnull`).
data_df.isna().sum(axis=0)

`.duplicated()`: This is a method applied to the DataFrame that checks each row and returns a Boolean Series where each element is `True` if the corresponding row is a `duplicate of a previous row`, and `False` if it's `not a duplicate`.<br>
`.sum()`: Works the same as before — it counts the `True` values, i.e. the number of duplicated rows.<br>
| V1 | V2 | V3 | Duplicate|
|----------|----------|----------|----------|
| 1 | 4 | 3 | 0 |
| 1 | 3 | 5 | 0 |
| 2 | 3 | 4 | 0 |
| 2 | 3 | 4 | 1 |
| 5 | 8 | 9 | 0 |
||||sum=1|

In [None]:
# Count rows that repeat an earlier row; the first occurrence is not flagged.
data_df.duplicated(keep='first').sum()

Drop the duplicate rows permanently.

In [None]:
# Keep the first occurrence of every row and discard exact repeats.
# Rebinding the name gives the same frame as an in-place drop.
data_df = data_df.drop_duplicates(keep='first')

The `.drop` method in `pandas` removes the specified labels (rows or columns) from a DataFrame; in this case the `Class` column is removed and all the other columns are kept.<br>
`axis=1`: Means that you are dropping a column from the DataFrame `data_df`.<br>
`Y` represents the target (dependent) variable.<br>
`X` represents the features (independent variables), which are used to predict or explain the target variable.

In [None]:
# Target: the `Class` column. Features: every remaining column.
Y = data_df['Class']
X = data_df.drop(columns='Class')

In [None]:
# Hold out 30% of the rows for testing, with a fixed seed for reproducibility.
# `stratify=Y` keeps the proportion of each class the same in both splits;
# with a heavily imbalanced target, a purely random split can leave the test
# set with a noticeably different share of the rare class.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
    stratify=Y,
)

In [None]:
# Class balance check: how many rows fall into each value of `Class`?
data_df.loc[:, 'Class'].value_counts()

- Conclusion.<br>
From the `value_counts` of the `Class` column above, there are about 283253 legitimate transactions (class `0`) and only 473 fraudulent ones (class `1`), which means the data is highly imbalanced.

In [None]:
# !conda install -c conda-forge imbalanced-learn --yes

In [None]:
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Per-class row counts in the training labels, before any resampling.
y_train.value_counts(sort=True)

`RandomOverSampler`: This is a specific resampling technique used to address class imbalance. It works by randomly replicating or duplicating instances of the `minority` class (the class with fewer examples) until both classes are balanced.<br>

`random_state=1`: This parameter sets the random seed of the `RandomOverSampler`. By fixing `random_state` to a specific value (in this case, 1), we ensure that the random oversampling produces the same result every time the code is run, which is important for reproducible research.

In [None]:
# Build the random over-sampler; the fixed seed makes the duplicated rows
# reproducible from run to run. `sampling_strategy="auto"` is the default.
ros = RandomOverSampler(sampling_strategy="auto", random_state=1)

`ros.fit_resample(x_train, y_train)`: This line of code applies the Random Over-sampling technique to training data.<br>

`x_train` and `y_train` are passed as arguments to the `fit_resample` method of the ros object.<br>

`fit_resample`: method examines the class distribution in `y_train` (the target variable) and oversamples the minority class (the class with fewer examples) to balance the class distribution.<br>

It returns two sets of data: `x_train_ros` and `y_train_ros`. These sets are now balanced, meaning they have an equal number of examples for each class.<br>

After running this code cell, `x_train_ros` will contain the oversampled `feature data`, and `y_train_ros` will contain the corresponding oversampled `labels`. These balanced datasets can then be used to train machine learning models that won't be biased towards the majority class due to class imbalance.

In [None]:
# Oversample the training split so that both classes have the same row count.
(x_train_ros, y_train_ros) = ros.fit_resample(X=x_train, y=y_train)

In [None]:
# Per-class row counts in the resampled training labels.
y_train_ros.value_counts(sort=True)

In [None]:
# Per-class row counts in the held-out test labels.
y_test.value_counts(sort=True)

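Prepare the testing data (`x_test_ros`, `y_test_ros`) for evaluation. Note: a test set should keep the real class distribution — oversampling it changes the class balance, so metrics computed on it no longer reflect performance on real, imbalanced data.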

In [None]:
# Evaluate on the untouched hold-out split. Oversampling is a training-time
# remedy only: duplicating minority rows in the test set would change its class
# balance, so scores computed on it would not describe performance on real,
# imbalanced data. The `_ros` names are kept because the later cells read them.
x_test_ros, y_test_ros = x_test.copy(), y_test.copy()

In [None]:
# Per-class row counts of the test labels stored in `y_test_ros`.
y_test_ros.value_counts(sort=True)

`StandardScaler` is used for standardizing features by removing the mean and scaling to unit variance.<br>

`ss = StandardScaler()`: Create an instance of the `StandardScaler` class.<br>

`.fit_transform()`: is a method of the `StandardScaler` object that first calculates the mean and standard deviation of each `feature` in `x_train_ros` and then transforms the data by subtracting the mean and dividing by the standard deviation for each feature. This process standardizes the features.<br>

`.transform()`: is used to apply the same mean and standard deviation calculated during the training data standardization to the test data. This ensures that the test data is scaled in the same way as the training data.

In [None]:
# NOTE(review): feature scaling is currently disabled (this whole cell is
# commented out). As written, the scaled arrays would be stored in `x_train` /
# `x_test`, while the model cells fit and predict on `x_train_ros` /
# `x_test_ros`, so re-enabling these lines unchanged would not affect the
# model — TODO confirm the intended variable names before re-enabling.
# from sklearn.preprocessing import StandardScaler
# ss=StandardScaler()
# x_train=ss.fit_transform(x_train_ros)
# x_test=ss.transform(x_test_ros)

In [None]:
# (rows, columns) of the de-duplicated frame.
(len(data_df.index), len(data_df.columns))

In [None]:
# Fit a logistic-regression classifier on the balanced training data.
# The features are not standardized (the scaling cell is commented out), and on
# unscaled inputs the solver can hit the default limit of 100 iterations before
# converging; a higher `max_iter` gives it room to finish.
LogisticReg = LogisticRegression(max_iter=1000)
LogisticReg.fit(x_train_ros, y_train_ros)

In [None]:
# Predicted class label for every row of the test features.
y_pred = LogisticReg.predict(X=x_test_ros)
y_pred

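$$\text{accuracy} = \frac{TP + TN}{TP + TN + FP + FN}$$

where $TP$/$TN$ are the true positives/negatives and $FP$/$FN$ are the false positives/negatives.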

In [None]:
# Accuracy must be measured on data the model was not fitted on; scoring the
# training set reports how well the model memorised it, not how well it
# generalises. `score` returns the mean accuracy on the given data and labels.
accuracy = LogisticReg.score(x_test_ros, y_test_ros)
accuracy

In [None]:
# Confusion matrix of the test labels against the predictions
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_true=y_test_ros, y_pred=y_pred)
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# `fmt='d'` prints the cell counts as plain integers; the default annotation
# format ('.2g') would show large counts in scientific notation (e.g. 8.5e+04).
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    xticklabels=['Negative', 'Positive'],
    yticklabels=['Negative', 'Positive'],
    ax=ax,
)
ax.set_xlabel('Predicted value')
ax.set_ylabel('Actual value')
ax.set_title('Confusion matrix');  # trailing ';' hides the Text repr in the cell output

In [None]:
# Score unseen transactions read from a separate CSV file.
# NOTE(review): presumably this file has the same feature columns, in the same
# order, as the training features — verify before relying on the output.
new_data = pd.read_csv('../new.csv')
new_pred = LogisticReg.predict(X=new_data)
new_pred