In [None]:
import pandas as pd
# Read the credit-card fraud CSV from the Kaggle input directory.
DATA_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows as the cell output.
df.head()

In [None]:
# Print a concise summary of the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Remove the 'Time' column before modelling. Re-binding `df` (instead of
# dropping in place) together with errors='ignore' keeps this cell idempotent:
# re-running it no longer raises KeyError once the column is already gone.
df = df.drop(columns='Time', errors='ignore')

In [None]:
import matplotlib.pyplot as plt

# Compute the per-class counts once and reuse them for both the text summary
# and the bar chart.
separator = '-' * 30
class_counts = df['Class'].value_counts()

print(separator)
print('Total rows in data:', len(df))
print(separator)
print(class_counts)
print(separator)
# Horizontal bar chart of the class counts.
class_counts.plot(kind='barh')

### From the class distribution above we can conclude that this is a case of extremely imbalanced data.
### Imbalanced data is a problem because, in a two-class problem with a class distribution of 90:10, the performance of the classifier on majority-class examples counts nine times as much as its performance on minority-class examples (in this case the skew is even more extreme: at least 99:1).


### Using PCA for dimensionality reduction

In [None]:
from sklearn.decomposition import PCA

# Project every feature column (everything except the 'Class' target) onto
# its first two principal components.
# random_state pins the result: PCA's default 'auto' solver may pick the
# randomized SVD, which would otherwise give run-to-run differences.
# NOTE(review): the features are not standardised before PCA, and PCA is
# fitted on the full data set before the train/test split; consider scaling
# and fitting on the training portion only.
pca = PCA(n_components=2, random_state=100)
principalComponents = pca.fit_transform(df.drop('Class', axis=1))

# Reuse df's index so the concat below aligns row-for-row with 'Class' even
# if `df` does not carry a default 0..n-1 index.
principalDf = pd.DataFrame(data=principalComponents,
                           columns=['principal component 1', 'principal component 2'],
                           index=df.index)
final_df = pd.concat([df['Class'], principalDf], axis=1)
final_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. `stratify` keeps the class ratio the
# same in both splits; with a target this imbalanced, a purely random split
# can leave the test set with too few (or too many) minority samples.
X_train, X_test, y_train, y_test = train_test_split(final_df.drop('Class', axis=1),
                                                    final_df['Class'],
                                                    test_size=0.2,
                                                    stratify=final_df['Class'],
                                                    random_state=100)

## Let's use a combination of random undersampling and oversampling to deal with the imbalanced data.
## I'll be using a linear support vector machine (SGDClassifier with its default hinge loss).

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

print('Distribution of y_train set Before over and under sampling: ', Counter(y_train))

# Both samplers are stochastic; seed them so the resampled training set is
# identical on every run.
# sampling_strategy is the target minority/majority ratio after resampling.
under = RandomUnderSampler(sampling_strategy=0.002, random_state=100)
over = SMOTE(sampling_strategy=0.01, random_state=100)

# Step 1: randomly drop majority-class rows. The *_smote names are kept for
# compatibility, but at this point the data has only been undersampled.
X_train_smote, y_train_smote = under.fit_resample(X_train, y_train)
# Step 2: synthesise minority-class rows with SMOTE on the undersampled data.
X_train_both, y_train_both = over.fit_resample(X_train_smote, y_train_smote)

# This is the distribution AFTER resampling (the original label said "Before").
print('Distribution of y_train set After over and under sampling: ', Counter(y_train_both))

In [None]:
from sklearn.linear_model import SGDClassifier

# SGDClassifier shuffles the training data on each epoch; fixing random_state
# makes the fitted model, and therefore the reported metrics, reproducible.
model = SGDClassifier(random_state=100)
model.fit(X_train_both, y_train_both)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_curve

prediction = model.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. Passing them the
# other way round swaps precision and recall in the report, which is exactly
# the distinction that matters on imbalanced data.
print('classification report:', classification_report(y_test, prediction))
print('-'*40)
# Accuracy is symmetric in its arguments, but the same order is used for consistency.
print('accuracy_score : ',accuracy_score(y_test, prediction))