# Logistic Regression and SVM for Credit Card Fraud Detection

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

In [28]:
data = pd.read_csv("creditcardfraud.csv")

In [29]:
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,82450,1.314539,0.590643,-0.666593,0.716564,0.301978,-1.125467,0.388881,-0.288390,-0.132137,...,-0.170307,-0.429655,-0.141341,-0.200195,0.639491,0.399476,-0.034321,0.031692,0.76,0
1,50554,-0.798672,1.185093,0.904547,0.694584,0.219041,-0.319295,0.495236,0.139269,-0.760214,...,0.202287,0.578699,-0.092245,0.013723,-0.246466,-0.380057,-0.396030,-0.112901,4.18,0
2,55125,-0.391128,-0.245540,1.122074,-1.308725,-0.639891,0.008678,-0.701304,-0.027315,-2.628854,...,-0.133485,0.117403,-0.191748,-0.488642,-0.309774,0.008100,0.163716,0.239582,15.00,0
3,116572,-0.060302,1.065093,-0.987421,-0.029567,0.176376,-1.348539,0.775644,0.134843,-0.149734,...,0.355576,0.907570,-0.018454,-0.126269,-0.339923,-0.150285,-0.023634,0.042330,57.00,0
4,90434,1.848433,0.373364,0.269272,3.866438,0.088062,0.970447,-0.721945,0.235983,0.683491,...,0.103563,0.620954,0.197077,0.692392,-0.206530,-0.021328,-0.019823,-0.042682,0.00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,160243,-2.783865,1.596824,-2.084844,2.512986,-1.446749,-0.828496,-0.732262,-0.203329,-0.347046,...,0.203563,0.293268,0.199568,0.146868,0.163602,-0.624085,-1.333100,0.428634,156.00,1
596,110547,-1.532810,2.232752,-5.923100,3.386708,-0.153443,-1.419748,-3.878576,1.444656,-1.465542,...,0.632505,-0.070838,-0.490291,-0.359983,0.050678,1.095671,0.471741,-0.106667,0.76,1
597,70071,-0.440095,1.137239,-3.227080,3.242293,-2.033998,-1.618415,-3.028013,0.764555,-1.801937,...,0.764187,-0.275578,-0.343572,0.233085,0.606434,-0.315433,0.768291,0.459623,227.30,1
598,93879,-13.086519,7.352148,-18.256576,10.648505,-11.731476,-3.659167,-14.873658,8.810473,-5.418204,...,2.761157,-0.266162,-0.412861,0.519952,-0.743909,-0.167808,-2.498300,-0.711066,30.31,1


In [30]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### processing the data

In [31]:
# Drop the 'Time' column
data.drop('Time', axis=1,inplace = True)

In [32]:
data.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class'],
      dtype='object')

In [33]:
# Scale the 'Amount' column using a standard scaler
scaler = StandardScaler()
data['Amount'] = scaler.fit_transform(data[['Amount']])
data['Amount']

0     -0.443051
1     -0.427821
2     -0.379637
3     -0.192602
4     -0.446435
         ...   
595    0.248265
596   -0.443051
597    0.565779
598   -0.311458
599   -0.001159
Name: Amount, Length: 600, dtype: float64

### Spliting the data

In [34]:
X = data.drop('Class', axis=1)
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)


In [35]:
# 4. Train a logistic regression model on the training set
logistic_model = LogisticRegression(solver = 'liblinear', random_state=42)
logistic_model.fit(X_train, y_train)


In [38]:
# 5. Evaluate the logistic regression model
y_pred_logistic = logistic_model.predict(X_test)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)
accuracy_score(y_pred_logistic,y_test)

0.9333333333333333

In [39]:
#checking accuracy with sag
logistic_model = LogisticRegression(solver = 'sag', random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)
accuracy_score(y_pred_logistic,y_test)



0.9416666666666667

In [40]:
#checking accuracy with saga
logistic_model = LogisticRegression(solver = 'saga', random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)
accuracy_score(y_pred_logistic,y_test)



0.9333333333333333

In [41]:
#checking accuracy with lbfgs
logistic_model = LogisticRegression(solver = 'lbfgs', random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)
accuracy_score(y_pred_logistic,y_test)

0.9416666666666667

In [43]:
#checking accuracy with newton-cg, so we are choosing newton-cg as the highest
logistic_model = LogisticRegression(solver = 'newton-cg', random_state=42)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
conf_matrix_logistic = confusion_matrix(y_test, y_pred_logistic)
class_report_logistic = classification_report(y_test, y_pred_logistic)
accuracy_score(y_pred_logistic,y_test)

0.9416666666666667

In [47]:
y_pred_logistic

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

In [48]:
conf_matrix_logistic #The confusion matrix is a table that is often used to describe the performance of a classification model on a set of test data for which the true values are known. In the context of binary classification (two classes, typically denoted as positive and negative), the confusion matrix has four entries

array([[59,  3],
       [ 4, 54]], dtype=int64)

In [50]:
class_report_logistic #The classification_report function provides a more comprehensive evaluation of the model's performance by calculating several metrics for each class. In binary classification, there are two classes: positive (usually denoted as 1) and negative (usually denoted as 0).

'              precision    recall  f1-score   support\n\n           0       0.94      0.95      0.94        62\n           1       0.95      0.93      0.94        58\n\n    accuracy                           0.94       120\n   macro avg       0.94      0.94      0.94       120\nweighted avg       0.94      0.94      0.94       120\n'

## Svm

In [52]:
# 6. Train an SVM model on the training set using kernel as linear
svm_model = SVC(kernel = 'linear',random_state=42)
svm_model.fit(X_train, y_train)

In [54]:
# 7. Evaluate the SVM model
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_score(y_pred_svm,y_test)


0.925

In [55]:
#Train an SVM model on the training set using kernel as poly
svm_model = SVC(kernel = 'poly',random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_score(y_pred_svm,y_test)


0.8666666666666667

In [56]:
#Train an SVM model on the training set using kernel as rbf
svm_model = SVC(kernel = 'rbf',random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_score(y_pred_svm,y_test)


0.95

In [57]:
#Train an SVM model on the training set using kernel as sigmoid
svm_model = SVC(kernel = 'sigmoid',random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_score(y_pred_svm,y_test)


0.9416666666666667

In [59]:
#Train an SVM model on the training set using kernel as rbf we will using rbf as it is showing highest accuracy
svm_model = SVC(kernel = 'rbf',random_state=42)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
class_report_svm = classification_report(y_test, y_pred_svm)
accuracy_score(y_pred_svm,y_test)

0.95

In [60]:
class_report_svm

'              precision    recall  f1-score   support\n\n           0       0.91      1.00      0.95        62\n           1       1.00      0.90      0.95        58\n\n    accuracy                           0.95       120\n   macro avg       0.96      0.95      0.95       120\nweighted avg       0.95      0.95      0.95       120\n'

In [61]:
conf_matrix_svm

array([[62,  0],
       [ 6, 52]], dtype=int64)

In [62]:
y_pred_svm

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

## so it is evident that svm is more accurate than logistic regression with higher accuracy score of 95%

In [19]:

# 8. Tune hyperparameters using grid search cross-validation
# Logistic Regression hyperparameter tuning
param_grid_logistic = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_logistic = GridSearchCV(LogisticRegression(random_state=42), param_grid_logistic, cv=5)
grid_logistic.fit(X_train, y_train)
best_params_logistic = grid_logistic.best_params_


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [20]:
best_params_logistic

{'C': 0.1}

In [21]:
# SVM hyperparameter tuning
param_grid_svm = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid_svm = GridSearchCV(SVC(random_state=42), param_grid_svm, cv=5)
grid_svm.fit(X_train, y_train)
best_params_svm = grid_svm.best_params_

In [22]:
best_params_svm

{'C': 0.1, 'kernel': 'linear'}

In [23]:
# 9. Train models with optimal hyperparameters
optimal_logistic_model = LogisticRegression(**best_params_logistic, random_state=42)
optimal_logistic_model.fit(X_train, y_train)

optimal_svm_model = SVC(**best_params_svm, random_state=42)
optimal_svm_model.fit(X_train, y_train)


In [70]:
# 10. Evaluate models with optimal hyperparameters
y_pred_optimal_logistic = optimal_logistic_model.predict(X_test)
conf_matrix_optimal_logistic = confusion_matrix(y_test, y_pred_optimal_logistic)
class_report_optimal_logistic = classification_report(y_test, y_pred_optimal_logistic)

y_pred_optimal_svm = optimal_svm_model.predict(X_test)
conf_matrix_optimal_svm = confusion_matrix(y_test, y_pred_optimal_svm)
class_report_optimal_svm = classification_report(y_test, y_pred_optimal_svm)

In [63]:
y_pred_optimal_logistic

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

In [64]:
conf_matrix_optimal_logistic

array([[61,  1],
       [ 4, 54]], dtype=int64)

In [65]:
class_report_optimal_logistic

'              precision    recall  f1-score   support\n\n           0       0.94      0.98      0.96        62\n           1       0.98      0.93      0.96        58\n\n    accuracy                           0.96       120\n   macro avg       0.96      0.96      0.96       120\nweighted avg       0.96      0.96      0.96       120\n'

In [69]:
y_pred_optimal_svm

array([0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 1], dtype=int64)

In [71]:
conf_matrix_optimal_svm

array([[62,  0],
       [ 5, 53]], dtype=int64)

In [72]:
class_report_optimal_svm

'              precision    recall  f1-score   support\n\n           0       0.93      1.00      0.96        62\n           1       1.00      0.91      0.95        58\n\n    accuracy                           0.96       120\n   macro avg       0.96      0.96      0.96       120\nweighted avg       0.96      0.96      0.96       120\n'

## It is evident that after tuning the accuracy of svm and logistic regg has increased to 96% for both. svm has better prediction than logistic reggression.  