### 1. Importing Library

In [5]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
#%matplotlib.inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### 2. Loading the dataset

In [6]:
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


1. Lets understand the data now so the data has features such as time (secs),amount (USD), class and V1-V28 are transactions.
2. The time columns give us the elapsed time of every transaction from the first transaction.
3. V1-V28 features of every transaction as the data is of credit card the feature has been converted to numerical values through principal component analysis.
4. Class column gives us the details about the transaction is fraud or legit as '0' represent the transaction is legit and '1' represent the transaction is fraud.



### 3. Understand the Data

In [7]:
df.info()  # we have no missing value in our dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [8]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [9]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,3.918649e-15,5.682686e-16,-8.761736e-15,2.811118e-15,-1.552103e-15,2.04013e-15,-1.698953e-15,-1.893285e-16,-3.14764e-15,...,1.47312e-16,8.042109e-16,5.282512e-16,4.456271e-15,1.426896e-15,1.70164e-15,-3.662252e-16,-1.217809e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [10]:
df.shape

(284807, 31)

In [11]:
round(100* (df.isnull().sum()/len(df)),2).sort_values(ascending=False)

Time      0.0
V16       0.0
Amount    0.0
V28       0.0
V27       0.0
V26       0.0
V25       0.0
V24       0.0
V23       0.0
V22       0.0
V21       0.0
V20       0.0
V19       0.0
V18       0.0
V17       0.0
V15       0.0
V1        0.0
V14       0.0
V13       0.0
V12       0.0
V11       0.0
V10       0.0
V9        0.0
V8        0.0
V7        0.0
V6        0.0
V5        0.0
V4        0.0
V3        0.0
V2        0.0
Class     0.0
dtype: float64

In [12]:
#let check the class column 

df['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

0 - Legit transaction
1 - Fraud transaction

We can see the data is highly unbalanced.
As most of the data is in legit transaction and if we feed this data to machine learning model it will always predict normal transaction only.



In [13]:
# seprating the class data 

legit=df[df.Class == 0]
fraud=df[df.Class == 1]

In [14]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [15]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [16]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [17]:
#compare both the transaction

df.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### 4. Handling Unbalanced Data

In [18]:
# We use under-sampling technique to handle the unbalanced data. This will give us even distribution of the data

legit_sample=legit.sample(n=492) #492 because the number of fraud transaction is 492

In [19]:
df1=pd.concat([legit_sample,fraud],axis=0)

In [20]:
df1.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
272249,164986.0,-0.260435,0.574557,0.538932,-0.200957,-0.459723,0.534495,-0.555944,0.471702,0.430043,...,0.329401,1.102122,-0.168549,0.227305,-0.643516,1.187706,-0.045739,0.116571,19.95,0
92183,63815.0,-1.16583,0.819012,-0.16903,-2.981078,-1.022692,-1.302196,-0.237402,0.714564,-0.511848,...,-0.25004,-0.217585,0.045091,0.068758,-0.187185,-0.092308,0.257367,0.135795,4.0,0
193337,130095.0,1.459566,-1.322938,-0.724056,0.56903,-0.857114,0.046052,-0.472163,0.160539,0.961299,...,0.377123,0.493162,-0.003649,0.770614,-0.450753,0.27795,-0.080946,-0.004634,267.0,0
133336,80341.0,-0.87324,0.813928,1.263245,0.914283,1.010495,0.836069,0.782115,-0.290924,0.11026,...,-0.105808,0.269372,-0.360123,-0.79166,-0.109809,-0.295501,-0.454747,-0.163762,16.18,0
220714,142277.0,0.12578,1.040774,-0.322932,-0.420362,0.783886,-1.316633,1.16342,-0.315149,-0.181823,...,0.282922,0.927515,-0.268507,-0.081307,0.000789,-0.175048,0.075224,0.067578,15.1,0


In [21]:
df1['Class'].value_counts() #here we have uniformly distributed data 

0    492
1    492
Name: Class, dtype: int64

In [22]:
df1.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93859.676829,0.032298,-0.127771,-0.013411,-0.0457,-0.067581,0.115402,0.004462,0.000375,-0.013212,...,0.017199,0.028784,-0.0051,0.008861,-0.028065,-0.005402,-0.013688,-0.039852,-0.004901,109.865386
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### 5. Splitting the data into features and target

In [23]:
X=df1.drop(columns='Class',axis=1)
Y=df1['Class']

In [24]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
272249  164986.0 -0.260435  0.574557  0.538932 -0.200957 -0.459723  0.534495   
92183    63815.0 -1.165830  0.819012 -0.169030 -2.981078 -1.022692 -1.302196   
193337  130095.0  1.459566 -1.322938 -0.724056  0.569030 -0.857114  0.046052   
133336   80341.0 -0.873240  0.813928  1.263245  0.914283  1.010495  0.836069   
220714  142277.0  0.125780  1.040774 -0.322932 -0.420362  0.783886 -1.316633   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [25]:
print(Y)

272249    0
92183     0
193337    0
133336    0
220714    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


### 6. Train/Test Split

In [26]:
X_train,X_test,Y_train,Y_test= train_test_split(X,Y,test_size=0.2, stratify=Y, random_state=2)

In [27]:
print(X.shape,X_train.shape,X_test.shape)

(984, 30) (787, 30) (197, 30)


### 7. Model Development -- Logistic Regression

In [28]:
model= LogisticRegression()

In [29]:
# training the model

model.fit(X_train,Y_train)

LogisticRegression()

### 8. Model Evaluation

In [30]:
X_train_prediction=model.predict(X_train)
train_accuracy=accuracy_score(X_train_prediction, Y_train)

In [31]:
print('Accuracy on training data :', train_accuracy)

Accuracy on training data : 0.9415501905972046


In [32]:
# 94.1 prediction score means our model can predict 94 correct prediction out of 100 sample

In [33]:
X_test_prediction=model.predict(X_test)
test_accuracy=accuracy_score(X_test_prediction,Y_test)

In [34]:
print('Accuracy on training data :', test_accuracy)

Accuracy on training data : 0.9238578680203046


In [38]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit

# ShuffleSplit is used to randomize the each fold
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
cross_val_score(LinearRegression(), X, Y, cv = cv)

<IPython.core.display.Javascript object>

array([0.59427068, 0.64265387, 0.58327485, 0.57790753, 0.60583296])

In [39]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor

from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet

def find_best_model_using_gridsearchcv(X,y):
  algos ={
      'linear_regression':{
          'model':LinearRegression(),
          'params': {
              'normalize':[True,False]
          }
      },
        'lasso': {
            'model': Lasso(),
            'params': {
                'alpha': [1,2],
                'selection': ['random', 'cyclic']
            }
        },
        'decision_tree': {
            'model': DecisionTreeRegressor(),
            'params': {
                'criterion' : ['mse','friedman_mse'],
                'splitter': ['best','random']
            }
        }
  }

  scores= []
  cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=2)

  for algo_name, config in algos.items():
      gs =  GridSearchCV(config['model'], config['params'], cv=cv, return_train_score=False)
      gs.fit(X,y)
      scores.append({
          'model': algo_name,
          'best_score': gs.best_score_,
          'best_params': gs.best_params_
      })

  return pd.DataFrame(scores,columns=['model','best_score','best_params'])

find_best_model_using_gridsearchcv(X,Y)

<IPython.core.display.Javascript object>

Unnamed: 0,model,best_score,best_params
0,linear_regression,0.609649,{'normalize': False}
1,lasso,0.367148,"{'alpha': 1, 'selection': 'cyclic'}"
2,decision_tree,0.613231,"{'criterion': 'mse', 'splitter': 'best'}"
