In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from sklearn.model_selection import cross_val_score
from sklearn import metrics

import random
from scipy import stats
from scipy.stats import mode, norm, skew
from math import sqrt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read dataset
d = pd.read_csv('Consumptions.csv', sep = ';')
d

Unnamed: 0,CUPS,ZipCode,Rate,Date,Hour,Value
0,F0216555032235ABF9A1515F221C3F7F,11403,2.0A,2016-06-01,1,502
1,F0216555032235ABF9A1515F221C3F7F,11403,2.0A,2016-06-01,2,418
2,F0216555032235ABF9A1515F221C3F7F,11403,2.0A,2016-06-01,3,418
3,F0216555032235ABF9A1515F221C3F7F,11403,2.0A,2016-06-01,4,418
4,F0216555032235ABF9A1515F221C3F7F,11403,2.0A,2016-06-01,5,418
...,...,...,...,...,...,...
3591726,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2.0DHA,2017-06-30,22,288
3591727,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2.0DHA,2017-06-30,23,480
3591728,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2.0DHA,2017-06-30,23,480
3591729,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2.0DHA,2017-06-30,24,356


In [4]:
d.head()
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3591731 entries, 0 to 3591730
Data columns (total 6 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   CUPS     object
 1   ZipCode  int64 
 2   Rate     object
 3   Date     object
 4   Hour     int64 
 5   Value    int64 
dtypes: int64(3), object(3)
memory usage: 164.4+ MB


In [5]:
d.isna().sum()

CUPS       0
ZipCode    0
Rate       0
Date       0
Hour       0
Value      0
dtype: int64

In [7]:
d['Rate'].unique()


array(['2.0A', '2.0DHA', '2.1A', '2.1DHA', '3.0A'], dtype=object)

In [10]:
d.drop('CUPS',axis=1).head(30)

Unnamed: 0,ZipCode,Rate,Date,Hour,Value
0,11403,2.0A,2016-06-01,1,502
1,11403,2.0A,2016-06-01,2,418
2,11403,2.0A,2016-06-01,3,418
3,11403,2.0A,2016-06-01,4,418
4,11403,2.0A,2016-06-01,5,418
5,11403,2.0A,2016-06-01,6,418
6,11403,2.0A,2016-06-01,7,418
7,11403,2.0A,2016-06-01,8,417
8,11403,2.0A,2016-06-01,9,1084
9,11403,2.0A,2016-06-01,10,2197


In [92]:
def encode_type(x):
    if x == '2.0A':
        return 1
    if x == '2.0DHA':
        return 2
    if x == '2.1A':
        return 3
    if x == '2.1DHA':
        return 4
    if x == '3.0A':
        return 5
    
d['Rate'] = d['Rate'].transform(encode_type)
d

Unnamed: 0,CUPS,ZipCode,Rate,Date,Hour,Value
0,F0216555032235ABF9A1515F221C3F7F,11403,1,2016-06-01,1,502
1,F0216555032235ABF9A1515F221C3F7F,11403,1,2016-06-01,2,418
2,F0216555032235ABF9A1515F221C3F7F,11403,1,2016-06-01,3,418
3,F0216555032235ABF9A1515F221C3F7F,11403,1,2016-06-01,4,418
4,F0216555032235ABF9A1515F221C3F7F,11403,1,2016-06-01,5,418
...,...,...,...,...,...,...
3591726,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2,2017-06-30,22,288
3591727,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2,2017-06-30,23,480
3591728,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2,2017-06-30,23,480
3591729,22C0D39C8F5117F526F8A13BBE0D28C6,39300,2,2017-06-30,24,356


In [93]:
rate1 = d[d.Rate == 1]
rate2 = d[d.Rate == 2]
rate3 = d[d.Rate == 3]
rate4 = d[d.Rate == 4]
rate5 = d[d.Rate == 5]

print(len(rate1))
print(len(rate2))
print(len(rate3))
print(len(rate4))
print(len(rate5))

985079
2292787
135069
170085
8711


In [99]:
rate2_indexes = rate2.index
rate4_indexes = rate4.index
rate3_indexes = rate3.index
rate1_indexes = rate1.index

# get as many fraud indexes (randomly) as non fraud occurrences
np.random.seed(42)
random_rate2_indexes = np.random.choice(rate2_indexes, len(rate5))
random_rate4_indexes = np.random.choice(rate4_indexes, len(rate5))
random_rate3_indexes = np.random.choice(rate3_indexes, len(rate5))
random_rate1_indexes = np.random.choice(rate1_indexes, len(rate5))

# Keep those entries in not fraud
rate2 = d.loc[random_rate2_indexes]
rate4 = d.loc[random_rate4_indexes]
rate3 = d.loc[random_rate3_indexes]
rate1 = d.loc[random_rate1_indexes]

# We will form the balanced dataset concatenating fraud and non_fraud
d = rate5.append(rate2)
d = rate5.append(rate4)
d = rate5.append(rate3)
d = rate5.append(rate1)

# Check the result
print(len(rate1))
print(len(rate2))
print(len(rate3))
print(len(rate4))
print(len(rate5))

KeyError: "None of [Int64Index([1870858, 2830124, 2476553,   18946, 1049401, 1036457, 1159253,\n            1902366,  610172,  607196,\n            ...\n             747211, 3367824,  476463, 3054122, 1733634,  888963,  698381,\n            2567893, 2882801, 3183493],\n           dtype='int64', length=2292787)] are in the [index]"