### Carregamento dos Dados

In [3]:
import pandas as pd

# Load the raw credit-card transactions from the project's data folder.
df = pd.read_csv(filepath_or_buffer='../data/creditcard.csv')

In [4]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
# (rows, columns) of the raw frame as loaded.
df.shape

(284807, 31)

### Exploração dos Dados

In [6]:
# Total number of missing cells in the dataframe:
# per-column null counts, then summed over all columns.

df.isna().sum().sum()

0

O resultado nos diz que não há nulos...

In [7]:
# Shape of the subset of rows that repeat an earlier row
# (duplicated() flags every occurrence after the first one).

is_repeat = df.duplicated()
df.loc[is_repeat].shape

(1081, 31)

In [8]:
# Shape of the duplicated rows that belong to the fraud class (Class == 1).
# Both conditions are combined into a single boolean mask built on df's own
# index. The chained form df[mask_a][mask_b] applies a full-length mask to an
# already-filtered frame, which forces pandas to reindex the key and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
df[df.duplicated() & (df['Class'] == 1)].shape

  df[df.duplicated()][df['Class'] == 1].shape


(19, 31)

Temos 1081 transações duplicadas, sendo que 19 delas são da classe minoritária (fraudulenta). Vamos eliminá-las...

In [9]:
# Keep only the first occurrence of each repeated row.
df = df.drop_duplicates(keep='first')

# Sanity check: no duplicated rows should remain, so the row count shown is 0.
df.loc[df.duplicated()].shape

(0, 31)

### Transformação dos Dados e SMOTE

Antes de aplicarmos o K-means, precisamos transformar nossos dados para um formato que melhor se adeque ao modelo. Além disso, aplicamos o SMOTE para compensar a classe minoritária.

In [10]:
from imblearn.over_sampling import SMOTE

Dividimos os dados em atributos (X) e rótulo (y).

In [11]:
# Split the frame into the label vector (the 'Class' column) and the
# feature matrix (every other column).
y = df['Class']
X = df.drop(columns=['Class'])

Aplicamos o SMOTE

In [12]:
# Oversample with SMOTE; random_state is fixed so the generated samples are
# reproducible across runs.
# NOTE(review): X still holds unscaled columns at this point (Amount is only
# standardized in a later cell). SMOTE picks nearest neighbours by distance,
# so large-magnitude columns may dominate neighbour selection — confirm this
# ordering is intended.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [13]:
# Rebuild a single frame from the resampled features and attach the
# resampled labels as the trailing 'Class' column.
df_resampled = pd.DataFrame(X_resampled, columns=X.columns).assign(Class=y_resampled)

Juntamos os dados reamostrados em um único DataFrame, que depois será exportado para um csv maior.

Vamos aplicar também uma padronização (standardization), para que os valores possam ser usados pelos modelos.

In [14]:
from sklearn.preprocessing import StandardScaler

# Standardize the Amount column and write the result back over the raw values.
# NOTE(review): only Amount is rescaled here; Time keeps its raw scale —
# confirm that is intended for the downstream model.
scaler = StandardScaler()
amount_column = df_resampled[["Amount"]].to_numpy()  # 2-D, one column, as the scaler expects
df_resampled["Amount"] = scaler.fit_transform(amount_column).ravel()


Visualização dos dados tratados

In [15]:
# Display the treated frame; pandas abbreviates the rendering to the
# leading and trailing rows.
df_resampled

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.000000,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.238968,0
1,0.000000,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,-0.424565,0
2,1.000000,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.273308,0
3,1.000000,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.121010,0
4,2.000000,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,-0.120640,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566501,41183.445750,-7.151786,5.337862,-10.524210,6.638176,-7.107253,-3.688276,-11.629728,4.641323,-6.040679,...,2.101089,-0.223411,0.029599,0.393875,-0.049155,0.544700,1.306198,0.223600,-0.357981,1
566502,14264.057012,-4.705909,8.250299,-14.559164,9.787990,-4.117161,-3.169121,-10.169571,4.500750,-5.177936,...,1.988732,0.194092,0.480124,-1.079562,-0.444215,0.557467,1.234855,0.627900,-0.429040,1
566503,135064.971368,0.407366,1.352354,-4.892782,3.554728,-1.282114,-1.271505,-0.977517,0.462476,-1.442286,...,0.765180,0.703300,0.814076,0.814930,-0.300969,0.168339,0.236775,0.037244,0.841150,1
566504,109361.849259,0.580032,2.773047,-5.790024,5.606953,0.398480,-0.621865,-2.954884,0.837074,-1.822486,...,0.311488,-0.810361,-0.067584,-0.632356,0.414552,0.073243,0.736440,0.358067,-0.417994,1


Exportamos os dados para um csv, para uso posterior.

In [16]:
# Export of the treated frame is currently disabled (left commented out).
# NOTE(review): to_csv writes the index as an extra unnamed first column by
# default; consider passing index=False when re-enabling this line.
#df_resampled.to_csv("../data/creditcard_treated.csv")