# Aula 5 - Machine Learning

# Demo 11 - Machine learning - Regressão Logística

### Algoritmo para classificação de transações como possível fraude ou não fraude.

In [None]:
# Import de nossas bibliotecas

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score,precision_score,recall_score)

In [None]:
# Upload the dataset file into the Colab runtime.
# Ref. - https://www.kaggle.com/pwnpen/payment

from google.colab import files

uploaded = files.upload()

# Report the name and size (in bytes) of every uploaded file.
for filename, content in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=filename, length=len(content)))

Saving payment_fraud.csv to payment_fraud.csv
User uploaded file "payment_fraud.csv" with length 1479492 bytes


In [None]:
# Load the uploaded .csv file into a DataFrame
csv_path = 'payment_fraud.csv'
df = pd.read_csv(csv_path)

In [None]:
# Show the column headers and five randomly sampled rows of the dataset
# (sample() draws random rows, not the first five)
df.sample(5)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethod,paymentMethodAgeDays,label
23800,371,1,5.034622,creditcard,0.0,0
28852,279,1,5.040929,creditcard,0.0,0
24603,1338,1,4.057414,creditcard,643.348611,0
6173,17,1,4.461622,creditcard,0.0,0
18954,15,1,4.836982,creditcard,14.118056,0


In [None]:
# Count the rows of the dataset
df.shape[0]

39221

In [None]:
# Inspect the column labels of the dataset
df.columns

Index(['accountAgeDays', 'numItems', 'localTime', 'paymentMethod',
       'paymentMethodAgeDays', 'label'],
      dtype='object')

In [None]:
# List the distinct values found in the 'paymentMethod' column
payment_methods = df['paymentMethod']
payment_methods.unique()

array(['paypal', 'storecredit', 'creditcard'], dtype=object)

In [None]:
# Count how many rows exist for each distinct value of 'numItems'
rows_per_num_items = df.groupby('numItems').size()
rows_per_num_items.reset_index()

Unnamed: 0,numItems,0
0,1,37398
1,2,1348
2,3,164
3,4,42
4,5,168
5,6,15
6,7,5
7,8,5
8,9,1
9,10,71


In [None]:
# One-hot encode the 'paymentMethod' column into indicator (dummy) columns so
# the models receive no string-valued feature - Ref. https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
df_one_hot = pd.get_dummies(df, columns=['paymentMethod'])

In [None]:
# Show three randomly sampled rows of the one-hot encoded dataset
df_one_hot.sample(3)

Unnamed: 0,accountAgeDays,numItems,localTime,paymentMethodAgeDays,label,paymentMethod_creditcard,paymentMethod_paypal,paymentMethod_storecredit
11692,2000,1,4.876771,857.723611,0,1,0,0
20270,32,1,4.505662,0.002083,0,1,0,0
37396,811,1,4.745402,0.0,0,1,0,0


In [None]:
# Split into training/test sets using an 80/20 ratio
# (Pareto rule - Ref. https://pt.wikipedia.org/wiki/Princ%C3%ADpio_de_Pareto)
features = df_one_hot.drop(columns='label')
target = df_one_hot['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [None]:
# Number of feature columns in the training set
X_train.shape[1]

7

In [None]:
# Build the model using Logistic Regression (Ref. https://edisciplinas.usp.br/pluginfile.php/3769787/mod_resource/content/1/09_RegressaoLogistica.pdf)
# max_iter=20 stopped the solver before it converged (scikit-learn emitted
# "TOTAL NO. of ITERATIONS REACHED LIMIT"), so the fitted coefficients were not
# the optimum. Allow enough iterations for the solver to converge on these
# unscaled features; scaling the features first is the alternative remedy.
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the held-out test data
y_pred = clf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluate accuracy, precision and recall of the model.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them
# swapped exchanges precision and recall (accuracy is symmetric).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("acuracia", accuracy)
print("precisao", precision)
print("recall", recall)

acuracia 1.0
precisao 1.0
recall 1.0


In [None]:
# Train a Decision Tree on the same split.
# random_state is fixed because the tree permutes features at each split, so
# without a seed repeated runs can produce different trees.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

# Evaluate accuracy, precision and recall of the model.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them
# swapped exchanges precision and recall (accuracy is symmetric).
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print("acuracia", accuracy)
print("precisao", precision)
print("recall", recall)

acuracia 1.0
precisao 1.0
recall 1.0
