# Aula 6 - Machine Learning

# Demo 13 - Detecção de fraude com SVM

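Dataset: [Credit card data](https://www.kaggle.com/dalpozz/creditcardfraud). The notebook expects it as `creditcard.csv`, where the `Class` column equals 1 for fraudulent transactions.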

In [1]:
import numpy as np
import pandas as pd

In [None]:
# Upload the dataset through the Colab file picker when running on Colab.
# Outside Colab the `google.colab` package does not exist, so skip the upload
# and rely on 'creditcard.csv' already being present in the working directory.
try:
  from google.colab import files
except ImportError:
  files = None

uploaded = files.upload() if files is not None else {}

# Report each uploaded file with its size in bytes.
for fn, content in uploaded.items():
  print('Arquivo "{name}" importado pelo usuário com tamanho de {length} bytes'.format(name=fn, length=len(content)))

In [None]:
def get_fraud_percentage(path='creditcard.csv'):
    """Return the percentage of rows whose `Class` label equals 1.

    Args:
        path: CSV file to read. Defaults to 'creditcard.csv' in the working
            directory, which is what the rest of the notebook loads.

    Returns:
        float: 100 * (number of rows with Class == 1) / (total number of rows).

    Raises:
        ZeroDivisionError: if the file contains no data rows.
    """
    # Only the label column is needed, so avoid parsing every feature column.
    labels = pd.read_csv(path, usecols=['Class'])['Class']
    fraud_count = int((labels == 1).sum())
    return (fraud_count / len(labels)) * 100

# Show the fraud percentage as this cell's output.
get_fraud_percentage()

In [None]:
from sklearn.model_selection import train_test_split

# Load the full table once; the cells below share these module-level names.
df = pd.read_csv('creditcard.csv')

# Every column except the last is used as a feature; the last one is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# No explicit sizes are passed, so the library defaults apply; the fixed seed
# keeps the split repeatable across runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)

In [None]:
def get_dummy_classifier_accuracy_and_recall():
    """Fit a DummyClassifier baseline and evaluate it on the held-out split.

    Trains on the notebook-level X_train / y_train and evaluates on
    X_test / y_test.

    Returns:
        tuple: (accuracy, recall) measured on the test split.
    """
    from sklearn.metrics import recall_score
    from sklearn.dummy import DummyClassifier

    baseline = DummyClassifier().fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    # Recall is computed from the explicit predictions; accuracy comes from
    # the estimator's own score() on the same test split.
    test_recall = recall_score(y_test, predictions)
    test_accuracy = baseline.score(X_test, y_test)
    return (test_accuracy, test_recall)

dummyAccuracy, dummyRecall = get_dummy_classifier_accuracy_and_recall()

print("Dummy Accuracy: ", dummyAccuracy)
print("Dummy Recall: ", dummyRecall)

In [None]:
def get_accuracy_recall_and_precision():
    """Fit a default SVC and report accuracy, recall and precision.

    Trains on the notebook-level X_train / y_train and evaluates on
    X_test / y_test.

    Returns:
        tuple: (accuracy, recall, precision) measured on the test split.
    """
    from sklearn.metrics import accuracy_score, precision_score, recall_score
    from sklearn.svm import SVC

    svc = SVC()
    svc.fit(X_train, y_train)
    y_pred = svc.predict(X_test)

    # svc.score() would run predict() over the whole test set a second time;
    # computing accuracy from y_pred gives the same value with one pass.
    acc = accuracy_score(y_test, y_pred)
    rc = recall_score(y_test, y_pred)
    pc = precision_score(y_test, y_pred)

    return (acc, rc, pc)

accuracy, recall, precision = get_accuracy_recall_and_precision()

print("Accuracy: ", accuracy)
print("Recall: ", recall)
print("Precision:", precision)

In [None]:
def get_confusion_matrix(threshold=-220, C=1e9, gamma=1e-07):
    """Confusion matrix of an SVC whose decision scores are cut at `threshold`.

    Trains on the notebook-level X_train / y_train, scores X_test with
    decision_function, and labels a sample positive when its score exceeds
    `threshold`.

    Args:
        threshold: cut-off applied to the decision-function scores.
        C: regularisation parameter passed to SVC.
        gamma: kernel coefficient passed to SVC.

    Returns:
        The matrix produced by sklearn.metrics.confusion_matrix for y_test
        versus the thresholded scores.
    """
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC

    svc = SVC(C=C, gamma=gamma)
    svc.fit(X_train, y_train)

    decision_scores = svc.decision_function(X_test)

    # Thresholding the raw scores (instead of calling predict) moves the
    # operating point away from the classifier's default cut-off.
    return confusion_matrix(y_test, decision_scores > threshold)

get_confusion_matrix()

In [None]:
def _plot_curve_with_marker(x, y, marker_xy, xlabel, ylabel, label, filename):
    """Draw one curve on a square [0, 1.01] axes, mark one point, save it as SVG."""
    import matplotlib.pyplot as plt

    fig, ax = plt.subplots(figsize=(8, 6), dpi=400)
    ax.set_xlim([0.0, 1.01])
    ax.set_ylim([0.0, 1.01])
    ax.plot(x, y, label=label)
    ax.plot(marker_xy[0], marker_xy[1], 'x', markersize=0.5, fillstyle='none', c='r', mew=3)
    ax.set_xlabel(xlabel, fontsize=16)
    ax.set_ylabel(ylabel, fontsize=16)
    # Act on the axes that holds the curve; a bare plt.axes() call creates a
    # new, empty axes on current Matplotlib releases.
    ax.set_aspect('equal')
    # Save before showing so the file is written from the fully drawn figure.
    fig.savefig(filename, format="svg")
    plt.show()


def plot_precision_recall_and_FPR_to_TPR():
    """Plot the precision-recall and ROC curves of a logistic regression.

    Trains LogisticRegression on the notebook-level X_train / y_train and
    scores X_test with decision_function. Each curve is shown and saved as an
    SVG file ("PrecisionRecallCurve.svg" and "TPR-FPR Curve.svg"). A marker is
    drawn at the recall found where precision is closest to 0.75, and at the
    true positive rate found where the false positive rate is closest to 0.16.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import precision_recall_curve, roc_curve

    cls = LogisticRegression()
    cls.fit(X_train, y_train)
    y_score = cls.decision_function(X_test)

    precision, recall, _ = precision_recall_curve(y_test, y_score)
    fpr, tpr, _ = roc_curve(y_test, y_score)

    # Look up the curve point nearest to each queried operating point.
    recall_queried = recall[np.argmin(np.abs(precision - 0.750))]
    tpr_queried = tpr[np.argmin(np.abs(fpr - 0.160))]

    _plot_curve_with_marker(
        precision, recall, (0.75, recall_queried),
        'Precision', 'Recall',
        'Precision-Recall Curve', "PrecisionRecallCurve.svg",
    )
    _plot_curve_with_marker(
        fpr, tpr, (0.16, tpr_queried),
        'False Positive Rate', 'True Positive Rate',
        'FPR - TPR Curve', "TPR-FPR Curve.svg",
    )

plot_precision_recall_and_FPR_to_TPR()

In [None]:
def get_mean_test_scores():
    """Grid-search logistic regression over penalty and C, scored by recall.

    Runs a 3-fold GridSearchCV on the notebook-level X / y over
    penalty in ('l1', 'l2') and C in (0.01, 0.1, 1, 10, 100).

    Returns:
        The fitted search's `cv_results_` mapping (it includes the
        'mean_test_score' entry used later for the heatmap).
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    # The grid includes the L1 penalty, which the default 'lbfgs' solver of
    # current scikit-learn rejects; 'liblinear' supports both L1 and L2.
    pipeline = Pipeline([('clf', LogisticRegression(solver='liblinear'))])

    parameters = {'clf__penalty': ('l1', 'l2'), 'clf__C': (0.01, 0.1, 1, 10, 100)}
    grid = GridSearchCV(pipeline, param_grid=parameters, scoring='recall', verbose=2, cv=3)
    print(pipeline.get_params().keys())
    grid.fit(X, y)
    return grid.cv_results_

results = get_mean_test_scores()

In [None]:
# Display the raw cv_results_ mapping returned by the grid search above.
results

In [None]:
def GridSearch_Heatmap(scores, penalties=('l1', 'l2'), C_values=(0.01, 0.1, 1, 10, 100)):
    """Draw grid-search scores as a heatmap with C on rows and penalty on columns.

    Args:
        scores: array-like holding len(C_values) * len(penalties) scores.
            Assumes the penalty varies fastest within each C value - confirm
            against the ordering of the search results being passed in.
        penalties: column tick labels, one per penalty tried.
        C_values: row tick labels, one per C value tried.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    # Derive the grid shape from the labels instead of hard-coding (5, 2).
    grid_scores = np.asarray(scores).reshape(len(C_values), len(penalties))

    fig, ax = plt.subplots()
    sns.heatmap(grid_scores, xticklabels=list(penalties), yticklabels=list(C_values), ax=ax)
    ax.set_xlabel('penalty')
    ax.set_ylabel('C')
    # Keep the C labels horizontal so they stay readable.
    plt.setp(ax.get_yticklabels(), rotation=0)

GridSearch_Heatmap(results['mean_test_score'].reshape(5, 2))