In [125]:
import pandas as pd
import numpy as np

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report

import time

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn import tree
from sklearn.neural_network import MLPClassifier

In [126]:
dataset = pd.read_csv("data/churn-telecom.csv", index_col=False, squeeze=True)
dataset

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,yes,no,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,yes,no,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,no,yes,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,no,no,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,no,no,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,yes,no,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


Explorando um pouco melhor os dados de churn para entender as proporções

In [127]:
#phone number é um dado único para podermos contar
dataset.groupby('churn')['phone number'].count()

churn
False    2850
True      483
Name: phone number, dtype: int64

In [128]:
churnsTrue = dataset["churn"][dataset["churn"] == True]
print ("Percentual de Churn = "+str((churnsTrue.shape[0] / dataset["churn"].shape[0]) * 100 ))

Percentual de Churn = 14.491449144914492


Antes de mais nada, vamos transformar alguns atributos categóricos em numéricos, para utilizarmos também na análise e correlação dos mesmos com o churn

In [129]:
dataset["voice mail plan"] = pd.factorize(dataset["voice mail plan"])[0]
dataset["international plan"] = pd.factorize(dataset["international plan"])[0]
dataset

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
0,KS,128,415,382-4657,0,0,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,371-7191,0,0,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,358-1921,0,1,0,243.4,114,41.38,...,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,375-9999,1,1,0,299.4,71,50.90,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,330-6626,1,1,0,166.7,113,28.34,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,0,0,36,156.2,77,26.55,...,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,370-3271,0,1,0,231.1,57,39.29,...,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,328-8230,0,1,0,180.8,109,30.74,...,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,364-6381,1,1,0,213.8,105,36.35,...,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


Como podemos perceber, os atributos são bem intuitivos quanto a sua finalidade. Vamos aplicar um algoritmo de machine learning sobre os atributos com melhores correlações sem realizar nenhum tipo de transformação 

In [130]:
dataset.corr()

Unnamed: 0,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,churn
account length,1.0,-0.012463,0.024735,-0.002918,-0.004628,0.006216,0.03847,0.006214,-0.006757,0.01926,-0.006745,-0.008955,-0.013176,-0.00896,0.009514,0.020661,0.009546,-0.003796,0.016541
area code,-0.012463,1.0,0.048551,0.000747,-0.001994,-0.008264,-0.009646,-0.008264,0.00358,-0.011886,0.003607,-0.005825,0.016522,-0.005845,-0.018288,-0.024179,-0.018395,0.027572,0.006174
international plan,0.024735,0.048551,1.0,-0.006006,0.008745,0.049396,0.003755,0.049398,0.0191,0.006114,0.019106,-0.028905,0.012451,-0.028913,0.045871,0.017366,0.04578,-0.024522,0.259852
voice mail plan,-0.002918,0.000747,-0.006006,1.0,-0.956927,0.001684,0.011086,0.001686,-0.021545,0.006444,-0.021559,-0.006079,-0.015553,-0.006064,0.001318,-0.007618,0.001276,0.017824,0.102148
number vmail messages,-0.004628,-0.001994,0.008745,-0.956927,1.0,0.000778,-0.009548,0.000776,0.017562,-0.005864,0.017578,0.007681,0.007123,0.007663,0.002856,0.013957,0.002884,-0.013263,-0.089728
total day minutes,0.006216,-0.008264,0.049396,0.001684,0.000778,1.0,0.00675,1.0,0.007043,0.015769,0.007029,0.004323,0.022972,0.0043,-0.010155,0.008033,-0.010092,-0.013423,0.205151
total day calls,0.03847,-0.009646,0.003755,0.011086,-0.009548,0.00675,1.0,0.006753,-0.021451,0.006462,-0.021449,0.022938,-0.019557,0.022927,0.021565,0.004574,0.021666,-0.018942,0.018459
total day charge,0.006214,-0.008264,0.049398,0.001686,0.000776,1.0,0.006753,1.0,0.00705,0.015769,0.007036,0.004324,0.022972,0.004301,-0.010157,0.008032,-0.010094,-0.013427,0.205151
total eve minutes,-0.006757,0.00358,0.0191,-0.021545,0.017562,0.007043,-0.021451,0.00705,1.0,-0.01143,1.0,-0.012584,0.007586,-0.012593,-0.011035,0.002541,-0.011067,-0.012985,0.092796
total eve calls,0.01926,-0.011886,0.006114,0.006444,-0.005864,0.015769,0.006462,0.015769,-0.01143,1.0,-0.011423,-0.002093,0.00771,-0.002056,0.008703,0.017434,0.008674,0.002423,0.009233


Os quatro atributos que mais podem dizer se um cliente deu churn ou não são:

- total day minutes
- total day charge
- customer service calls
- international plan

In [131]:
def classifier(X_train,X_test,Y_train,Y_test):
    #Aplicando KNN
    print(" KNN ")
    ini = time.time() #Inicio do calculo do tempo

    knn=KNeighborsClassifier()
    knn.fit(X_train,Y_train)

    end = time.time()
    knn_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,knn.predict(X_test)))
    print()
    
    #Aplicando Naive Bayes
    print("Naive Bayes")
    ini = time.time() #Inicio do calculo do tempo

    nb=BernoulliNB()
    nb.fit(X_train,Y_train)

    end = time.time()
    nb_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,nb.predict(X_test)))
    print()
    
    #Aplicando Arvore de Decisao
    print("Árvore de Decisão")
    ini = time.time() #Inicio do calculo do tempo

    dtc=tree.DecisionTreeClassifier()
    dtc.fit(X_train,Y_train)

    end = time.time()
    dtc_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,dtc.predict(X_test)))

    #Aplicando MLP
    print("MLP")
    ini = time.time() #Inicio do calculo do tempo

    mlp = MLPClassifier(alpha=1, max_iter=1000)
    mlp.fit(X_train, Y_train)

    end = time.time()
    mlp_time = end - ini # Fim do calculo do tempo

    print(classification_report(Y_test,mlp.predict(X_test)))

    return knn, nb, dtc, mlp

In [132]:
def runModels(relevantFeatures):
    X_train,X_test,Y_train,Y_test=train_test_split(
        dataset[relevantFeatures], dataset['churn'], test_size=0.2, random_state=0
    )

    knn, nb, dtc, mlp = classifier(X_train,X_test,Y_train,Y_test)

runModels(["total day minutes", "total day charge", "customer service calls", "international plan"])

 KNN 
              precision    recall  f1-score   support

       False       0.90      0.96      0.93       579
        True       0.50      0.28      0.36        88

    accuracy                           0.87       667
   macro avg       0.70      0.62      0.64       667
weighted avg       0.85      0.87      0.85       667


Naive Bayes
              precision    recall  f1-score   support

       False       0.87      1.00      0.93       579
        True       0.00      0.00      0.00        88

    accuracy                           0.87       667
   macro avg       0.43      0.50      0.46       667
weighted avg       0.75      0.87      0.81       667


Árvore de Decisão
              precision    recall  f1-score   support

       False       0.91      0.89      0.90       579
        True       0.38      0.44      0.41        88

    accuracy                           0.83       667
   macro avg       0.64      0.67      0.65       667
weighted avg       0.84      0.83   

Como podemos notar, para somente esses atributos, no KNN e no MLP tivemos um mínimo resultado de precisão para false, porém todos ainda muito ruins para prever os casos positivos. Vamos tentar fazer algumas transformações nos atributos para melhorarmos o modelo.

Vamos tentar primeiro binarizar o atributo **number vmail messages**

In [133]:
def binarize(row, attr):
    if(row[attr] >= 1):
        return 1
    return 0

In [134]:
def around(row, attr):    
    return int(np.floor(row[attr]))

In [135]:
dataset["number vmail messages bin"] = dataset.apply (lambda row: binarize(row, "number vmail messages"), axis=1)

#mantém churn na última coluna
dataset["churn"] = dataset.pop("churn")

dataset

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,number vmail messages bin,churn
0,KS,128,415,382-4657,0,0,25,265.1,110,45.07,...,16.78,244.7,91,11.01,10.0,3,2.70,1,1,False
1,OH,107,415,371-7191,0,0,26,161.6,123,27.47,...,16.62,254.4,103,11.45,13.7,3,3.70,1,1,False
2,NJ,137,415,358-1921,0,1,0,243.4,114,41.38,...,10.30,162.6,104,7.32,12.2,5,3.29,0,0,False
3,OH,84,408,375-9999,1,1,0,299.4,71,50.90,...,5.26,196.9,89,8.86,6.6,7,1.78,2,0,False
4,OK,75,415,330-6626,1,1,0,166.7,113,28.34,...,12.61,186.9,121,8.41,10.1,3,2.73,3,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,0,0,36,156.2,77,26.55,...,18.32,279.1,83,12.56,9.9,6,2.67,2,1,False
3329,WV,68,415,370-3271,0,1,0,231.1,57,39.29,...,13.04,191.3,123,8.61,9.6,4,2.59,3,0,False
3330,RI,28,510,328-8230,0,1,0,180.8,109,30.74,...,24.55,191.9,91,8.64,14.1,6,3.81,2,0,False
3331,CT,184,510,364-6381,1,1,0,213.8,105,36.35,...,13.57,139.2,137,6.26,5.0,10,1.35,2,0,False


Analizando então a correlação desse novo atributo, temos

In [136]:
dataset.corr()

Unnamed: 0,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,total eve charge,total night minutes,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,number vmail messages bin,churn
account length,1.0,-0.012463,0.024735,-0.002918,-0.004628,0.006216,0.03847,0.006214,-0.006757,0.01926,-0.006745,-0.008955,-0.013176,-0.00896,0.009514,0.020661,0.009546,-0.003796,0.002918,0.016541
area code,-0.012463,1.0,0.048551,0.000747,-0.001994,-0.008264,-0.009646,-0.008264,0.00358,-0.011886,0.003607,-0.005825,0.016522,-0.005845,-0.018288,-0.024179,-0.018395,0.027572,-0.000747,0.006174
international plan,0.024735,0.048551,1.0,-0.006006,0.008745,0.049396,0.003755,0.049398,0.0191,0.006114,0.019106,-0.028905,0.012451,-0.028913,0.045871,0.017366,0.04578,-0.024522,0.006006,0.259852
voice mail plan,-0.002918,0.000747,-0.006006,1.0,-0.956927,0.001684,0.011086,0.001686,-0.021545,0.006444,-0.021559,-0.006079,-0.015553,-0.006064,0.001318,-0.007618,0.001276,0.017824,-1.0,0.102148
number vmail messages,-0.004628,-0.001994,0.008745,-0.956927,1.0,0.000778,-0.009548,0.000776,0.017562,-0.005864,0.017578,0.007681,0.007123,0.007663,0.002856,0.013957,0.002884,-0.013263,0.956927,-0.089728
total day minutes,0.006216,-0.008264,0.049396,0.001684,0.000778,1.0,0.00675,1.0,0.007043,0.015769,0.007029,0.004323,0.022972,0.0043,-0.010155,0.008033,-0.010092,-0.013423,-0.001684,0.205151
total day calls,0.03847,-0.009646,0.003755,0.011086,-0.009548,0.00675,1.0,0.006753,-0.021451,0.006462,-0.021449,0.022938,-0.019557,0.022927,0.021565,0.004574,0.021666,-0.018942,-0.011086,0.018459
total day charge,0.006214,-0.008264,0.049398,0.001686,0.000776,1.0,0.006753,1.0,0.00705,0.015769,0.007036,0.004324,0.022972,0.004301,-0.010157,0.008032,-0.010094,-0.013427,-0.001686,0.205151
total eve minutes,-0.006757,0.00358,0.0191,-0.021545,0.017562,0.007043,-0.021451,0.00705,1.0,-0.01143,1.0,-0.012584,0.007586,-0.012593,-0.011035,0.002541,-0.011067,-0.012985,0.021545,0.092796
total eve calls,0.01926,-0.011886,0.006114,0.006444,-0.005864,0.015769,0.006462,0.015769,-0.01143,1.0,-0.011423,-0.002093,0.00771,-0.002056,0.008703,0.017434,0.008674,0.002423,-0.006444,0.009233


Pudemos então perceber, que a estratégia de binarização utilizada não foi muito satisfatória para melhorar nosso modelo, apesar de ter melhorado levemente o atributo. Podemos tentar então um arredondamento com os campos fracionários mais relevantes.

In [137]:
dataset["total day minutes round"] = dataset.apply (lambda row: around(row, "total day minutes"), axis=1)
dataset["total day charge round"] = dataset.apply (lambda row: around(row, "total day charge"), axis=1)

#mantém churn na última coluna
dataset["churn"] = dataset.pop("churn")

dataset

Unnamed: 0,state,account length,area code,phone number,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,...,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,number vmail messages bin,total day minutes round,total day charge round,churn
0,KS,128,415,382-4657,0,0,25,265.1,110,45.07,...,91,11.01,10.0,3,2.70,1,1,265,45,False
1,OH,107,415,371-7191,0,0,26,161.6,123,27.47,...,103,11.45,13.7,3,3.70,1,1,161,27,False
2,NJ,137,415,358-1921,0,1,0,243.4,114,41.38,...,104,7.32,12.2,5,3.29,0,0,243,41,False
3,OH,84,408,375-9999,1,1,0,299.4,71,50.90,...,89,8.86,6.6,7,1.78,2,0,299,50,False
4,OK,75,415,330-6626,1,1,0,166.7,113,28.34,...,121,8.41,10.1,3,2.73,3,0,166,28,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,414-4276,0,0,36,156.2,77,26.55,...,83,12.56,9.9,6,2.67,2,1,156,26,False
3329,WV,68,415,370-3271,0,1,0,231.1,57,39.29,...,123,8.61,9.6,4,2.59,3,0,231,39,False
3330,RI,28,510,328-8230,0,1,0,180.8,109,30.74,...,91,8.64,14.1,6,3.81,2,0,180,30,False
3331,CT,184,510,364-6381,1,1,0,213.8,105,36.35,...,137,6.26,5.0,10,1.35,2,0,213,36,False


In [138]:
dataset.corr()

Unnamed: 0,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,total eve calls,...,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,number vmail messages bin,total day minutes round,total day charge round,churn
account length,1.0,-0.012463,0.024735,-0.002918,-0.004628,0.006216,0.03847,0.006214,-0.006757,0.01926,...,-0.013176,-0.00896,0.009514,0.020661,0.009546,-0.003796,0.002918,0.006224,0.005239,0.016541
area code,-0.012463,1.0,0.048551,0.000747,-0.001994,-0.008264,-0.009646,-0.008264,0.00358,-0.011886,...,0.016522,-0.005845,-0.018288,-0.024179,-0.018395,0.027572,-0.000747,-0.008242,-0.007647,0.006174
international plan,0.024735,0.048551,1.0,-0.006006,0.008745,0.049396,0.003755,0.049398,0.0191,0.006114,...,0.012451,-0.028913,0.045871,0.017366,0.04578,-0.024522,0.006006,0.04947,0.048945,0.259852
voice mail plan,-0.002918,0.000747,-0.006006,1.0,-0.956927,0.001684,0.011086,0.001686,-0.021545,0.006444,...,-0.015553,-0.006064,0.001318,-0.007618,0.001276,0.017824,-1.0,0.001792,0.000657,0.102148
number vmail messages,-0.004628,-0.001994,0.008745,-0.956927,1.0,0.000778,-0.009548,0.000776,0.017562,-0.005864,...,0.007123,0.007663,0.002856,0.013957,0.002884,-0.013263,0.956927,0.000662,0.0019,-0.089728
total day minutes,0.006216,-0.008264,0.049396,0.001684,0.000778,1.0,0.00675,1.0,0.007043,0.015769,...,0.022972,0.0043,-0.010155,0.008033,-0.010092,-0.013423,-0.001684,0.999986,0.999522,0.205151
total day calls,0.03847,-0.009646,0.003755,0.011086,-0.009548,0.00675,1.0,0.006753,-0.021451,0.006462,...,-0.019557,0.022927,0.021565,0.004574,0.021666,-0.018942,-0.011086,0.00681,0.006952,0.018459
total day charge,0.006214,-0.008264,0.049398,0.001686,0.000776,1.0,0.006753,1.0,0.00705,0.015769,...,0.022972,0.004301,-0.010157,0.008032,-0.010094,-0.013427,-0.001686,0.999986,0.999522,0.205151
total eve minutes,-0.006757,0.00358,0.0191,-0.021545,0.017562,0.007043,-0.021451,0.00705,1.0,-0.01143,...,0.007586,-0.012593,-0.011035,0.002541,-0.011067,-0.012985,0.021545,0.007078,0.007651,0.092796
total eve calls,0.01926,-0.011886,0.006114,0.006444,-0.005864,0.015769,0.006462,0.015769,-0.01143,1.0,...,0.00771,-0.002056,0.008703,0.017434,0.008674,0.002423,-0.006444,0.015785,0.015894,0.009233


Portanto, também não foi uma estratégia muito relevante. Por fim, uma última tentativa vou remover o número de telefone e encodar também o estado para atributos numéricos

In [139]:
dataset.drop(["phone number"], axis = 1, inplace=True)

label_encoder = preprocessing.LabelEncoder()

dataset['state'] = label_encoder.fit_transform(dataset['state'])
dataset

Unnamed: 0,state,account length,area code,international plan,voice mail plan,number vmail messages,total day minutes,total day calls,total day charge,total eve minutes,...,total night calls,total night charge,total intl minutes,total intl calls,total intl charge,customer service calls,number vmail messages bin,total day minutes round,total day charge round,churn
0,16,128,415,0,0,25,265.1,110,45.07,197.4,...,91,11.01,10.0,3,2.70,1,1,265,45,False
1,35,107,415,0,0,26,161.6,123,27.47,195.5,...,103,11.45,13.7,3,3.70,1,1,161,27,False
2,31,137,415,0,1,0,243.4,114,41.38,121.2,...,104,7.32,12.2,5,3.29,0,0,243,41,False
3,35,84,408,1,1,0,299.4,71,50.90,61.9,...,89,8.86,6.6,7,1.78,2,0,299,50,False
4,36,75,415,1,1,0,166.7,113,28.34,148.3,...,121,8.41,10.1,3,2.73,3,0,166,28,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,3,192,415,0,0,36,156.2,77,26.55,215.5,...,83,12.56,9.9,6,2.67,2,1,156,26,False
3329,49,68,415,0,1,0,231.1,57,39.29,153.4,...,123,8.61,9.6,4,2.59,3,0,231,39,False
3330,39,28,510,0,1,0,180.8,109,30.74,288.8,...,91,8.64,14.1,6,3.81,2,0,180,30,False
3331,6,184,510,1,1,0,213.8,105,36.35,159.6,...,137,6.26,5.0,10,1.35,2,0,213,36,False


In [140]:
#Normalizando os dados de predição na mesma escala
X = dataset.iloc[:,0:19].values
y = dataset.iloc[:,19].values

#normalizando todos os dados
from sklearn.preprocessing import StandardScaler
X_std = StandardScaler().fit_transform(X)
normalized = pd.DataFrame(X_std, index=dataset.index, columns=dataset.columns[0:19])

# adiciona coluna alvo ao dataframe normalizado
normalized['churn'] = dataset['churn']
normalized.head(10)

X = normalized.iloc[:,0:19].values
y = normalized.iloc[:,19].values
X

array([[-0.6786493 ,  0.67648946, -0.52360328, ..., -0.60119509,
        -0.0856905 , -0.42793202],
       [ 0.6031696 ,  0.14906505, -0.52360328, ..., -0.60119509,
         1.2411686 , -0.42793202],
       [ 0.33331299,  0.9025285 , -0.52360328, ...,  0.21153386,
         0.69715637, -1.1882185 ],
       ...,
       [ 0.87302621, -1.83505538,  1.71881732, ...,  0.61789834,
         1.3871231 ,  0.33235445],
       [-1.35329082,  2.08295458,  1.71881732, ...,  2.24335625,
        -1.87695028,  0.33235445],
       [ 1.07541867, -0.67974475, -0.52360328, ..., -0.19483061,
         1.2411686 , -1.1882185 ]])

# Smote

fazemos dados sentéticos apenas no treinamento: SMOTE cria observações sintéticas da classe minoritária (churn) por:

- Encontrando os k-vizinhos mais próximos para observações de classes minoritárias (encontrando observações semelhantes).

- Escolher aleatoriamente um dos k-vizinhos mais próximos e usá-lo para criar uma nova observação semelhante, mas ajustada aleatoriamente.

vamos testar o smote apenas para o conjunto de treinamento

In [141]:
from imblearn.over_sampling import SMOTE

def runModelsWithSmote(X, Y):
    X_train,X_test,Y_train,Y_test=train_test_split(X, Y, test_size=0.2, random_state=0)

    sm = SMOTE(random_state=0, k_neighbors=4)
    X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)
    X_test_res, Y_test_res = sm.fit_resample(X_test, Y_test)

    knn, nb, dtc, mlp = classifier(X_train_res,X_test_res,Y_train_res,Y_test_res)

runModelsWithSmote(X, Y=y)

 KNN 
              precision    recall  f1-score   support

       False       0.82      0.77      0.79       579
        True       0.78      0.83      0.81       579

    accuracy                           0.80      1158
   macro avg       0.80      0.80      0.80      1158
weighted avg       0.80      0.80      0.80      1158


Naive Bayes
              precision    recall  f1-score   support

       False       0.67      0.65      0.66       579
        True       0.66      0.68      0.67       579

    accuracy                           0.67      1158
   macro avg       0.67      0.67      0.67      1158
weighted avg       0.67      0.67      0.67      1158


Árvore de Decisão
              precision    recall  f1-score   support

       False       0.88      0.89      0.89       579
        True       0.89      0.88      0.89       579

    accuracy                           0.89      1158
   macro avg       0.89      0.89      0.89      1158
weighted avg       0.89      0.89   

Depois da aplicação da técnica de SMOTE os resultados aparentemente ficam bem melhores, para o problema de classes minoritárias. Decision Tree se mostrou mais eficiente para ambas as labels, sendo um algoritmo melhor para o problema.