In [2]:
# importar bibliotecas
import pandas as pd
import numpy as np
from sklearn import preprocessing

# definir o nome das colunas
names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
         'marital-status', 'occupation', 'relationship', 'race', 'sex', 
        'capital-gain', 'capital-loss', 'hours-per-week','native-country', 
        'class']


# importar a base e colocar os nomes das colunas
df = pd.read_csv('adult.data', header=None, names=names)

In [3]:
df.keys()

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')

In [4]:
# excluir as colunas que não serão utilizadas no modelo
df = df.drop(['fnlwgt','education-num'], axis=1)

In [5]:
# exlcuir as linhas que não possuem valor em alguma feature
df.dropna(inplace=True)

In [8]:
non_numerical = ['workclass', 'education', 'marital-status', 'occupation', 
'relationship', 'race', 'sex', 'native-country']
le = preprocessing.LabelEncoder()
for x in non_numerical:
    le.fit(df[x])
    df[x] = le.transform(df[x].astype(str))


In [9]:
df

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,39,7,9,4,1,1,4,1,2174,0,40,39,<=50K
1,50,6,9,2,4,0,4,1,0,0,13,39,<=50K
2,38,4,11,0,6,1,4,1,0,0,40,39,<=50K
3,53,4,1,2,6,0,2,1,0,0,40,39,<=50K
4,28,4,9,2,10,5,2,0,0,0,40,5,<=50K
5,37,4,12,2,4,5,4,0,0,0,40,39,<=50K
6,49,4,6,3,8,1,2,0,0,0,16,23,<=50K
7,52,6,11,2,4,0,4,1,0,0,45,39,>50K
8,31,4,12,4,10,1,4,0,14084,0,50,39,>50K
9,42,4,9,2,4,0,4,1,5178,0,40,39,>50K


In [10]:
# criar uma matriz X e o vetor y
X = np.array(df.iloc[:, 0:12]) 	# features
y = np.array(df['class']) 

In [13]:
X

array([[39,  7,  9, ...,  0, 40, 39],
       [50,  6,  9, ...,  0, 13, 39],
       [38,  4, 11, ...,  0, 40, 39],
       ...,
       [58,  4, 11, ...,  0, 40, 39],
       [22,  4, 11, ...,  0, 20, 39],
       [52,  5, 11, ...,  0, 40, 39]])

In [12]:
y

array([' <=50K', ' <=50K', ' <=50K', ..., ' <=50K', ' <=50K', ' >50K'],
      dtype=object)

In [14]:
# criação de intervalo de números ímpares de K para KNN
neighbors = list(range(1,50,2))


# criação de intervalo de f para k-fold
cv_list = list(range(10,40))


# criação da estrutura de listas para armazenar valores
k_list = []
fold_list = []
cv_scores = []

In [None]:
# importar bibliotecas
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

# executar KNN e k-fold cross validation
for k in neighbors:
  for f in cv_list:
      knn = KNeighborsClassifier(n_neighbors=k)
      scores = cross_val_score(knn, X, y, cv=f, scoring='accuracy')
      cv_scores.append(scores.mean())  #popular listas
      k_list.append(k)                 
      fold_list.append(f)

In [None]:
# calcular o erro 
MSE = [1 - x for x in cv_scores]

# contrução do dataframe
df_1 = pd.DataFrame (k_list, columns=['k_list'])
df_2 = pd.DataFrame (fold_list, columns=['fold_list'])
df_3 = pd.DataFrame (MSE, columns=['MSE'])
df_knn = pd.concat([df_1, df_2, df_3], axis=1)

# retorna o menor erro obtido
optimal_k = min(df_knn['MSE'])

# retorna os valores de k e f do menor erro obtido
index_opt = df_knn[df_knn['MSE'] == optimal_k].index.item()


In [None]:
# importar bibliotecas
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# printar os resultados
print ("The optimal number of neighbors k is %d" % df_knn.loc[index_opt, 'k_list'] )
print ("The optimal number of folds f is %d" % df_knn.loc[index_opt, 'fold_list'] )
print ("Misclassification Error of %f" % optimal_k )

# plotar misclassification error vs k vs f (k-fold)
plt.rcParams['figure.figsize'] = (11,7)
fig = plt.figure()
ax = plt.axes(projection='3d')
color= ['red' if l == index_opt else 'grey' for l in df_knn.index.tolist()]
fc= ['red' if l == index_opt else 'none' for l in df_knn.index.tolist()]
ax.scatter3D(k_list, fold_list, MSE, s=20, facecolor = fc, edgecolors=color, 
depthshade=False)

# nomear o gráfico e os eixos
ax.set_title('Soluções algorítimo KNN')
ax.set_xlabel('k_list')
ax.set_ylabel('fold_list')
ax.set_zlabel('MSE')