# 1.Imports

In [42]:
import pandas as pd
import pickle
import numpy as np
from sklearn import metrics as mt
from sklearn.cluster import AffinityPropagation

# 2. Funcao para carregar os datasets

In [43]:
def Load_Data_Set(df):
    df1 = pd.read_csv(df)
    return df1

# 3. Criando os dataframes

In [44]:
df = "../3_ensaio_clusterizacao/dados_agrupamento/X_dataset.csv"

In [45]:
df1 = Load_Data_Set(df)

In [46]:
df = df1.copy()

In [47]:
display(df)

Unnamed: 0,alcohol,malic_acid,ash,ash_alcanity,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280,proline
0,1.518613,0.191700,0.232053,-1.169593,1.913905,0.627586,0.573840,-0.659563,1.224884,0.251717,0.455285,0.970696,0.561341
1,0.246290,0.205534,-0.827996,-2.490847,0.018145,0.575862,0.510549,-0.820719,-0.544721,-0.293321,0.463415,0.780220,0.550642
2,0.196879,0.320158,1.109334,-0.268738,0.088358,0.627586,0.611814,-0.498407,2.135968,0.269020,0.447154,0.695971,0.646933
3,1.691550,0.239130,0.487926,-0.809251,0.930918,0.989655,0.664557,-0.981875,1.032155,1.186068,0.308943,0.798535,0.857347
4,0.295700,0.365613,1.840403,0.451946,1.281985,0.627586,0.495781,0.226796,0.401404,-0.319276,0.455285,0.608059,0.325963
...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,0.876275,0.970356,0.305159,0.301803,-0.332922,0.241379,0.056962,1.274310,-0.930179,1.142811,0.130081,0.172161,0.329529
174,0.493343,0.626482,0.414820,1.052516,0.158572,0.282759,0.086498,0.549108,-0.316950,0.969783,0.178862,0.106227,0.336662
175,0.332758,0.699605,-0.389355,0.151661,1.422412,0.210345,0.073840,0.549108,-0.422075,2.224236,0.089431,0.106227,0.397290
176,0.209232,0.365613,0.012732,0.151661,1.422412,0.231034,0.071730,1.354888,-0.229346,1.834923,0.097561,0.128205,0.400856


# 4. Função para treinar o modelo e avaliar as métricas dele

In [48]:
#Função para Treinar o Modelo encima dos dados disponiveis e auferir as métricas

#Entrada: dataframe, parametro preference

#Saida: dataframe que contém o preference e a metrica obtida no treinamento

def Model_Training(df, preference): 
    
    features = ['alcohol', 'malic_acid', 'ash', 'ash_alcanity', 'magnesium',
       'total_phenols', 'flavanoids', 'nonflavanoid_phenols',
       'proanthocyanins', 'color_intensity', 'hue', 'od280', 'proline']
    
       
    #Preparação dos dados de treino
    X = df.loc[:, features]     
    
    #Treinamento do algoritmo Affinity Propagation voltado para clusterizacao 
    
    #No caso desse algoritmo precisamos ajustar o parametro preferences, que influencia as possibilidades de um determinado
    #ponto encontrar os demais como exemplos de seu cluster.
    
    #O parametro a quantidade de vezes (iteracoes )que os pontos procuram encontrar semelhança com os demais pontos
    
    
    affinity_propagation = AffinityPropagation(preference=preference)
    
    #Fazendo previsoes
    labels = affinity_propagation.fit_predict(X)
    

    #Performance do Algoritmo
    ss_avg = np.round(mt.silhouette_score(X, labels),3)
    
    #Montando um df com a metrica     
    
    d = { 'Values': [preference,ss_avg]}
    
    df_metrics =  pd.DataFrame(data = d, index = ['n_clusters', 'silhouette_score'])
    
    return df_metrics
   

# 5.Treinamento do modelo com diferentes parametros n_clusters e verificação das metricas obtidas



### 5.1 Parametro preference por default = None

In [49]:
df_default = Model_Training(df, None)
df_default

Unnamed: 0,Values
n_clusters,
silhouette_score,0.169


### 5.2 preferences = -8 

In [50]:
df_preferences8 = Model_Training(df, -8)
df_preferences8

Unnamed: 0,Values
n_clusters,-8.0
silhouette_score,0.168


### 5.3 preferences = -18

In [51]:
df_preferences18 = Model_Training(df, -18)
df_preferences18

Unnamed: 0,Values
n_clusters,-18.0
silhouette_score,0.155


### 5.4 preferences = -28

In [52]:
df_preferences28 = Model_Training(df, -28)
df_preferences28

Unnamed: 0,Values
n_clusters,-28.0
silhouette_score,0.159


### 5.5 preferences = -38

In [53]:
df_preferences38 = Model_Training(df, -38)
df_preferences38

Unnamed: 0,Values
n_clusters,-38.0
silhouette_score,0.188


### 5.6 preferences = -48

In [54]:
df_preferences48 = Model_Training(df, -48)
df_preferences48

Unnamed: 0,Values
n_clusters,-48.0
silhouette_score,0.202


### 5.7 preferences = -55

In [55]:
df_preferences55 = Model_Training(df, -55)
df_preferences55

Unnamed: 0,Values
n_clusters,-55.0
silhouette_score,0.201


### 5.8 preferences = -60

In [56]:
df_preferences60 = Model_Training(df, -60)
df_preferences60

Unnamed: 0,Values
n_clusters,-60.0
silhouette_score,0.183


### 5.9 preferences = -88

In [57]:
df_preferences88 = Model_Training(df, -88)
df_preferences88

Unnamed: 0,Values
n_clusters,-88.0
silhouette_score,0.159


### 5.10 preferences = -68

In [58]:
df_preferences68 = Model_Training(df, -68)
df_preferences68

Unnamed: 0,Values
n_clusters,-68.0
silhouette_score,0.173


# 6. Concatenando os dataframes 

In [59]:
df_final = pd.concat([df_default, df_preferences8, df_preferences18, df_preferences28,
                     df_preferences38, df_preferences48, df_preferences60,
                     df_preferences68, df_preferences88], axis = 1)
df_final

Unnamed: 0,Values,Values.1,Values.2,Values.3,Values.4,Values.5,Values.6,Values.7,Values.8
n_clusters,,-8.0,-18.0,-28.0,-38.0,-48.0,-60.0,-68.0,-88.0
silhouette_score,0.169,0.168,0.155,0.159,0.188,0.202,0.183,0.173,0.159


# 7. Salvando o dataframe que contém o algoritmo treinado com a melhor performance

In [62]:
df_preferences48 = df_preferences48.rename(columns ={"Values": "Affinity Propagation"})
df_preferences48

Unnamed: 0,Affinity Propagation
n_clusters,-48.0
silhouette_score,0.202


In [63]:
with open('arquivo_affinity_propagation.pkl', 'wb') as arquivo_affinity_propagation:
    pickle.dump(df_preferences48,arquivo_affinity_propagation) 

In [64]:
arquivo_affinity_propagation.close()