## Load packages & data

### Packages

In [None]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
import tkinter
import matplotlib
import matplotlib.pyplot as plt
# Select the TkAgg backend for matplotlib figures.
matplotlib.use('TkAgg')

### Data

In [None]:
# Locations of the two input CSV files (both live in the same folder).
path = "~/Documents/jupyter-workplace/data-mining-assignment/reformed_all_data.csv"
path_cbo = "~/Documents/jupyter-workplace/data-mining-assignment/CBO2002.csv"

# Main dataset; the first CSV column becomes the row index.
data = pd.read_csv(path, index_col=0)

# CBO2002 table (presumably the occupation-code lookup; confirm against the file).
# It is decoded with 'unicode_escape' and parsed with the pure-Python engine.
cbo = pd.read_csv(
    path_cbo,
    index_col=0,
    encoding="unicode_escape",
    engine="python",
)

In [None]:
# Preview the first ten rows of the main dataset.
data.head(10)

# Exploratory Analysis

In [None]:
# Stratify the records by the value of the RACACOR column.
data_branca = data.loc[data['RACACOR'].eq('Branca')]
data_preta = data.loc[data['RACACOR'].eq('Preta')]
data_pardo = data.loc[data['RACACOR'].eq('Parda')]

# Pool the 'Parda' and 'Preta' strata into a single frame (Parda rows first).
frames = [data_pardo, data_preta]
data_pp = pd.concat(frames)

In [None]:
# Report the (rows, columns) shape of the full dataset and of each stratum.
for label, frame in (
    ("Total:", data),
    ("Gestantes brancas:", data_branca),
    ("Gestantes pretas ou pardas:", data_pp),
):
    print(label, frame.shape, "\n")

-----------
## Descriptive analysis

**Perfil da gestante**
- Perfil de ocupação/escolaridade/etnia

**Perfil do desfecho ao nascimento**
- APGAR
- Prematuridade

---------------------------

In [None]:
# Frequency report helper
def frequency_report(data, parameter, top_n=20):
    """Tabulate the most frequent values of one column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the column to summarise.
    parameter : str
        Label of the column whose values are counted.
    top_n : int, default 20
        Number of most frequent values to keep in the report.

    Returns
    -------
    pandas.DataFrame
        One row per reported value, most frequent first, with columns
        ``"Titulo"`` (the value), ``"Count"`` (its number of occurrences) and
        ``"Porcentagem (%)"`` (its share, rounded to two decimals). The share
        is relative to every counted value of the column, not only the
        ``top_n`` rows shown; missing values are excluded because
        ``value_counts`` drops them by default.
    """
    counts = data[parameter].value_counts()
    total = counts.sum()  # vectorised sum instead of Python's built-in sum()
    top = counts.head(n=top_n)
    percent = (top / total * 100).round(2)

    return pd.DataFrame(
        {
            "Titulo": top.index.tolist(),
            "Count": top.tolist(),
            "Porcentagem (%)": percent.tolist(),
        },
        columns=["Titulo", "Count", "Porcentagem (%)"],
    )

In [None]:
# Occupation frequencies over the whole dataset.
frequency_report(data, 'CODOCUPMAE')

In [None]:
# Occupation frequencies for the 'Branca' stratum.
frequency_report(data_branca, 'CODOCUPMAE')

In [None]:
# Occupation frequencies for the pooled 'Preta' + 'Parda' stratum.
frequency_report(data_pp, 'CODOCUPMAE')

In [None]:
#sns.set_theme(style="darkgrid")

#ax = sns.catplot(y="Titulo", x = "Porcentagem (%)", kind="bar", data=df_ocupation)

#plt.show()

## Clustering

***há agrupamentos nos dados de perfil materno?***

### Preprocessing

In [None]:
# Columns that make up the maternal profile.
perfil_materno = [
    "ESCMAE",
    "RACACOR",
    "CODOCUPMAE",
]

In [None]:
# Keep only the rows whose RACACOR is 'Parda', 'Preta' or 'Branca'.
data_cluster = data[data['RACACOR'].isin(['Parda', 'Preta', 'Branca'])]

# Renumber the rows 0..n-1. drop=True discards the old index directly instead
# of materialising it as a column and then dropping a column assumed to be
# called "index", which raises KeyError whenever the index carries a name.
data_cluster = data_cluster.reset_index(drop=True)

In [None]:
# Merge 'Preta' and 'Parda' into a single 'Preta/Parda' level.
# The identity entry for 'Preta/Parda' keeps this cell safe to re-run: without
# it, a second execution would map the already-merged label to NaN.
data_cluster['RACACOR'] = data_cluster['RACACOR'].map({
    'Branca': 'Branca',
    'Preta': 'Preta/Parda',
    'Parda': 'Preta/Parda',
    'Preta/Parda': 'Preta/Parda',
})

raca_dtype = pd.CategoricalDtype(categories=['Branca', 'Preta/Parda'],
                                 ordered=False)

# astype returns a new Series, so assign it back; otherwise the categorical
# dtype is only displayed and never stored in the frame.
data_cluster['RACACOR'] = data_cluster['RACACOR'].astype(raca_dtype)

data_cluster['RACACOR']

In [None]:
# Keep only rows whose occupation appears in at least one of the three
# frequency reports (whole dataset, 'Branca' stratum, pooled 'Preta'/'Parda').
table1 = frequency_report(data, 'CODOCUPMAE')
table2 = frequency_report(data_branca, 'CODOCUPMAE')
table3 = frequency_report(data_pp, 'CODOCUPMAE')

# Concatenate the reported titles, then de-duplicate them through a set.
table = [*table1["Titulo"], *table2["Titulo"], *table3["Titulo"]]
top_ocupations = list(set(table))

is_top_ocupation = data_cluster['CODOCUPMAE'].isin(top_ocupations)
data_cluster = data_cluster[is_top_ocupation]

### Sampling, Distance Matrix and K-Medoid

In [689]:
# Fixed seed so the same 500 rows (and every distance matrix and clustering
# derived from them) are drawn on every run of the notebook.
SAMPLE_SEED = 42

samp = data_cluster.sample(n=500, random_state=SAMPLE_SEED)
# reset_index() without drop keeps the original row labels as a column.
samp = samp.reset_index()

# Composition of the sample by RACACOR level.
samp['RACACOR'].value_counts()

Branca         277
Preta/Parda    223
Name: RACACOR, dtype: int64

In [632]:
import time
import Categorical_similarity_measures as sim
import kmedoids

from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist, jaccard

def sim_to_dist(x):
    """Convert a similarity into a dissimilarity by taking its complement, ``1 - x``.

    ``x`` may be anything that supports subtraction from an integer, such as a
    scalar, a NumPy array or a pandas object; the result has the same kind.
    """
    complement = 1 - x
    return complement

#### Jaccard distance

In [1071]:
from sklearn import preprocessing
# Encode the profile columns as integer labels before running the Jaccard step.

def _fit_and_encode(column):
    """Fit a LabelEncoder on ``column``.

    Returns the fitted encoder, a dict mapping each class to its integer code,
    and the encoded values of ``column``.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(column)
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    return encoder, mapping, encoder.transform(column)


# Each call overwrites the column in `samp` with its integer codes and keeps
# the encoder so the original labels can be restored afterwards.
ocupationenc, enc_ocupation_mapping, samp['CODOCUPMAE'] = _fit_and_encode(samp['CODOCUPMAE'])
racialenc, enc_racial_mapping, samp['RACACOR'] = _fit_and_encode(samp['RACACOR'])
escolenc, enc_escolaridade_mapping, samp['ESCMAE'] = _fit_and_encode(samp['ESCMAE'])


In [1072]:
# Pairwise 'jaccard' distance matrix over the label-encoded profile columns.
# NOTE(review): SciPy documents the 'jaccard' metric for boolean vectors, while
# the input here is integer label codes. Confirm this is the intended
# treatment, in particular how rows sharing code 0 are compared.

start = time.time()  # start timer
condensed_jaccard = pdist(samp[perfil_materno], 'jaccard')
# Expand the condensed vector into a square matrix labelled by samp's index.
distmat_jaccard = pd.DataFrame(
    squareform(condensed_jaccard),
    index=samp.index,
    columns=samp.index,
)
end = time.time()  # end timer

print("time elapsed: ", (end - start)/60, " minutos")

time elapsed:  0.00013199647267659505  minutos


In [1073]:
# Restore the original labels of the columns that were integer-encoded.
for column, encoder in (
    ('CODOCUPMAE', ocupationenc),
    ('RACACOR', racialenc),
    ('ESCMAE', escolenc),
):
    samp[column] = list(encoder.inverse_transform(samp[column]))

#### Overlap similarity

**Otimizar o cálculo do overlap** para aumentar a amostragem!

In [1064]:
# Overlap similarity over the profile columns, converted to a distance matrix.
# (sim.Overlap presumably returns a square pairwise similarity matrix; confirm
# against the Categorical_similarity_measures module.)

start = time.time()  # start timer

# Similarity as returned by the library, plus a DataFrame view of it.
similarity = sim.Overlap(samp[perfil_materno])
similarity_df = pd.DataFrame(similarity)

# Dissimilarity: apply 1 - similarity column by column, as a NumPy array.
distmat_overlap = np.array(pd.DataFrame(similarity).apply(sim_to_dist))
# Force the diagonal to exactly 0.
np.fill_diagonal(distmat_overlap, 0)
distmat_overlap = pd.DataFrame(distmat_overlap)

end = time.time()  # end timer

print("time elapsed: ", (end - start)/60, " minutos")

time elapsed:  0.2537096301714579  minutos


#### Clustering

In [1074]:
# FasterPAM k-medoids on the Jaccard distance matrix
# (second argument 4, at most 10 iterations, fixed random_state).
start = time.time()  # start timer
jaccard_c = kmedoids.fasterpam(
    distmat_jaccard,
    4,
    max_iter=10,
    random_state=5,
)
end = time.time()  # end timer (start/end are recorded but not printed in this cell)

# Print the iteration count; the loss is the cell's displayed value.
print(jaccard_c.n_iter, "\n")
jaccard_c.loss

4 



191.66666666666703

In [1066]:
# FasterPAM k-medoids on the overlap distance matrix
# (second argument 4, at most 10 iterations, fixed random_state).
start = time.time()  # start timer
overlap_c = kmedoids.fasterpam(
    distmat_overlap,
    4,
    max_iter=10,
    random_state=2,
)
end = time.time()  # end timer (start/end are recorded but not printed in this cell)

# Print the iteration count; the loss is the cell's displayed value.
print(overlap_c.n_iter, "\n")
overlap_c.loss

3 



0.9999999999999998

**save** result in a dataframe

In [1076]:
# Profile columns shared by both result tables, under display names.
profile_columns = {
    'Ocupação': samp['CODOCUPMAE'],
    'Escolaridade': samp['ESCMAE'],
    'Raça': samp['RACACOR'],
}

# Cluster assignment from the Jaccard-based run.
cluster_jaccard = pd.DataFrame(data={**profile_columns, 'Cluster': jaccard_c.labels})

# Cluster assignment from the overlap-based run.
cluster_overlap = pd.DataFrame(data={**profile_columns, 'Cluster': overlap_c.labels})


In [1077]:
# Show the Jaccard result table ordered by cluster label.
cluster_jaccard.sort_values('Cluster')

Unnamed: 0,Ocupação,Escolaridade,Raça,Cluster
249,Professor da educação de jovens e adultos do ...,8 a 11 anos,Preta/Parda,0
245,Operador de telemarketing ativo e receptivo,8 a 11 anos,Preta/Parda,0
254,Vendedor de comércio varejista,4 a 7 anos,Preta/Parda,0
256,Empregado doméstico nos serviços gerais,8 a 11 anos,Preta/Parda,0
257,Operador de caixa,8 a 11 anos,Preta/Parda,0
...,...,...,...,...
227,Cabeleireiro,12 e mais,Preta/Parda,3
231,Recepcionista,12 e mais,Preta/Parda,3
379,Manicure,12 e mais,Preta/Parda,3
458,Auxiliar de enfermagem,12 e mais,Preta/Parda,3


### Results evaluation

Apresentar o **silhouette plot** para avaliação da qualidade da clusterização!

Ver as características de cada cluster por **tabela**, **barplot** e heatmap da **matriz de distâncias organizada por cluster**!

In [1079]:
# Overlap distance matrix with rows and columns grouped by cluster.
reorder = cluster_overlap.sort_values('Cluster').index

# Apply the same cluster-sorted order to both axes.
distmat_overlap = distmat_overlap.reindex(index=reorder, columns=reorder)

#new_id = list(cluster_overlap.sort_values(by=['Cluster'])['Cluster'])

#distmat_overlap.index.name = new_id
#distmat_overlap.columns.name = new_id

sns.heatmap(distmat_overlap)

plt.show()

In [1078]:
# Jaccard distance matrix with rows and columns grouped by cluster.
reorder = cluster_jaccard.sort_values('Cluster').index

# Apply the same cluster-sorted order to both axes.
distmat_jaccard = distmat_jaccard.reindex(index=reorder, columns=reorder)

sns.heatmap(distmat_jaccard)

plt.show()

In [1021]:
from sklearn.metrics import silhouette_samples, silhouette_score
# silhouette plot (distance matrix + cluster id)


#view cluster details: table and barplot