In [None]:
#!pip install pandas seaborn scikit-learn matplotlib scipy ipywidgets ipykernel optuna streamlit

In [None]:
#!pip install plotly

In [5]:
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.cluster import AgglomerativeClustering, BisectingKMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
from scipy.cluster.hierarchy import dendrogram, linkage, cut_tree 

import optuna

In [6]:
# Load the laptops dataset from a path relative to the notebook's working directory
df = pd.read_csv('./datasets/laptops.csv')

In [7]:
# Column names, non-null counts and dtypes of the raw data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 991 entries, 0 to 990
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   index                     991 non-null    int64  
 1   brand                     991 non-null    object 
 2   model                     991 non-null    object 
 3   price                     991 non-null    float64
 4   rating                    991 non-null    int64  
 5   processor_brand           991 non-null    object 
 6   processor_tier            991 non-null    object 
 7   num_cores                 991 non-null    int64  
 8   num_threads               991 non-null    int64  
 9   ram_memory                991 non-null    int64  
 10  primary_storage_type      991 non-null    object 
 11  primary_storage_capacity  991 non-null    int64  
 12  gpu_brand                 991 non-null    object 
 13  gpu_type                  991 non-null    object 
 14  is_touch_s

In [8]:
# Number of missing values in each column
df.isna().sum()

index                       0
brand                       0
model                       0
price                       0
rating                      0
processor_brand             0
processor_tier              0
num_cores                   0
num_threads                 0
ram_memory                  0
primary_storage_type        0
primary_storage_capacity    0
gpu_brand                   0
gpu_type                    0
is_touch_screen             0
display_size                0
resolution_width            0
resolution_height           0
os                          0
year_of_warranty            0
dtype: int64

In [9]:
# Summary statistics of the numeric columns
df.describe()

Unnamed: 0,index,price,rating,num_cores,num_threads,ram_memory,primary_storage_capacity,display_size,resolution_width,resolution_height
count,991.0,991.0,991.0,991.0,991.0,991.0,991.0,991.0,991.0,991.0
mean,505.860747,926.687487,63.931382,8.128153,12.191726,13.047427,610.938446,15.171241,2003.503532,1181.227043
std,287.899458,688.239764,10.190575,4.215499,5.585115,5.591188,266.927666,0.938089,361.965292,263.884019
min,1.0,117.54,24.0,2.0,0.0,2.0,32.0,10.1,1080.0,768.0
25%,258.5,522.85,58.0,6.0,8.0,8.0,512.0,14.0,1920.0,1080.0
50%,507.0,742.39,64.0,8.0,12.0,16.0,512.0,15.6,1920.0,1080.0
75%,754.5,1070.35,71.0,10.0,16.0,16.0,512.0,15.6,1920.0,1200.0
max,1002.0,5450.88,89.0,24.0,32.0,36.0,2048.0,18.0,3840.0,2560.0


In [None]:
# Distinct values found in the gpu_type column
df['gpu_type'].unique()

In [None]:
# Distinct values found in the gpu_brand column
df['gpu_brand'].unique()

In [None]:
# Distinct values found in the os column
df['os'].unique()

In [None]:
# How many rows each os value has
df['os'].value_counts()

In [None]:
# First rows of the dataset
df.head()

In [None]:
# Last rows of the dataset
df.tail()

In [None]:
# Distinct model names among rows whose os is 'mac'
df.loc[df['os'] == 'mac', 'model'].unique()

In [None]:
# Model names of the rows whose os is 'other'
df.loc[df['os'] == 'other', 'model']

In [None]:
# Row count per operating system
sns.histplot(data=df, x='os')

In [None]:
# Distribution of the price column
sns.histplot(data=df, x='price')

In [None]:
# Distribution of the rating column with a KDE curve overlaid
sns.histplot(data=df, x='rating', kde=True, color='red')

In [None]:
# Price against number of cores
sns.scatterplot(data=df, x='price', y='num_cores')

In [None]:
# Distinct values found in year_of_warranty before cleaning
df['year_of_warranty'].unique()

In [None]:
# Replace the "No information" placeholder with 1, then store the column as integers
df.loc[df['year_of_warranty'].eq("No information"), 'year_of_warranty'] = 1
df['year_of_warranty'] = df['year_of_warranty'].astype(int)

In [None]:
# Re-check dtypes after converting year_of_warranty
df.info()

In [None]:
# Distinct values found in year_of_warranty after the conversion
df['year_of_warranty'].unique()

In [None]:
# Store the touch-screen flag as integers
# (presumably a boolean column becoming 0/1 — confirm the source dtype)
df['is_touch_screen'] = df['is_touch_screen'].astype(int)

In [None]:
# Re-check dtypes after converting is_touch_screen
df.info()

In [None]:
# Share of rows per brand, expressed as a percentage of the whole dataset
percentual_brand = df.value_counts('brand') / len(df) * 100
plt.figure(figsize=[20, 10])
barplot = sns.barplot(percentual_brand, palette='viridis')

# Label every bar with its share rounded to one decimal place; printing the
# raw float (15+ digits) is unreadable and overlaps the neighbouring bars.
for p in barplot.patches:
    barplot.annotate(f'{p.get_height():.1f}%',
                     (p.get_x() + p.get_width() / 2., p.get_height()),
                     ha='center', va='bottom',
                     fontsize=12, color='black')

plt.show()

In [None]:
# Price spread per brand, one horizontal box per brand
plt.figure(figsize=(10, 10))
sns.boxplot(data=df, x='price', y='brand')

In [None]:
# Distinct prices among rows whose brand is 'apple'
df.loc[df['brand'] == 'apple', 'price'].unique()

In [None]:
# Price distribution restricted to rows whose brand is 'apple'
sns.histplot(data=df.loc[df['brand'] == 'apple', 'price'])

In [None]:
# Price against rating, coloured by brand
plt.figure(figsize=(10, 10))
sns.scatterplot(data=df, x='price', y='rating', hue='brand')

## Treinar modelo

In [None]:
# Modelling frame: a new DataFrame without the 'index' and 'model' columns
# (df itself is left untouched)
X = df.drop(columns=['index', 'model'])

In [None]:
# Columns and dtypes that remain available for modelling
X.info()

In [None]:
# Columns routed to the numeric transformer
numeric_features = [
    'price',
    'rating',
    'num_cores',
    'num_threads',
    'ram_memory',
    'primary_storage_capacity',
    'display_size',
    'resolution_width',
    'resolution_height',
]

# Columns routed to the categorical transformer
# NOTE(review): processor_tier, gpu_type, is_touch_screen and year_of_warranty
# appear in neither list, and the ColumnTransformer is built without a
# `remainder` argument, so those columns never reach the model — confirm
# that this is intended.
categorical_features = [
    'brand',
    'processor_brand',
    'primary_storage_type',
    'gpu_brand',
    'os',
]

In [None]:
# Transformers with default settings: a scaler for the numeric columns and a
# one-hot encoder for the categorical ones.
# NOTE: the second name is spelled `categorical_tranformer`; the
# ColumnTransformer cell refers to it by exactly that spelling.
numeric_transformer = StandardScaler()
categorical_tranformer = OneHotEncoder()

In [None]:
# Apply each transformer to its own column list. No `remainder` is given,
# so only the listed columns are passed on.
preprocessor = ColumnTransformer(transformers=[
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_tranformer, categorical_features),
])

In [None]:
# Fit the preprocessor on X and keep the encoded feature matrix.
# Later cells call `.toarray()` on this result, i.e. they expect it to be sparse.
X_transformed = preprocessor.fit_transform(X)

In [None]:
# Show the repr of the encoded feature matrix
X_transformed

In [None]:
def hierarchical_aglomerative_objective(trial):
  """Optuna objective: silhouette score of one agglomerative clustering run.

  Samples ``n_clusters`` (10 to 150) and the ``linkage`` criterion from
  ``trial``, clusters the dense form of the module-level ``X_transformed``
  and returns the resulting mean silhouette coefficient.
  """
  cluster_count = trial.suggest_int('n_clusters', 10, 150)
  # Linkage = distance criterion between two sets used to form the clusters:
  #   ward     -> variance
  #   average  -> mean
  #   complete -> maximum
  #   single   -> minimum
  # The local name differs from `linkage` so the scipy function imported at
  # the top of the notebook is not shadowed inside this function.
  linkage_choice = trial.suggest_categorical('linkage', ['ward', 'average', 'complete', 'single'])

  # Build the model, fit it on the dense matrix and collect one label per row
  labels = AgglomerativeClustering(
    linkage=linkage_choice,
    n_clusters=cluster_count,
  ).fit_predict(X_transformed.toarray())

  # The silhouette score is the value returned to the study
  return silhouette_score(X_transformed, labels)

In [None]:
# Exhaustive grid for the agglomerative study: every n_clusters from 10 to 150
# combined with every linkage criterion
search_space_ag = {
  'n_clusters': range(10, 151),
  'linkage': ['ward', 'average', 'complete', 'single'],
}
sampler_ag = optuna.samplers.GridSampler(search_space=search_space_ag)

# Create the Optuna study; higher objective values are better
estudo_ag = optuna.create_study(sampler=sampler_ag, direction='maximize')

In [None]:
# Run the study
# NOTE(review): the grid holds 141 x 4 = 564 combinations, fewer than
# n_trials=600. Optuna documents that GridSampler stops the study once the
# grid is exhausted — confirm for the installed version.
estudo_ag.optimize(hierarchical_aglomerative_objective, n_trials=600)

In [None]:
# Parameters of the best-scoring trial of the agglomerative study
best_params_ag = estudo_ag.best_params
best_params_ag

In [None]:
def hierarchical_divisive_objective(trial):
  """Optuna objective: silhouette score of one BisectingKMeans run.

  Samples ``n_clusters`` (10 to 150) from ``trial``, clusters the dense form
  of the module-level ``X_transformed`` and returns the mean silhouette
  coefficient of the resulting labels.
  """
  n_clusters = trial.suggest_int('n_clusters', 10, 150)

  # Instantiate the model with a fixed seed: BisectingKMeans starts from
  # randomly chosen centroids, so without it the same n_clusters can score
  # differently on every run and the study's best parameters are not
  # reproducible.
  hierarchical_model = BisectingKMeans(n_clusters=n_clusters, random_state=42)

  # Fit the model and assign a cluster to every row
  y = hierarchical_model.fit_predict(X_transformed.toarray())

  # Compute the silhouette score
  silhouette_avg = silhouette_score(X_transformed, y)

  return silhouette_avg

In [None]:
# Exhaustive grid for the divisive study: every n_clusters from 10 to 150
search_space_di = {
  'n_clusters': range(10, 151),
}
sampler_di = optuna.samplers.GridSampler(search_space=search_space_di)

# Create the Optuna study; higher objective values are better
estudo_di = optuna.create_study(sampler=sampler_di, direction='maximize')

In [None]:
# Run the study
# NOTE(review): the grid holds 141 values, fewer than n_trials=200. Optuna
# documents that GridSampler stops the study once the grid is exhausted —
# confirm for the installed version.
estudo_di.optimize(hierarchical_divisive_objective, n_trials=200)

In [None]:
# Parameters of the best-scoring trial of the divisive study
best_params_di = estudo_di.best_params
best_params_di

In [None]:
# Build the final model from the parameters found by the agglomerative study
# instead of values copied by hand, so it stays in sync with the search and
# with the dendrogram cell, which also reads best_params_ag.
best_model = AgglomerativeClustering(
  n_clusters=best_params_ag['n_clusters'],
  linkage=best_params_ag['linkage']
)

In [None]:
# Fit the final model on the dense form of the feature matrix
best_model.fit(X_transformed.toarray())

In [None]:
# Number of labels produced by the fitted model
len(best_model.labels_)

In [None]:
# Silhouette score of the final clustering
best_score = silhouette_score(X_transformed, best_model.labels_)
best_score

In [None]:
# Attach the cluster labels to the original dataset
df['clusters'] = best_model.labels_

In [None]:
# Confirm that the new 'clusters' column is present
df.info()

In [None]:
# First rows, now including the assigned cluster
df.head()

In [None]:
# Rows assigned to cluster 35
df.loc[df['clusters'] == 35]

In [None]:
# Rows assigned to cluster 130
df.loc[df['clusters'] == 130]

In [None]:
import plotly.express as px
import plotly.graph_objects as go  

In [None]:
# Environment setup rather than analysis: install nbformat once from a terminal.
# Kept commented out, like the install commands at the top of the notebook,
# so that running every cell does not shell out to the package manager.
#!pipenv install nbformat

In [None]:
# Linkage matrix for the dendrogram, built with the linkage method chosen by
# the agglomerative study
modelo_de = linkage(
  X_transformed.toarray(),
  method=best_params_ag['linkage'],
  optimal_ordering=True,
)
# Number of rows that were clustered
X_transformed.shape[0]

In [None]:
# Dendrogram of the linkage matrix, truncated with truncate_mode='lastp' and p=50
fig, ax = plt.subplots(figsize=(30, 18))
dendrogram(
  modelo_de,
  truncate_mode='lastp',
  p=50,
  leaf_rotation=90,
  leaf_font_size=10,
  ax=ax,
)
ax.set_title('Dendrogram Aglomerative Hierarchical Clustering')
ax.set_xlabel('Tamanho do Cluster')
ax.set_ylabel('Distância')
plt.show()