In [None]:
# Mineria de datos
# Laboratorio 2
# Anggelie Velasquez 221181
# Anthony Lou 23410
# Isabella Obando 23074

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist
from sklearn.neighbors import NearestNeighbors

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_csv("movies_2026.csv", encoding="latin1")
df.head()

Unnamed: 0,id,budget,genres,homePage,productionCompany,productionCompanyCountry,productionCountry,revenue,runtime,video,...,releaseDate,voteAvg,voteCount,genresAmount,productionCoAmount,productionCountriesAmount,actorsAmount,castWomenAmount,castMenAmount,releaseYear
0,1627085,0.0,Drama|Crime,,,,,0.0,95,False,...,2026-02-01,0.0,0,2,0,0,8,2.0,5.0,2026.0
1,1626914,0.0,Animation,,,,,0.0,3,False,...,2026-02-01,0.0,0,1,0,0,4,0.0,0.0,2026.0
2,1626898,0.0,Animation,,,,,0.0,2,False,...,2026-02-01,0.0,0,1,0,0,3,0.0,0.0,2026.0
3,1626808,0.0,Thriller|Mystery|Documentary,,,,,0.0,5,False,...,2026-02-01,0.0,0,3,0,0,7,0.0,0.0,2026.0
4,1626678,0.0,Animation,,,,,0.0,12,False,...,2026-02-01,0.0,0,1,0,0,3,0.0,0.0,2026.0


In [4]:
df.columns

Index(['id', 'budget', 'genres', 'homePage', 'productionCompany',
       'productionCompanyCountry', 'productionCountry', 'revenue', 'runtime',
       'video', 'director', 'actors', 'actorsPopularity', 'actorsCharacter',
       'originalTitle', 'title', 'originalLanguage', 'popularity',
       'releaseDate', 'voteAvg', 'voteCount', 'genresAmount',
       'productionCoAmount', 'productionCountriesAmount', 'actorsAmount',
       'castWomenAmount', 'castMenAmount', 'releaseYear'],
      dtype='str')

In [5]:
df = pd.read_csv("movies_2026.csv", encoding="latin1")

df_cluster = df.drop(columns=[
    'id', 'genres', 'homePage', 'productionCompany',
    'productionCompanyCountry', 'productionCountry',
    'director', 'actors', 'actorsCharacter',
    'originalTitle', 'title', 'originalLanguage',
    'releaseDate', 'video',
    'budget', 'revenue',
    'actorsAmount', 'castWomenAmount', 'castMenAmount', 'actorsPopularity'
], errors='ignore')

df_cluster = df_cluster.select_dtypes(include=['int64','float64'])

In [6]:
df_cluster.isnull().sum()

runtime                      0
popularity                   0
voteAvg                      0
voteCount                    0
genresAmount                 0
productionCoAmount           0
productionCountriesAmount    0
releaseYear                  2
dtype: int64

In [7]:
df_cluster = df_cluster.replace([np.inf, -np.inf], np.nan)
df_cluster = df_cluster.fillna(df_cluster.median(numeric_only=True))

In [8]:
df_cluster.drop(columns=['budget','revenue'], errors='ignore', inplace=True)

In [9]:
# Reemplazar ceros en runtime con la mediana
median_runtime = df_cluster.loc[df_cluster['runtime'] > 0, 'runtime'].median()
df_cluster.loc[df_cluster['runtime'] == 0, 'runtime'] = median_runtime

Se eliminaron las columnas de budget y revenue ya que tenian muchos datos en 0, no se van a tomar en cuenta en el clustering

In [10]:
# Transformaciones logaritmicas para reducir sesgo
df_cluster['popularity'] = np.log1p(df_cluster['popularity'])
df_cluster['voteCount'] = np.log1p(df_cluster['voteCount'])
df_cluster['productionCoAmount'] = np.log1p(df_cluster['productionCoAmount'])
df_cluster['productionCountriesAmount'] = np.log1p(df_cluster['productionCountriesAmount'])

In [11]:
zeros_percentage = (df_cluster == 0).mean() * 100
zeros_percentage = zeros_percentage.sort_values(ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=zeros_percentage.values, y=zeros_percentage.index)
plt.axvline(x=20, color='red', linestyle='--', label='Umbral 20%')
plt.xlabel("Porcentaje de valores en 0 (%)")
plt.ylabel("Variables")
plt.title("Porcentaje de valores en 0 por variable")
plt.legend()
plt.show()

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_cluster)

In [13]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

def hopkins(X):
    n, d = X.shape
    m = int(0.05 * n)
    nbrs = NearestNeighbors(n_neighbors=2).fit(X)
    rand_X = np.random.uniform(np.min(X, axis=0), np.max(X, axis=0), (m, d))
    u_dist, w_dist = [], []
    for j in range(m):
        u = rand_X[j].reshape(1, -1)
        w = X[np.random.randint(0, n)].reshape(1, -1)
        u_dist.append(nbrs.kneighbors(u, 2, return_distance=True)[0][0][1])
        w_dist.append(nbrs.kneighbors(w, 2, return_distance=True)[0][0][1])
    return np.sum(u_dist) / (np.sum(u_dist) + np.sum(w_dist))

print(f"Estadístico de Hopkins: {hopkins(X_scaled):.4f}")

Estadístico de Hopkins: 0.9687


In [14]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

inertia = []
K_range = range(2, 11)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

plt.figure(figsize=(8,5))
plt.plot(K_range, inertia, marker='o')
plt.xlabel("Número de Clusters")
plt.ylabel("Inercia")
plt.title("Método del Codo")
plt.show()

In [15]:
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)
df_cluster['Cluster'] = clusters

In [16]:
from sklearn.metrics import silhouette_score

sil_score = silhouette_score(X_scaled, clusters)
print(f"Silhouette Score (K=3): {sil_score:.4f}")

Silhouette Score (K=3): 0.3282


In [17]:
df_cluster.groupby("Cluster").mean()

Unnamed: 0_level_0,runtime,popularity,voteAvg,voteCount,genresAmount,productionCoAmount,productionCountriesAmount,releaseYear
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,14.291979,0.105104,0.834917,0.086877,1.198105,0.29253,0.4234,2025.169017
1,101.655874,3.310728,6.509777,5.900776,2.613327,1.30927,0.884092,2008.801185
2,93.020518,0.242783,1.257823,0.189858,1.312943,0.495228,0.480952,2025.160307


In [18]:
from sklearn.decomposition import PCA

pca_viz = PCA(n_components=2)
X_pca_viz = pca_viz.fit_transform(X_scaled)

print("Varianza explicada:", pca_viz.explained_variance_ratio_)
print("Varianza acumulada:", sum(pca_viz.explained_variance_ratio_))

Varianza explicada: [0.58052619 0.10184872]
Varianza acumulada: 0.6823749150275165


In [19]:
plt.figure(figsize=(8,6))
sns.scatterplot(
    x=X_pca_viz[:,0], y=X_pca_viz[:,1],
    hue=df_cluster["Cluster"], palette="Set1"
)
plt.title("Visualización de Clusters usando PCA")
plt.xlabel("Componente Principal 1")
plt.ylabel("Componente Principal 2")
plt.legend()
plt.show()

### Matriz de Correlación

In [20]:
# Trabajamos con las variables originales (sin la columna Cluster)
df_pca = df_cluster.drop(columns=['Cluster'])

corr_matrix = df_pca.corr()

plt.figure(figsize=(10, 8))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(
    corr_matrix, annot=True, fmt='.3f', cmap='RdYlGn',
    center=0, vmin=-1, vmax=1, square=True,
    linewidths=0.5, cbar_kws={'label': 'Correlación de Pearson'}
)
plt.title('Matriz de Correlación - Variables de Películas', fontsize=14, pad=15)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

print("\nCorrelación completa:")
print(corr_matrix.round(3))


Correlación completa:
                           runtime  popularity  voteAvg  voteCount  \
runtime                      1.000       0.519    0.474      0.550   
popularity                   0.519       1.000    0.759      0.885   
voteAvg                      0.474       0.759    1.000      0.784   
voteCount                    0.550       0.885    0.784      1.000   
genresAmount                 0.308       0.519    0.475      0.535   
productionCoAmount           0.481       0.654    0.579      0.684   
productionCountriesAmount    0.300       0.451    0.387      0.420   
releaseYear                 -0.386      -0.544   -0.538     -0.666   

                           genresAmount  productionCoAmount  \
runtime                           0.308               0.481   
popularity                        0.519               0.654   
voteAvg                           0.475               0.579   
voteCount                         0.535               0.684   
genresAmount                   

### Prueba KMO y Prueba de Esfericidad de Bartlett

In [21]:
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity, calculate_kmo

# KMO
kmo_all, kmo_model = calculate_kmo(df_pca)
print("=== Prueba de KMO ===")
print(f"KMO Global: {kmo_model:.4f}")
print("\nKMO por variable:")
for col, val in zip(df_pca.columns, kmo_all):
    print(f"  {col:35s}: {val:.4f}")

# Bartlett
chi2, p = calculate_bartlett_sphericity(df_pca)
print(f"\n=== Prueba de Esfericidad de Bartlett ===")
print(f"Chi-cuadrado: {chi2:.2f}")
print(f"p-valor:      {p:.2e}")
print(f"Resultado:    {'Se rechaza H0 - PCA es adecuado' if p < 0.05 else 'No se rechaza H0'}")

=== Prueba de KMO ===
KMO Global: 0.8898

KMO por variable:
  runtime                            : 0.9681
  popularity                         : 0.8559
  voteAvg                            : 0.9512
  voteCount                          : 0.8100
  genresAmount                       : 0.9716
  productionCoAmount                 : 0.9132
  productionCountriesAmount          : 0.9101
  releaseYear                        : 0.8698

=== Prueba de Esfericidad de Bartlett ===
Chi-cuadrado: 97505.58
p-valor:      0.00e+00
Resultado:    Se rechaza H0 - PCA es adecuado


### Componentes Principales — Eigenvalores y Varianza Explicada

In [22]:
from sklearn.decomposition import PCA

scaler_pca = StandardScaler()
X_pca_scaled = scaler_pca.fit_transform(df_pca)

pca_full = PCA()
pca_full.fit(X_pca_scaled)

eigenvalues = pca_full.explained_variance_
variance_ratio = pca_full.explained_variance_ratio_
cumulative = np.cumsum(variance_ratio)

# Tabla resumen
summary = pd.DataFrame({
    'Eigenvalor': eigenvalues.round(4),
    'Varianza (%)': (variance_ratio * 100).round(2),
    'Varianza Acumulada (%)': (cumulative * 100).round(2)
}, index=[f'PC{i+1}' for i in range(len(eigenvalues))])
print(summary)

# Scree plot
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Eigenvalores
axes[0].bar(range(1, len(eigenvalues)+1), eigenvalues, color='steelblue', alpha=0.8, edgecolor='black')
axes[0].axhline(y=1, color='red', linestyle='--', linewidth=1.5, label='Criterio Kaiser (λ=1)')
axes[0].set_xlabel('Componente Principal')
axes[0].set_ylabel('Eigenvalor')
axes[0].set_title('Scree Plot — Eigenvalores')
axes[0].legend()
axes[0].set_xticks(range(1, len(eigenvalues)+1))

# Varianza acumulada
axes[1].plot(range(1, len(cumulative)+1), cumulative*100, marker='o', color='steelblue', linewidth=2)
axes[1].axhline(y=70, color='orange', linestyle='--', label='70% varianza')
axes[1].axhline(y=80, color='red', linestyle='--', label='80% varianza')
axes[1].set_xlabel('Número de Componentes')
axes[1].set_ylabel('Varianza Acumulada (%)')
axes[1].set_title('Varianza Acumulada Explicada')
axes[1].legend()
axes[1].set_xticks(range(1, len(cumulative)+1))
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

     Eigenvalor  Varianza (%)  Varianza Acumulada (%)
PC1      4.6444         58.05                   58.05
PC2      0.8148         10.18                   68.24
PC3      0.6928          8.66                   76.90
PC4      0.6157          7.70                   84.59
PC5      0.5026          6.28                   90.88
PC6      0.3833          4.79                   95.67
PC7      0.2522          3.15                   98.82
PC8      0.0945          1.18                  100.00


### Coeficientes (Cargas) de los Componentes Principales

In [23]:
# Cargas de los 3 primeros componentes
n_components = 3
loadings = pd.DataFrame(
    pca_full.components_[:n_components].T,
    index=df_pca.columns,
    columns=[f'PC{i+1}' for i in range(n_components)]
)

print("Cargas (loadings) de los 3 componentes principales:")
print(loadings.round(4))

# Heatmap de cargas
plt.figure(figsize=(8, 6))
sns.heatmap(
    loadings, annot=True, fmt='.3f', cmap='RdYlGn',
    center=0, vmin=-1, vmax=1, linewidths=0.5,
    cbar_kws={'label': 'Carga'}
)
plt.title('Cargas de los Componentes Principales (PC1–PC3)', fontsize=13)
plt.ylabel('Variable original')
plt.tight_layout()
plt.show()

# Biplot PC1 vs PC2
pca_3 = PCA(n_components=3)
X_pca_3 = pca_3.fit_transform(X_pca_scaled)

plt.figure(figsize=(9, 7))
plt.scatter(X_pca_3[:, 0], X_pca_3[:, 1], alpha=0.3, s=5, color='steelblue')

scale = 3.5
for i, var in enumerate(df_pca.columns):
    plt.arrow(0, 0,
              pca_3.components_[0, i] * scale,
              pca_3.components_[1, i] * scale,
              head_width=0.08, head_length=0.05, fc='red', ec='red')
    plt.text(pca_3.components_[0, i] * scale * 1.15,
             pca_3.components_[1, i] * scale * 1.15,
             var, fontsize=9, color='darkred', ha='center')

plt.xlabel(f'PC1 ({variance_ratio[0]*100:.1f}%)', fontsize=11)
plt.ylabel(f'PC2 ({variance_ratio[1]*100:.1f}%)', fontsize=11)
plt.title('Biplot PCA — PC1 vs PC2', fontsize=13)
plt.axhline(0, color='gray', linewidth=0.5)
plt.axvline(0, color='gray', linewidth=0.5)
plt.tight_layout()
plt.show()

Cargas (loadings) de los 3 componentes principales:
                              PC1     PC2     PC3
runtime                    0.3044  0.0009 -0.6476
popularity                 0.4179 -0.0442  0.0129
voteAvg                    0.3914 -0.1318  0.0037
voteCount                  0.4325 -0.1532 -0.0289
genresAmount               0.2978 -0.1370  0.7488
productionCoAmount         0.3645  0.2958  0.0263
productionCountriesAmount  0.2624  0.8071  0.0430
releaseYear               -0.3196  0.4467  0.1283


### Preparación de datos para reglas de asociación

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
from mlxtend.frequent_patterns import apriori, association_rules
from mlxtend.preprocessing import TransactionEncoder

# Usamos las variables de características de película, excluyendo releaseYear y productionCountriesAmount
df_assoc = df_cluster.drop(columns=['Cluster', 'releaseYear', 'productionCountriesAmount'])

# Discretizar en 3 bins por cuantiles
disc = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='quantile', subsample=None)
df_disc = pd.DataFrame(disc.fit_transform(df_assoc), columns=df_assoc.columns)

labels = {0: 'bajo', 1: 'medio', 2: 'alto'}
df_named = df_disc.copy()
for col in df_named.columns:
    df_named[col] = col + '_' + df_named[col].map(labels)

# Encoding para mlxtend
te = TransactionEncoder()
te_array = te.fit_transform(df_named.values.tolist())
df_onehot = pd.DataFrame(te_array, columns=te.columns_)

print(f"Dataset para reglas: {df_onehot.shape[0]} transacciones, {df_onehot.shape[1]} ítems")
print("\nÍtems disponibles:", list(df_onehot.columns))

Dataset para reglas: 19883 transacciones, 16 ítems

Ítems disponibles: ['genresAmount_alto', 'genresAmount_bajo', 'genresAmount_medio', 'popularity_alto', 'popularity_bajo', 'popularity_medio', 'productionCoAmount_alto', 'productionCoAmount_bajo', 'productionCoAmount_medio', 'runtime_alto', 'runtime_bajo', 'runtime_medio', 'voteAvg_bajo', 'voteAvg_medio', 'voteCount_bajo', 'voteCount_medio']


### Exploración de parámetros de soporte y confianza

In [25]:
# Prueba con distintos valores de soporte y confianza
resultados = []
for sup in [0.30, 0.25, 0.20]:
    freq = apriori(df_onehot, min_support=sup, use_colnames=True)
    for conf in [0.80, 0.75, 0.70]:
        rules = association_rules(freq, metric='confidence', min_threshold=conf)
        resultados.append({
            'Soporte mín.': sup,
            'Confianza mín.': conf,
            'Itemsets frecuentes': len(freq),
            'Reglas generadas': len(rules)
        })

print(pd.DataFrame(resultados).to_string(index=False))

 Soporte mín.  Confianza mín.  Itemsets frecuentes  Reglas generadas
         0.30            0.80                   20                 7
         0.30            0.75                   20                 9
         0.30            0.70                   20                 9
         0.25            0.80                   36                15
         0.25            0.75                   36                23
         0.25            0.70                   36                24
         0.20            0.80                   61                30
         0.20            0.75                   61                44
         0.20            0.70                   61                51


In [None]:
# Generación final de reglas
freq_items = apriori(df_onehot, min_support=0.25, use_colnames=True)
rules = association_rules(freq_items, metric='confidence', min_threshold=0.70)
rules = rules.sort_values(['lift', 'confidence'], ascending=False).reset_index(drop=True)

rules['antecedents'] = rules['antecedents'].apply(lambda x: ', '.join(sorted(x)))
rules['consequents'] = rules['consequents'].apply(lambda x: ', '.join(sorted(x)))

display_cols = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
print(rules[display_cols].round(4).to_string())

                        antecedents                   consequents  support  confidence    lift
0                   voteCount_medio       productionCoAmount_alto   0.2799      0.8391  1.8450
1                   popularity_bajo  voteAvg_bajo, voteCount_bajo   0.3105      0.9316  1.7492
2                   popularity_alto       productionCoAmount_alto   0.2527      0.7579  1.6664
3                      runtime_alto       productionCoAmount_alto   0.2514      0.7416  1.6305
4                   popularity_bajo                voteCount_bajo   0.3332      1.0000  1.5005
5           productionCoAmount_bajo                voteCount_bajo   0.2636      1.0000  1.5005
6     popularity_bajo, voteAvg_bajo                voteCount_bajo   0.3105      1.0000  1.5005
7                   voteCount_medio             genresAmount_alto   0.2891      0.8667  1.4881
8                      runtime_bajo  voteAvg_bajo, voteCount_bajo   0.2590      0.7888  1.4809
9        runtime_bajo, voteAvg_bajo               

In [None]:
plt.figure(figsize=(10, 6))
sc = plt.scatter(
    rules['support'], rules['confidence'],
    c=rules['lift'], cmap='YlOrRd', s=80, edgecolors='gray', linewidths=0.5
)
plt.colorbar(sc, label='Lift')
plt.xlabel('Soporte', fontsize=12)
plt.ylabel('Confianza', fontsize=12)
plt.title('Reglas de Asociación — Soporte vs Confianza (color = Lift)', fontsize=13)
plt.tight_layout()
plt.show()

# Top 10 por lift
print("\nTop 10 reglas por Lift:")
print(rules[display_cols].head(10).round(4).to_string(index=False))


Top 10 reglas por Lift:
                  antecedents                  consequents  support  confidence   lift
              voteCount_medio      productionCoAmount_alto   0.2799      0.8391 1.8450
              popularity_bajo voteAvg_bajo, voteCount_bajo   0.3105      0.9316 1.7492
              popularity_alto      productionCoAmount_alto   0.2527      0.7579 1.6664
                 runtime_alto      productionCoAmount_alto   0.2514      0.7416 1.6305
              popularity_bajo               voteCount_bajo   0.3332      1.0000 1.5005
      productionCoAmount_bajo               voteCount_bajo   0.2636      1.0000 1.5005
popularity_bajo, voteAvg_bajo               voteCount_bajo   0.3105      1.0000 1.5005
              voteCount_medio            genresAmount_alto   0.2891      0.8667 1.4881
                 runtime_bajo voteAvg_bajo, voteCount_bajo   0.2590      0.7888 1.4809
   runtime_bajo, voteAvg_bajo               voteCount_bajo   0.2590      0.9722 1.4588



### Otros Algoritmos de Aprendizaje No Supervisado — DBSCAN

In [None]:
from sklearn.cluster import DBSCAN

print(f"{'eps':>5} {'min_samples':>12} {'Clusters':>10} {'Ruido':>8} {'% Ruido':>9} {'Silhouette':>12}")
print("-" * 65)

for eps in [0.8, 1.0, 1.2]:
    for ms in [50, 100]:
        db = DBSCAN(eps=eps, min_samples=ms)
        labels_db = db.fit_predict(X_scaled)
        n_clusters = len(set(labels_db)) - (1 if -1 in labels_db else 0)
        n_noise = (labels_db == -1).sum()
        pct_noise = n_noise / len(labels_db) * 100
        if n_clusters >= 2:
            mask = labels_db != -1
            sil = silhouette_score(X_scaled[mask], labels_db[mask])
            print(f"{eps:>5} {ms:>12} {n_clusters:>10} {n_noise:>8} {pct_noise:>8.1f}% {sil:>12.4f}")
        else:
            print(f"{eps:>5} {ms:>12} {n_clusters:>10} {n_noise:>8} {pct_noise:>8.1f}%   {'N/A':>12}")

  eps  min_samples   Clusters    Ruido   % Ruido   Silhouette
-----------------------------------------------------------------
  0.8           50         14     4961     25.0%       0.1027
  0.8          100          9     7105     35.7%       0.1612
  1.0           50          5     1842      9.3%       0.1890
  1.0          100          4     2965     14.9%       0.2217
  1.2           50          3      649      3.3%       0.1249
  1.2          100          3     1188      6.0%       0.1719


In [None]:
db_final = DBSCAN(eps=1.0, min_samples=100)
labels_db = db_final.fit_predict(X_scaled)

df_pca['DBSCAN'] = labels_db

# Distribución de clusters
counts = pd.Series(labels_db).value_counts().sort_index()
print("Distribución de clusters DBSCAN:")
for idx, cnt in counts.items():
    label = 'Ruido' if idx == -1 else f'Cluster {idx}'
    print(f"  {label}: {cnt} películas ({cnt/len(labels_db)*100:.1f}%)")

mask = labels_db != -1
sil_db = silhouette_score(X_scaled[mask], labels_db[mask])
print(f"\nSilhouette Score (sin ruido): {sil_db:.4f}")

Distribución de clusters DBSCAN:
  Ruido: 2965 películas (14.9%)
  Cluster 0: 2484 películas (12.5%)
  Cluster 1: 2256 películas (11.3%)
  Cluster 2: 11460 películas (57.6%)
  Cluster 3: 718 películas (3.6%)

Silhouette Score (sin ruido): 0.2217


In [30]:
# Perfiles de cada cluster
print("Perfiles medios por cluster DBSCAN:")
profile = df_pca[df_pca['DBSCAN'] != -1].groupby('DBSCAN').mean()
print(profile.drop(columns=['DBSCAN'], errors='ignore').round(3))

Perfiles medios por cluster DBSCAN:
        runtime  popularity  voteAvg  voteCount  genresAmount  \
DBSCAN                                                          
0        52.397       0.075    0.100      0.022         0.939   
1        44.508       0.088    0.082      0.016         1.007   
2        91.239       2.383    4.878      4.328         2.231   
3        48.414       0.110    0.008      0.009         1.444   

        productionCoAmount  productionCountriesAmount  releaseYear  
DBSCAN                                                              
0                    0.000                      0.000     2025.254  
1                    0.000                      0.726     2025.228  
2                    1.183                      0.778     2014.766  
3                    0.772                      0.000     2025.234  


In [31]:
# Visualización DBSCAN con PCA 2D
pca_2d = PCA(n_components=2)
X_2d = pca_2d.fit_transform(X_scaled)

palette = {-1: 'lightgray', 0: '#e41a1c', 1: '#377eb8', 2: '#4daf4a', 3: '#984ea3', 4: '#ff7f00'}
cluster_names = {-1: 'Ruido', 0: 'Cluster 0', 1: 'Cluster 1', 2: 'Cluster 2', 3: 'Cluster 3'}

plt.figure(figsize=(10, 7))
for cluster_id in sorted(set(labels_db)):
    mask_c = labels_db == cluster_id
    alpha = 0.15 if cluster_id == -1 else 0.5
    size = 3 if cluster_id == -1 else 8
    plt.scatter(
        X_2d[mask_c, 0], X_2d[mask_c, 1],
        c=palette.get(cluster_id, 'black'),
        label=cluster_names.get(cluster_id, f'Cluster {cluster_id}'),
        s=size, alpha=alpha
    )

plt.xlabel(f'PC1 ({pca_2d.explained_variance_ratio_[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({pca_2d.explained_variance_ratio_[1]*100:.1f}%)')
plt.title('DBSCAN — Visualización en espacio PCA 2D\n(eps=1.0, min_samples=100)', fontsize=13)
plt.legend(markerscale=3)
plt.tight_layout()
plt.show()

In [33]:
# Comparación DBSCAN vs K-Means
df_pca['KMeans'] = clusters

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# KMeans
for c in sorted(df_pca['KMeans'].unique()):
    mask_c = df_pca['KMeans'] == c
    axes[0].scatter(X_2d[mask_c, 0], X_2d[mask_c, 1], label=f'Cluster {c}', s=5, alpha=0.4)
axes[0].set_title('K-Means (K=3)', fontsize=12)
axes[0].set_xlabel('PC1'); axes[0].set_ylabel('PC2')
axes[0].legend(markerscale=3)

# DBSCAN
for cluster_id in sorted(set(labels_db)):
    mask_c = labels_db == cluster_id
    alpha = 0.1 if cluster_id == -1 else 0.4
    size = 3 if cluster_id == -1 else 6
    lbl = 'Ruido' if cluster_id == -1 else f'Cluster {cluster_id}'
    axes[1].scatter(X_2d[mask_c, 0], X_2d[mask_c, 1], 
                    c=palette.get(cluster_id, 'black'), label=lbl, s=size, alpha=alpha)
axes[1].set_title('DBSCAN (eps=1.0, min_samples=100)', fontsize=12)
axes[1].set_xlabel('PC1'); axes[1].set_ylabel('PC2')
axes[1].legend(markerscale=3)

plt.tight_layout()
plt.show()