## Descripción breve
El siguiente código permite comparar el rendimiento medio de los modelos empleados utilizando la muestra en su totalidad.
Proporciona el rendimiento de cada modelo al usar validación cruzada (cross-validation) con 20 divisiones, así como las curvas ROC y la matriz de confusión de cada modelo sobre una división de la muestra en 80% para entrenamiento y 20% para testeo.



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  mean_squared_error, RocCurveDisplay
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier



# Column names handed to read_csv: an ID, the diagnosis label and ten
# feature names repeated with the suffixes 1, 2 and 3.
nombres = [
    "ID", "Diagnosis",
    "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1",
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1",
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2",
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2",
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]
df = pd.read_csv('wdbc.data', names=nombres)

# Encode the label as integers (M -> 1, B -> 0) and discard the identifier.
data = df.copy()
data["Diagnosis"] = data["Diagnosis"].map({'M': 1, 'B': 0})
data.drop(columns=["ID"], inplace=True)

# Slice instead of aliasing + del, so `nombres` itself keeps its "ID" entry.
tags = nombres[1:]
data1 = data.copy()

# Hold out 20% of the samples for the ROC curves and confusion matrices.
# Stratifying on y keeps the class proportions equal in both parts.
X = data1.drop(columns=["Diagnosis"]).to_numpy()
y = data1["Diagnosis"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    SVC(kernel='linear', C=1.0, probability=True),
    SVC(kernel='sigmoid', probability=True),
    SVC(kernel='rbf', probability=True),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=100, random_state=42),
]

# Labels are index-aligned with `models`.
model_name = [
    'LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'QuadraticDiscriminantAnalysis',
    'LinearDiscriminantAnalysis', 'SVC_linear', 'SVC_sigmoid', 'SVC_rbf', ' GradientBoostingClassifier'
]

# One colour per model (nine entries, matching `models`).
colors = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # violet
    "#8c564b",  # brown
    "#e377c2",  # pink
    "b",
    "#17becf",  # cyan
]

# Scaled copies of the split, used for the final fit/predict of each model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model_trained = {}  # label -> [test predictions, fitted estimator]
model_score = {}    # label -> [pipeline, per-fold CV scores]
for name, estimator in zip(model_name, models):
    # Scaling lives inside the pipeline so each CV fold fits its own scaler.
    model = make_pipeline(StandardScaler(), estimator)
    score = cross_val_score(model, X_train, y_train, cv=20)
    model_score[name] = [model, score]
    resulting = estimator.fit(X_train_scaled, y_train)
    result = resulting.predict(X_test_scaled)
    model_trained[name] = [result, resulting]
    print(f"{name}: {score.mean()}")

# Long-format table: one row per (model, fold).
scores_list = [
    {"Modelo": name, "Fold": fold, "Score": score}
    for name in model_name
    for fold, score in enumerate(model_score[name][1], start=1)
]
scores_df = pd.DataFrame(scores_list)

# Strip plot of the per-fold scores; jitter spreads overlapping points.
plt.figure(figsize=(12, 7))
sns.stripplot(x="Modelo", y="Score", data=scores_df,
              jitter=True,
              alpha=0.7,
              palette=colors)
plt.title("Comparación del Rendimiento (Strip Plot con Jitter)")
plt.ylabel("Accuracy (Rendimiento)")
plt.xticks(rotation=45, ha="right")
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# ROC curve of every fitted model on the held-out part, one subplot each.
fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(20, 20))
for axis, name in zip(ax.flat, model_name):
    model = model_trained[name][1]
    Aplicar = model.predict_proba(X_test_scaled)[:, 1]
    RocCurveDisplay.from_predictions(y_test, Aplicar, ax=axis)
    axis.set_title(f"{name}")
plt.savefig("RocCurves.png")
plt.show()

# Confusion matrix and classification report of every model on the held-out part.
fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(20, 20))
for axis, name in zip(ax.flat, model_name):
    predictions = model_trained[name][0]
    # labels=[1, 0] puts the malignant class in the first row/column.
    cm = confusion_matrix(y_test, predictions, labels=[1, 0])
    sns.heatmap(cm, annot=True, ax=axis, fmt="d", cmap="Blues",
                xticklabels=["Maligno", "Benigno"], yticklabels=["Maligno", "Benigno"])
    axis.set_xlabel("Predicho")
    axis.set_ylabel("Real")
    axis.set_title(f"{name}")
    print(name)
    print(classification_report(y_test, predictions))
plt.savefig("ConfusionMatrix.png")
plt.show()

ModuleNotFoundError: No module named 'pandas'

## Descripción del bloque
El siguiente bloque de código compara el rendimiento por modelo en cada muestra.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  mean_squared_error, RocCurveDisplay
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
def entrenar_crossvalidation(models, model_name, colors, X_train, y_train, ax, namep):
    """Cross-validate each model and draw its per-fold scores as a strip plot.

    Every estimator in ``models`` is placed behind a ``StandardScaler`` in a
    pipeline and scored with ``cross_val_score`` using ``cv=10``.

    Args:
        models: Estimators to evaluate.
        model_name: Labels for the estimators, index-aligned with ``models``.
        colors: Palette handed to ``sns.stripplot``.
        X_train: Feature matrix passed to ``cross_val_score``.
        y_train: Target values passed to ``cross_val_score``.
        ax: Matplotlib axes that receives the plot.
        namep: Title set on ``ax``.

    Returns:
        Dict mapping each label to ``[pipeline, fold_scores]``.
    """
    model_score = {}
    for idx, estimator in enumerate(models):
        pipeline = make_pipeline(StandardScaler(), estimator)
        fold_scores = cross_val_score(pipeline, X_train, y_train, cv=10)
        model_score[model_name[idx]] = [pipeline, fold_scores]

    # Long-format rows: one per (model, fold), folds numbered from 1.
    rows = [
        {"Modelo": label, "Fold": fold_no, "Score": value}
        for label in model_name
        for fold_no, value in enumerate(model_score[label][1], start=1)
    ]
    scores_df = pd.DataFrame(rows)

    sns.stripplot(
        x="Modelo",
        y="Score",
        data=scores_df,
        jitter=True,
        alpha=0.7,
        palette=colors,
        ax=ax,
    )

    ax.set_title(namep)
    ax.set_ylabel("Accuracy por muestra")
    ax.set_xticklabels(model_name, rotation=45, ha="right")
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    return model_score
# Column names handed to read_csv: an ID, the diagnosis label and ten
# feature names repeated with the suffixes 1, 2 and 3.
nombres = [
    "ID", "Diagnosis",
    "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1",
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1",
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2",
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2",
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]
df = pd.read_csv('wdbc.data', names=nombres)
# Encode the label as integers (M -> 1, B -> 0) and discard the identifier.
df["Diagnosis"] = df["Diagnosis"].map({'M': 1, 'B': 0})
df.drop(columns=["ID"], inplace=True)

# Feature families removed in each of the four reduced datasets. Every family
# is dropped for all three suffixes, and "radius3" is dropped in every case.
removed_families = [
    ["perimeter", "area"],
    ["perimeter", "area", "compactness", "concavity", "fractal_dimension"],
    ["perimeter", "area", "compactness", "concavity", "fractal_dimension", "symmetry"],
    ["perimeter", "area", "compactness", "concavity", "texture", "smoothness", "fractal_dimension", "symmetry"],
]

data = {}  # dataset index -> [X, Y]
for j, tag in enumerate(removed_families):
    dropped = [c + str(k) for k in (1, 2, 3) for c in tag] + ["radius3"]
    data1 = df.drop(columns=dropped)
    X = data1.drop(columns=["Diagnosis"]).to_numpy()
    Y = data1["Diagnosis"].to_numpy()
    data[j] = [X, Y]

models = [
    LogisticRegression(max_iter=1000),
    DecisionTreeClassifier(),
    RandomForestClassifier(n_estimators=200),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    SVC(kernel='linear', C=1.0, probability=True),
    SVC(kernel='sigmoid', probability=True),
    SVC(kernel='rbf', probability=True),
    GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=100, random_state=42),
]

# Labels are index-aligned with `models`.
model_name = [
    'LogisticRegression', 'DecisionTreeClassifier', 'RandomForestClassifier', 'QuadraticDiscriminantAnalysis',
    'LinearDiscriminantAnalysis', 'SVC_linear', 'SVC_sigmoid', 'SVC_rbf', ' GradientBoostingClassifier'
]

# One colour per model (nine entries, matching `models`).
colors = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # violet
    "#8c564b",  # brown
    "#e377c2",  # pink
    "b",
    "#17becf",  # cyan
]
names = ["data_1", "data_2", "data_3", "data_4"]

# One strip plot per reduced dataset on a 2x2 grid.
fig, ax = plt.subplots(ncols=2, nrows=2, figsize=(20, 20))
model_score = {}
for i, axis in enumerate(ax.flat):
    X = data[i][0]
    y = data[i][1]
    model_score[i] = entrenar_crossvalidation(
        models=models, model_name=model_name, colors=colors,
        X_train=X, y_train=y, ax=axis, namep=f"data{i}",
    )

plt.tight_layout()
plt.savefig("Comparacion.png")
plt.show()

# Mean cross-validation score: one row per dataset, one column per model.
vector = [
    [model_score[i][name][1].mean() for name in model_name]
    for i in range(4)
]
dataframe = pd.DataFrame(vector, columns=model_name)
dataframe.head()

## Descripción
El siguiente código muestra los histogramas por covariable

In [None]:
# Uses the `df` loaded by an earlier cell.
data = df.copy()
data.replace({"Diagnosis": {'M': 1, 'B': 0}}, inplace=True)
# Remove the identifier if the cell that loaded `df` left it in place, so
# only the covariates are plotted.
data = data.drop(columns=["ID"], errors="ignore")

# Split by class (1 = label 'M', 0 = label 'B') and drop the label column.
dataA = data[data['Diagnosis'] == 1].drop(columns=["Diagnosis"])
dataB = data[data['Diagnosis'] == 0].drop(columns=["Diagnosis"])
tagsA = dataA.keys()
tagsB = dataB.keys()
print(len(tagsA))
print(len(tagsB))

# One subplot per covariate with both class histograms overlaid.
fig, ax = plt.subplots(ncols=5, nrows=6, figsize=(30, 30))
for axis, tag in zip(ax.flat, tagsA):
    axis.set_title(tag)
    axis.hist(dataA[tag], color='r', alpha=0.5, bins=30, label=f"Diagnosis:{1}")
    axis.hist(dataB[tag], color='b', alpha=0.5, bins=30, label=f"Diagnosis:{0}")
    axis.legend()
# Apply the layout before saving so the PNG matches what is shown.
plt.tight_layout()
plt.savefig("Histogramas.png")
plt.show()



## Descripción
Códigos que permiten obtener los datos de PCA

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  mean_squared_error, RocCurveDisplay
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
# Column names handed to read_csv: an ID, the diagnosis label and ten
# feature names repeated with the suffixes 1, 2 and 3.
nombres = [
    "ID", "Diagnosis",
    "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1",
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1",
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2",
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2",
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]
df = pd.read_csv('wdbc.data', names=nombres)
# Encode the label as integers (M -> 1, B -> 0) and discard the identifier.
data = df.copy()
data["Diagnosis"] = data["Diagnosis"].map({'M': 1, 'B': 0})
data.drop(columns=["ID"], inplace=True)

n = 30  # number of principal components to keep
data1 = data.copy()
X = data1.drop(columns=["Diagnosis"]).to_numpy()
Y = data1["Diagnosis"].to_numpy()

# Standardise the features, then project onto n principal components.
X_scaled = StandardScaler().fit_transform(X)
pca = PCA(n_components=n)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component ' + str(i) for i in range(1, n + 1)],
)

# First two components, coloured by diagnosis.
plt.figure(figsize=(10, 6))
plt.title("PCA")
plt.xlabel("principal component 1")
plt.ylabel("principal component 2")
plt.scatter(principalDf["principal component 1"], principalDf["principal component 2"], c=Y, cmap="viridis")
plt.show()
print(pca.explained_variance_ratio_)
print(pca.explained_variance_ratio_[:10].sum())

# Component 1 against components 2..10, one subplot each.
fig, ax = plt.subplots(ncols=3, nrows=3, figsize=(20, 20))
for other, axis in enumerate(ax.flat, start=2):
    axis.scatter(principalDf["principal component " + str(1)],
                 principalDf["principal component " + str(other)],
                 c=Y, cmap="viridis")
    axis.set_xlabel(f"principal component {1}")
    axis.set_ylabel(f"principal component {other}")
# Save once, after the layout is final.
plt.tight_layout()
plt.savefig("PCA.png")
plt.show()

# Relative absolute loading of every feature in the first m components.
tags = data1.drop(columns=["Diagnosis"]).keys()
m = 9
fig, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 20))
for j in range(m):
    axis = ax[j // 3, j % 3]
    loadings = np.abs(pca.components_[j])
    axis.set_title(f"PCA component {j+1}")
    axis.bar(tags, loadings / loadings.sum())
    # Rotate the existing tick labels; the bar call already labels them with `tags`.
    axis.tick_params(axis="x", labelrotation=90)
plt.tight_layout()
plt.savefig("PCAHistograma.png")
plt.show()




## Descripción
El siguiente código genera la matriz de correlación; hay múltiples porque en principio consideramos incluir solo secciones, dada la enorme cantidad de covariables.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  mean_squared_error
# Column names handed to read_csv: an ID, the diagnosis label and ten
# feature names repeated with the suffixes 1, 2 and 3.
nombres = [
    "ID", "Diagnosis",
    "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1",
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1",
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2",
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2",
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]
df = pd.read_csv('wdbc.data', names=nombres)
# Encode the label as integers (M -> 1, B -> 0) and discard the identifier.
data = df.copy()
data["Diagnosis"] = data["Diagnosis"].map({'M': 1, 'B': 0})
data.drop(columns=["ID"], inplace=True)

# Feature base names; appending 1, 2 or 3 gives the columns of one group.
base_features = [
    "radius", "texture", "perimeter", "area", "smoothness", "compactness",
    "concavity", "concave_points", "symmetry", "fractal_dimension",
]

# Correlation of the label with each single group of features.
for i in range(3):
    plt.figure(figsize=(10, 10))
    tags = ["Diagnosis"] + [c + str(i + 1) for c in base_features]
    data1 = data[tags]
    Corr_matrix1 = data1.corr(numeric_only=True)
    sns.heatmap(Corr_matrix1, annot=True, cmap='coolwarm')
    #  plt.savefig("correlation_Matrix"+str(i+1)+".png")
    plt.show()

# Correlation of the label with every pair of distinct groups (j < i).
for j in range(3):
    for i in range(j + 1, 3):
        plt.figure(figsize=(20, 20))
        tags = [c + str(i + 1) for c in base_features]
        tagsx = [c + str(j + 1) for c in base_features]
        # "Diagnosis" is listed once so it does not appear twice in the matrix.
        tagsy = ["Diagnosis"] + tags + tagsx
        data1 = data[tagsy]
        Corr_matrix1 = data1.corr(numeric_only=True)
        sns.heatmap(Corr_matrix1, annot=True, cmap='coolwarm')
        # plt.savefig("correlation_Matrix"+str(i+1)+str(j+1)+".png")
        plt.show()

# Full correlation matrix over the label and all features.
Corr = data.corr(numeric_only=True)
plt.figure(figsize=(20, 20))
sns.heatmap(Corr, annot=True, cmap='coolwarm')
plt.savefig("Correlation_Matrix.png")
plt.show()


## Descripción
Esto no se incluyó en el informe, pero refuerza parte de su análisis: muestra la relevancia de las covariables en random forest sobre toda la muestra (sin limpiar ningún dato).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import export_text
from collections import Counter
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score,  mean_squared_error
# Column names handed to read_csv: an ID, the diagnosis label and ten
# feature names repeated with the suffixes 1, 2 and 3.
nombres = [
    "ID", "Diagnosis",
    "radius1", "texture1", "perimeter1", "area1", "smoothness1", "compactness1",
    "concavity1", "concave_points1", "symmetry1", "fractal_dimension1",
    "radius2", "texture2", "perimeter2", "area2", "smoothness2", "compactness2",
    "concavity2", "concave_points2", "symmetry2", "fractal_dimension2",
    "radius3", "texture3", "perimeter3", "area3", "smoothness3", "compactness3",
    "concavity3", "concave_points3", "symmetry3", "fractal_dimension3"
]
data = pd.read_csv('wdbc.data', names=nombres)
data.drop(columns=["ID"], inplace=True)
# Encode the label as integers (M -> 1, B -> 0).
data["Diagnosis"] = data["Diagnosis"].map({'M': 1, 'B': 0})
y = data["Diagnosis"].to_numpy()
X = data.drop(columns=["Diagnosis"]).to_numpy()

# 80/20 split; stratifying on y keeps the class proportions in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
Tree = RandomForestClassifier(n_estimators=40, oob_score=True, random_state=42)
Tree.fit(X_train, y_train)

# Predict once and reuse the result for the report and the confusion matrix.
y_pred = Tree.predict(X_test)
print("Accuracy test:", Tree.score(X_test, y_test))
print(classification_report(y_test, y_pred))

# Rank the features by the importance reported by the fitted forest.
importances = Tree.feature_importances_
feat_names = data.drop(columns=["Diagnosis"]).columns

importancia = pd.DataFrame({
    "Feature": feat_names,
    "Importance": importances
}).sort_values("Importance", ascending=False)

print(importancia.head(10))

top10 = importancia.head(10)
plt.figure(figsize=(10, 6))
plt.barh(top10["Feature"], top10["Importance"])
plt.title("Top 10 variables más importantes en Random Forest")
# Invert the axis so the most important feature is drawn at the top.
plt.gca().invert_yaxis()
plt.savefig("RandomForestTop.png")
plt.show()

RocCurveDisplay.from_predictions(y_test, Tree.predict_proba(X_test)[:, 1])
plt.title("Curva ROC - Random Forest")
plt.savefig("RandomForestROC.png")
plt.show()

# labels=[1, 0] puts the malignant class in the first row/column.
cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Maligno", "Benigno"], yticklabels=["Maligno", "Benigno"])
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.title("Matriz de confusión - Random Forest")
plt.savefig("RandomForestCM.png")
plt.show()