In [1]:
# Librerías
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, jaccard_score, log_loss, classification_report
)

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [2]:
# Cargar datos desde el CSV local
df = pd.read_csv("Weather_Data.csv")

# Revisar primeras filas
df.head()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2/1/2008,19.5,22.4,15.6,6.2,0.0,W,41,S,SSW,...,92,84,1017.6,1017.4,8,8,20.7,20.9,Yes,Yes
1,2/2/2008,19.5,25.6,6.0,3.4,2.7,W,41,W,E,...,83,73,1017.9,1016.4,7,7,22.4,24.8,Yes,Yes
2,2/3/2008,21.6,24.5,6.6,2.4,0.1,W,41,ESE,ESE,...,88,86,1016.7,1015.6,7,8,23.5,23.0,Yes,Yes
3,2/4/2008,20.2,22.8,18.8,2.2,0.0,W,41,NNE,E,...,83,90,1014.2,1011.8,8,8,21.4,20.9,Yes,Yes
4,2/5/2008,19.7,25.7,77.4,4.8,0.0,W,41,NNE,W,...,88,74,1008.3,1004.8,8,8,22.5,25.5,Yes,Yes


In [10]:
# =====================================
# Preprocesamiento de datos
# =====================================

# Convertir variables categóricas a dummies
df_proc = pd.get_dummies(data=df, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Reemplazar 'No'/'Yes' por 0/1 de forma explícita en RainToday y RainTomorrow
cols_binarias = ['RainToday', 'RainTomorrow']
for col in cols_binarias:
    df_proc[col] = df_proc[col].map({'No':0, 'Yes':1})

# Eliminar columna Date
df_proc.drop('Date', axis=1, inplace=True)

# Asegurarse que todos los datos sean float
df_proc = df_proc.astype(float)

# Separar variables predictoras y target
X = df_proc.drop(columns='RainTomorrow', axis=1)
y = df_proc['RainTomorrow']

# Comprobar que todo está correcto
print("Dimensiones X:", X.shape)
print("Dimensiones y:", y.shape)
print("Primeras filas de X:")
print(X.head())

Dimensiones X: (3271, 65)
Dimensiones y: (3271,)
Primeras filas de X:
   MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
0     19.5     22.4      15.6          6.2       0.0           41.0   
1     19.5     25.6       6.0          3.4       2.7           41.0   
2     21.6     24.5       6.6          2.4       0.1           41.0   
3     20.2     22.8      18.8          2.2       0.0           41.0   
4     19.7     25.7      77.4          4.8       0.0           41.0   

   WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  WindDir3pm_NNW  \
0          17.0          20.0         92.0         84.0  ...             0.0   
1           9.0          13.0         83.0         73.0  ...             0.0   
2          17.0           2.0         88.0         86.0  ...             0.0   
3          22.0          20.0         83.0         90.0  ...             0.0   
4          11.0           6.0         88.0         74.0  ...             0.0   

   WindDir3pm_NW  Wind

In [4]:
# Revisar columnas y tipo de datos
print(df_proc.dtypes)
print("\nCantidad de filas y columnas:", df_proc.shape)

MinTemp           float64
MaxTemp           float64
Rainfall          float64
Evaporation       float64
Sunshine          float64
                   ...   
WindDir3pm_SSW    float64
WindDir3pm_SW     float64
WindDir3pm_W      float64
WindDir3pm_WNW    float64
WindDir3pm_WSW    float64
Length: 67, dtype: object

Cantidad de filas y columnas: (3271, 67)


In [5]:
# Separar en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [6]:
# Inicializar modelos
knn = KNeighborsClassifier(n_neighbors=4)
tree = DecisionTreeClassifier()
lr = LogisticRegression(solver='liblinear')
svm = SVC(probability=True)

# Entrenar modelos
knn.fit(X_train, y_train)
tree.fit(X_train, y_train)
lr.fit(X_train, y_train)
svm.fit(X_train, y_train)

In [11]:
# Predicciones
pred_knn = knn.predict(X_test)
pred_tree = tree.predict(X_test)
pred_lr = lr.predict(X_test)
pred_svm = svm.predict(X_test)

# Probabilidades (solo para LogLoss)
proba_lr = lr.predict_proba(X_test)
proba_svm = svm.predict_proba(X_test)

In [8]:
# Métricas de evaluación
results = {
    "KNN": {
        "Accuracy": accuracy_score(y_test, pred_knn),
        "Jaccard": jaccard_score(y_test, pred_knn),
        "F1": f1_score(y_test, pred_knn),
        "LogLoss": None
    },
    "Decision Tree": {
        "Accuracy": accuracy_score(y_test, pred_tree),
        "Jaccard": jaccard_score(y_test, pred_tree),
        "F1": f1_score(y_test, pred_tree),
        "LogLoss": None
    },
    "Logistic Regression": {
        "Accuracy": accuracy_score(y_test, pred_lr),
        "Jaccard": jaccard_score(y_test, pred_lr),
        "F1": f1_score(y_test, pred_lr),
        "LogLoss": log_loss(y_test, proba_lr)
    },
    "SVM": {
        "Accuracy": accuracy_score(y_test, pred_svm),
        "Jaccard": jaccard_score(y_test, pred_svm),
        "F1": f1_score(y_test, pred_svm),
        "LogLoss": log_loss(y_test, proba_svm)
    }
}

# Mostrar resultados
metrics_df = pd.DataFrame(results).T
metrics_df


Unnamed: 0,Accuracy,Jaccard,F1,LogLoss
KNN,0.818321,0.425121,0.59661,
Decision Tree,0.763359,0.410646,0.58221,
Logistic Regression,0.838168,0.518182,0.682635,0.357721
SVM,0.719084,0.0,0.0,0.392747


In [12]:
# =====================================
# Reportes de clasificación (sin warnings)
# =====================================
from sklearn.metrics import classification_report

print("\n=== Informes de Clasificación ===")

# KNN
print("\nKNN:\n", classification_report(y_test, pred_knn, zero_division=0))

# Árbol de Decisión
print("\nÁrbol de Decisión:\n", classification_report(y_test, pred_tree, zero_division=0))

# Regresión Logística
print("\nRegresión Logística:\n", classification_report(y_test, pred_lr, zero_division=0))

# SVM
print("\nSVM:\n", classification_report(y_test, pred_svm, zero_division=0))



=== Informes de Clasificación ===

KNN:
               precision    recall  f1-score   support

         0.0       0.82      0.95      0.88       471
         1.0       0.79      0.48      0.60       184

    accuracy                           0.82       655
   macro avg       0.81      0.71      0.74       655
weighted avg       0.81      0.82      0.80       655


Árbol de Decisión:
               precision    recall  f1-score   support

         0.0       0.84      0.83      0.83       471
         1.0       0.58      0.59      0.58       184

    accuracy                           0.76       655
   macro avg       0.71      0.71      0.71       655
weighted avg       0.76      0.76      0.76       655


Regresión Logística:
               precision    recall  f1-score   support

         0.0       0.86      0.92      0.89       471
         1.0       0.76      0.62      0.68       184

    accuracy                           0.84       655
   macro avg       0.81      0.77      0.7