In [42]:
#Tratamiento de datos:
import numpy as np
import pandas as pd

#Gráficos:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

#Preprocesado y modelado:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, RidgeCV, ElasticNetCV, LassoCV, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Warnings configuration: install a global filter that ignores every warning
# raised after this point.
# NOTE(review): this also hides library deprecation warnings — confirm that
# silencing everything is intended.
import warnings
warnings.filterwarnings('ignore')

## Lectura y separación de DF


In [43]:
# Read the raw CSV into a DataFrame, then print its schema summary
# (column names, non-null counts and dtypes).
df: pd.DataFrame = pd.read_csv(filepath_or_buffer='weatherAUS.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [44]:
# Display the column labels of the loaded frame.
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [45]:
# Separate predictors from the target: X is every column except
# 'RainTomorrow'; y keeps 'RainTomorrow' as a one-column DataFrame.
X: pd.DataFrame = df.drop(columns=['RainTomorrow'])
y: pd.DataFrame = df.loc[:, ['RainTomorrow']]

Separamos los datos en conjuntos de entrenamiento (train) y de prueba (test) siguiendo la convención habitual: 80 % para train y 20 % para test.

In [46]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Visualización de datos y manejo de valores faltantes

In [47]:
# Preview 10 random training rows. The fixed random_state makes the preview
# reproducible, so a fresh "Restart & Run All" shows the same rows.
X_train.sample(10, random_state=42)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
115203,2010-08-07,PearceRAAF,3.5,17.2,0.0,,9.8,S,35.0,SSE,...,19.0,65.0,42.0,1024.4,1022.8,,,11.7,17.1,No
67583,2009-07-19,Melbourne,8.2,15.5,0.0,3.0,9.1,N,63.0,N,...,39.0,68.0,57.0,1016.6,1014.9,3.0,7.0,10.8,14.4,No
1009,2011-10-06,Albury,10.2,16.0,0.0,,,SW,13.0,S,...,4.0,87.0,80.0,1014.8,1012.6,,8.0,13.0,15.0,No
85425,2012-06-18,Brisbane,9.2,21.2,0.0,3.2,9.9,WSW,20.0,WSW,...,2.0,56.0,41.0,1022.6,1018.9,0.0,0.0,15.1,20.9,No
58829,2010-10-19,Bendigo,2.1,18.3,0.0,,,W,26.0,S,...,9.0,57.0,39.0,1027.9,1026.2,1.0,,10.3,17.0,No
95317,2014-09-27,Townsville,19.8,29.6,0.0,5.4,10.1,NE,33.0,E,...,20.0,70.0,56.0,1017.2,1013.5,4.0,1.0,25.7,28.0,No
106984,2013-01-23,Woomera,18.9,36.2,0.0,15.0,13.3,SSE,52.0,SSE,...,17.0,45.0,10.0,1014.1,1010.9,0.0,0.0,23.2,33.5,No
13236,2012-04-14,Moree,12.2,26.3,0.0,12.6,,ENE,30.0,E,...,13.0,51.0,42.0,1027.2,1022.3,7.0,6.0,21.1,25.6,No
61495,2009-11-10,Sale,10.7,32.0,0.0,8.0,12.0,E,39.0,SW,...,30.0,51.0,22.0,1020.1,1018.3,0.0,1.0,21.8,31.1,No
133765,2010-12-05,Launceston,13.7,20.3,3.2,,,SSE,35.0,SSE,...,7.0,73.0,57.0,1017.9,1017.0,,,14.8,19.4,Yes


Vemos que la variable RainToday toma los valores "Yes"/"No". La convertimos a una variable dummy (1 para "Yes" y 0 para "No").

In [48]:
def getting_dummy(string_: str) -> int:
    """Encode a 'Yes'/'No' flag as a 0/1 dummy value.

    The function is idempotent: a value that is already encoded as 1 stays 1,
    so re-running the cell that applies it to a column does not reset every
    row to 0.

    Parameters
    ----------
    string_ : str
        Raw flag value, normally 'Yes' or 'No'. An already-encoded 1/0 is
        also accepted.

    Returns
    -------
    int
        1 if ``string_`` is 'Yes' (or already 1), otherwise 0.

    Notes
    -----
    Any other value, including a missing value (NaN), is encoded as 0.
    NOTE(review): confirm that treating missing values as 'No' is intended.
    """
    # Comparing against 1 as well keeps the encoding stable when the function
    # is applied to a column that has already been converted.
    return 1 if (string_ == 'Yes' or string_ == 1) else 0

In [49]:
# Replace the 'Yes'/'No' RainToday column of the training split with its
# 0/1 encoding, element by element.
X_train['RainToday'] = X_train['RainToday'].map(getting_dummy)