<a href="https://colab.research.google.com/github/benmanjackson/CS290/blob/main/Rain_In_AUS_Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

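Naive Bayes version of the Rain in Australia analysis: a Gaussian Naive Bayes classifier is fitted to the daily weather observations in `weatherAUS.csv` and used to predict `RainTomorrow` for a new observation.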

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

In [37]:
# Columns kept from the raw CSV: the predictor columns followed by the
# target label ("RainTomorrow") as the last entry.
columns = [
    # temperature, rain, evaporation and sunshine readings
    "MinTemp",
    "MaxTemp",
    "Rainfall",
    "Evaporation",
    "Sunshine",
    # wind direction / speed
    "WindGustDir",
    "WindGustSpeed",
    "WindDir9am",
    "WindDir3pm",
    "WindSpeed9am",
    "WindSpeed3pm",
    # humidity, pressure and cloud cover at 9am / 3pm
    "Humidity9am",
    "Humidity3pm",
    "Pressure9am",
    "Pressure3pm",
    "Cloud9am",
    "Cloud3pm",
    # temperature at 9am / 3pm
    "Temp9am",
    "Temp3pm",
    # target label
    "RainTomorrow",
]

In [38]:
# Load the dataset from the course repository. `usecols` makes the parser skip
# every column that is not listed in `columns`; the trailing `[columns]` then
# puts the kept columns in the order given by `columns`.
df = pd.read_csv(
    "https://github.com/benmanjackson/CS290/raw/main/weatherAUS.csv",
    usecols=columns,
)[columns]

In [39]:
# Keep only complete rows: any row with at least one missing value is dropped.
df = df.dropna(axis="index", how="any")

In [40]:
# Display the column labels that remain after the selection above.
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainTomorrow'],
      dtype='object')

In [41]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
6049,17.9,35.2,0.0,12.0,12.3,SSW,48.0,ENE,SW,6.0,20.0,20.0,13.0,1006.3,1004.4,2.0,5.0,26.6,33.4,No
6050,18.4,28.9,0.0,14.8,13.0,S,37.0,SSE,SSE,19.0,19.0,30.0,8.0,1012.9,1012.1,1.0,1.0,20.3,27.0,No
6052,19.4,37.6,0.0,10.8,10.6,NNE,46.0,NNE,NNW,30.0,15.0,42.0,22.0,1012.3,1009.2,1.0,6.0,28.7,34.9,No
6053,21.9,38.4,0.0,11.4,12.2,WNW,31.0,WNW,WSW,6.0,6.0,37.0,22.0,1012.7,1009.1,1.0,5.0,29.1,35.6,No
6054,24.2,41.0,0.0,11.2,8.4,WNW,35.0,NW,WNW,17.0,13.0,19.0,15.0,1010.7,1007.4,1.0,6.0,33.6,37.6,No


In [42]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 56420 entries, 6049 to 142302
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MinTemp        56420 non-null  float64
 1   MaxTemp        56420 non-null  float64
 2   Rainfall       56420 non-null  float64
 3   Evaporation    56420 non-null  float64
 4   Sunshine       56420 non-null  float64
 5   WindGustDir    56420 non-null  object 
 6   WindGustSpeed  56420 non-null  float64
 7   WindDir9am     56420 non-null  object 
 8   WindDir3pm     56420 non-null  object 
 9   WindSpeed9am   56420 non-null  float64
 10  WindSpeed3pm   56420 non-null  float64
 11  Humidity9am    56420 non-null  float64
 12  Humidity3pm    56420 non-null  float64
 13  Pressure9am    56420 non-null  float64
 14  Pressure3pm    56420 non-null  float64
 15  Cloud9am       56420 non-null  float64
 16  Cloud3pm       56420 non-null  float64
 17  Temp9am        56420 non-null  float64
 18  Temp3pm

In [43]:
# Split the frame into the target label (y) and the predictor columns (X).
y = df["RainTomorrow"]
X = df.drop(columns=["RainTomorrow"])

In [44]:
def naive_bayes_prediction(X: pd.DataFrame, y: pd.Series, new_instance: dict):
    """Fit a Gaussian Naive Bayes pipeline on (X, y) and classify one instance.

    The pipeline mean-imputes the numeric columns, imputes the wind-direction
    columns with their most frequent value and one-hot encodes them, then
    fits ``GaussianNB``. The model is refitted on every call.

    Args:
        X: Feature frame. Must contain the columns "WindGustDir",
            "WindDir9am" and "WindDir3pm"; every other column is treated
            as numeric.
        y: Class label for each row of ``X``.
        new_instance: Mapping of feature name to value for the observation
            to classify. Features of ``X`` that are absent from the mapping
            are treated as missing and imputed by the pipeline; keys that
            are not columns of ``X`` are dropped.

    Returns:
        The predicted class label for ``new_instance`` (the single element
        of the pipeline's ``predict`` output).
    """
    # Identify categorical and numeric features
    categorical_features = ["WindGustDir", "WindDir9am", "WindDir3pm"]
    numeric_features = [col for col in X.columns if col not in categorical_features]

    # Preprocessing pipeline for numerical and categorical features
    numeric_transformer = SimpleImputer(strategy='mean')
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

    # Column transformer to handle both types of features
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    # Full pipeline with Gaussian Naive Bayes
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                     ('classifier', GaussianNB())])

    # Fit the model on the training data
    model_pipeline.fit(X, y)

    # Convert new_instance dictionary into a one-row DataFrame and align it
    # with the training columns. Missing features are filled with NaN (the
    # reindex default) rather than pd.NA: pd.NA would produce object-dtype
    # columns that the numeric mean imputer cannot convert to float.
    new_instance_df = pd.DataFrame([new_instance]).reindex(columns=X.columns)

    # A categorical feature that was missing is now a float NaN column; cast
    # the categorical columns to object so the most-frequent imputer can
    # write a string label into them.
    new_instance_df = new_instance_df.astype(
        {col: object for col in categorical_features})

    # Predict the most likely class for the new instance
    predicted_class = model_pipeline.predict(new_instance_df)

    return predicted_class[0]

In [45]:
# Example observation to classify; the values were picked arbitrarily.
new_instance = {
    # temperature, rain, evaporation and sunshine readings
    "MinTemp": 12.3,
    "MaxTemp": 22.4,
    "Rainfall": 0.5,
    "Evaporation": 5.2,
    "Sunshine": 8.5,
    # wind direction / speed
    "WindGustDir": "NW",
    "WindGustSpeed": 31,
    "WindDir9am": "NW",
    "WindDir3pm": "W",
    "WindSpeed9am": 13,
    "WindSpeed3pm": 19,
    # humidity, pressure and cloud cover at 9am / 3pm
    "Humidity9am": 75,
    "Humidity3pm": 55,
    "Pressure9am": 1012.3,
    "Pressure3pm": 1008.9,
    "Cloud9am": 3,
    "Cloud3pm": 2,
    # temperature at 9am / 3pm
    "Temp9am": 15.0,
    "Temp3pm": 20.1,
}

In [47]:
# Fit the pipeline on the full feature/label set and classify the example observation.
predicted_class = naive_bayes_prediction(X=X, y=y, new_instance=new_instance)
print(f"Predicted class for the new instance: {predicted_class}")

Predicted class for the new instance: Yes
