In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
wetdf = pd.read_csv('weatherAUS.csv')

In [3]:
wetdf.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
wetdf.count()

Date             145460
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        142199
RainTomorrow     142193
dtype: int64

In [5]:
wetdf.dropna(subset=['RainToday','RainTomorrow'],inplace = True)

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
train_df,test_df = train_test_split(wetdf,test_size = 0.2, random_state = 42)

In [8]:
train_df,val_df = train_test_split(wetdf,test_size = 0.25, random_state = 42)

In [9]:
print('train_df.shape :', train_df.shape)
print('val_df.shape :', val_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (105590, 23)
val_df.shape : (35197, 23)
test_df.shape : (28158, 23)


In [10]:
input_cols = list(train_df.columns)[1:-1]
target_cols = 'RainTomorrow'
train_input = train_df[input_cols].copy()
train_target = train_df[target_cols].copy()
val_input = val_df[input_cols].copy()
val_target = val_df[target_cols].copy()
test_input = test_df[input_cols].copy()
test_target = test_df[target_cols].copy()

In [11]:
numeric_cols = train_input.select_dtypes(include=np.number).columns.tolist()[:-1]
categorical_cols = train_input.select_dtypes(include='object').columns.tolist()

In [12]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy = 'mean').fit(wetdf[numeric_cols])
train_input[numeric_cols] = imputer.transform(train_input[numeric_cols])
val_input[numeric_cols] = imputer.transform(val_input[numeric_cols])
test_input[numeric_cols] = imputer.transform(test_input[numeric_cols])

In [13]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(wetdf[numeric_cols])
train_input[numeric_cols] = scaler.transform(train_input[numeric_cols])
val_input[numeric_cols] = scaler.transform(val_input[numeric_cols])
test_input[numeric_cols] = scaler.transform(test_input[numeric_cols])

In [14]:
import warnings
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(wetdf[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
train_input[encoded_cols] = encoder.transform(train_input[categorical_cols])
val_input[encoded_cols] = encoder.transform(val_input[categorical_cols])
test_input[encoded_cols] = encoder.transform(test_input[categorical_cols])

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [16]:
for_train = train_input[numeric_cols + encoded_cols]
for_val = val_input[numeric_cols + encoded_cols]
for_test = test_input[numeric_cols + encoded_cols]

In [17]:
model = LogisticRegression(solver = 'liblinear').fit(for_train,train_target)

In [18]:
pred = model.predict(for_train)
pred

array(['No', 'No', 'No', ..., 'No', 'No', 'Yes'], dtype=object)

In [19]:
prob = model.predict_proba(for_train)
prob

array([[0.95805068, 0.04194932],
       [0.84471686, 0.15528314],
       [0.96528823, 0.03471177],
       ...,
       [0.97822177, 0.02177823],
       [0.99241155, 0.00758845],
       [0.42515804, 0.57484196]])

In [20]:
acc = accuracy_score(train_target,pred)
acc

0.851472677336869

In [21]:
# Helper function to predict, compute accuracy
def predict(inputs,target):
    pred = model.predict(inputs)
    acc = accuracy_score(target,pred)
    print(f"Accuracy is {acc*100} %")

In [22]:
val_preds = predict(for_val, val_target)

Accuracy is 84.78279398812398 %


In [23]:
test_preds = predict(for_test,test_target)

Accuracy is 84.71482349598693 %


In [24]:
def random_guess(inputs):
    return np.random.choice(["Yes","No"],len(inputs))

In [25]:
def all_no(inputs):
    return np.full(len(inputs), "No")

In [26]:
def all_yes(inputs):
    return np.full(len(inputs), "Yes")

In [27]:
preds = predict(for_test,random_guess(for_test))

Accuracy is 49.68037502663542 %


In [28]:
preds = predict(for_test,all_no(for_test))

Accuracy is 84.62603878116343 %


In [29]:
preds = predict(for_test,all_yes(for_test))

Accuracy is 15.373961218836566 %


In [30]:
def predict_input(single_input):
    input_df = pd.DataFrame([single_input])
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[encoded_cols] = encoder.transform(input_df[categorical_cols])
    for_input = input_df[numeric_cols+encoded_cols]
    pred = model.predict(for_input)[0]
    prob = model.predict_proba(for_input)[0] #[list(model.classes_).index(pred)]
    return pred, prob

In [31]:
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 20,
             'Rainfall': 20,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 66,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}
predict_input(new_input)

('Yes', array([0.13691803, 0.86308197]))