# Heuristic Models For Seattle Weather

In [1]:
# import packages
import pandas as pd
import numpy as np
%matplotlib inline
import time
import random
from random import randint
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# Load Data
# Reads the Seattle weather CSV straight from GitHub (requires network access).
# Columns, as shown by df.head() below: DATE, PRCP, TMAX, TMIN, RAIN.
df = pd.read_csv('https://raw.githubusercontent.com/Yasmeenmad/data_science_bootcamp/main/Week10/Intro_Machine_Learning1/seattle_weather_1948-2017.csv')

In [3]:
df.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True


In [4]:
# Drop rows containing NaN values and cast the 'RAIN' column to bool in one
# chained step. assign() builds a new frame instead of writing a column into
# the dropna() result, which can trigger pandas' SettingWithCopyWarning.
df = df.dropna().assign(RAIN=lambda frame: frame["RAIN"].astype(bool))

In [5]:
# Separate the data set into training and testing sets (80/20 split).
# NOTE(review): train_test_split shuffles rows by default, so even after the
# later sort_index() the "previous row" used by the heuristics is not always
# the previous calendar day. Consider shuffle=False for a chronological
# split -- TODO confirm intent.
training, testing = train_test_split(
    df,
    train_size=0.8,   # 80% of data to train
    test_size=0.2,    # 20% of data to test
    random_state=42,  # fixed seed so the split (and the metrics below) are reproducible
)

In [6]:
# Baseline: class balance of RAIN (always predicting the majority class is the score to beat)
df["RAIN"].value_counts(normalize=True)

False    0.573352
True     0.426648
Name: RAIN, dtype: float64

In [7]:
# Make sure we see the same class distribution in our test data labels
# Check class labels distribution in training
training["RAIN"].value_counts(normalize=True)

False    0.574127
True     0.425873
Name: RAIN, dtype: float64

In [8]:
# Check class labels distribution in testing
testing["RAIN"].value_counts(normalize=True)

False    0.570254
True     0.429746
Name: RAIN, dtype: float64

### Classification Heuristic Model

In [9]:
# Create function to perform our heuristic
def heuristic_rain(df, tmax_range=(30, 70), tmin_range=(25, 60), prcp_range=(0.01, 5.02)):
    """
    Predict rain for every row from the two rows that precede it.

    A row is predicted as rainy only when BOTH preceding rows (positions
    x-1 and x-2) satisfy all of the following:

    * ``RAIN`` is True
    * ``TMAX`` lies within ``tmax_range`` (bounds inclusive)
    * ``TMIN`` lies within ``tmin_range`` (bounds inclusive)
    * ``PRCP`` lies within ``prcp_range`` (bounds inclusive)

    Otherwise the prediction is False. The first two rows have no two-row
    history, so they are always predicted False.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns ``RAIN``, ``TMAX``, ``TMIN`` and ``PRCP``.
        Rows are consumed in positional order, so sort the frame first.
    tmax_range, tmin_range, prcp_range : tuple of (low, high)
        Inclusive bounds for each column; the defaults are the thresholds the
        heuristic was originally written with.

    Returns
    -------
    list of bool
        One prediction per row of ``df``.
    """
    tmax_lo, tmax_hi = tmax_range
    tmin_lo, tmin_hi = tmin_range
    prcp_lo, prcp_hi = prcp_range

    # Evaluate the per-day conditions once for the whole frame instead of
    # re-materialising rows with df.iloc[...] many times per iteration.
    qualifies = (
        (df["RAIN"] == True)
        & df["TMAX"].between(tmax_lo, tmax_hi)
        & df["TMIN"].between(tmin_lo, tmin_hi)
        & df["PRCP"].between(prcp_lo, prcp_hi)
    ).tolist()

    # Row x is rainy when both of its predecessors qualify; rows 0 and 1
    # short-circuit to False because they lack two predecessors.
    return [
        x >= 2 and qualifies[x - 1] and qualifies[x - 2]
        for x in range(len(qualifies))
    ]

In [10]:
# sort index
training = training.sort_index()

In [11]:
# add the rain prediction column to the dataframe
training['preds_rain'] = heuristic_rain(training)

In [12]:
# show the dataframe
training

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds_rain
0,1948-01-01,0.47,51,42,True,False
1,1948-01-02,0.59,45,36,True,False
2,1948-01-03,0.42,45,35,True,True
3,1948-01-04,0.31,45,34,True,True
7,1948-01-08,0.04,48,35,True,True
...,...,...,...,...,...,...
25545,2017-12-09,0.00,44,29,False,False
25547,2017-12-11,0.00,49,29,False,False
25548,2017-12-12,0.00,46,32,False,False
25549,2017-12-13,0.00,48,34,False,False


### Regression Heuristic Model

In [14]:
# Create function to perform our heuristic
def heuristic_prcp(df, seed=None):
    """
    Predict a precipitation amount for every row from recent ``RAIN`` values.

    The heuristic reads the ``RAIN`` value of the row itself and of the two
    rows before it, then draws a random amount from a tier:

    * rain today, no rain on the previous row          -> 0.01 to 0.02
    * rain today and on both of the two previous rows  -> 0.10 to 0.15
    * rain today and on the previous row only          -> 0.02 to 0.08
    * anything else (including the first two rows)     -> 0

    Parameters
    ----------
    df : DataFrame
        Must contain a ``RAIN`` column. Rows are consumed in positional order.
    seed : int or None, optional
        When given, a private ``random.Random(seed)`` generator is used so the
        predictions are reproducible. When None (default) the module-level
        ``random`` generator is used, exactly as before.

    Returns
    -------
    list
        One prediction per row: ``0`` or a float rounded to two decimals.
    """
    rng = random if seed is None else random.Random(seed)

    # Pull the column out once; per-element .iloc lookups inside the loop are slow.
    rained = (df["RAIN"] == True).tolist()
    dry = (df["RAIN"] == False).tolist()

    def _draw(low, high):
        # Bounds are expressed in 1/10000ths and the result is rounded to 2 decimals.
        return round(float(rng.randint(low, high)) / 10000, 2)

    preds = []
    for x in range(len(rained)):
        if x < 2 or not rained[x]:
            # No two-row history, or no rain today: predict 0.
            preds.append(0)
        elif dry[x - 1]:
            preds.append(_draw(100, 200))
        elif rained[x - 1] and rained[x - 2]:
            preds.append(_draw(1000, 1500))
        elif rained[x - 1]:
            preds.append(_draw(200, 800))
        else:
            # Previous value is neither True nor False (e.g. missing): predict 0.
            preds.append(0)
    return preds

In [15]:
# add the prcp prediction column to the dataframe
training['preds_prcp'] = heuristic_prcp(training)

In [16]:
# show the dataframe
training

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds_rain,preds_prcp
0,1948-01-01,0.47,51,42,True,False,0.00
1,1948-01-02,0.59,45,36,True,False,0.00
2,1948-01-03,0.42,45,35,True,True,0.11
3,1948-01-04,0.31,45,34,True,True,0.14
7,1948-01-08,0.04,48,35,True,True,0.11
...,...,...,...,...,...,...,...
25545,2017-12-09,0.00,44,29,False,False,0.00
25547,2017-12-11,0.00,49,29,False,False,0.00
25548,2017-12-12,0.00,46,32,False,False,0.00
25549,2017-12-13,0.00,48,34,False,False,0.00


### Model Evaluation Functions

In [13]:
# Precision and recall for the classification heuristic model
# (the 'RAIN' column).
def classification_model_evaluation(true_val, pred_val):
    """
    Report precision and recall of binary predictions against the true labels.

    Parameters
    ----------
    true_val : array-like of bool
        Actual labels (True is the positive class).
    pred_val : array-like of bool
        Predicted labels, same length as ``true_val``.

    Returns
    -------
    str
        ``"Precision = <p> , Recall = <r> "``. A score is ``nan`` when its
        denominator is zero (no positive predictions / no positive labels).

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Use the arguments that were passed in, not a global DataFrame.
    actual = np.asarray(true_val, dtype=bool)
    predicted = np.asarray(pred_val, dtype=bool)
    if actual.shape != predicted.shape:
        raise ValueError("true_val and pred_val must have the same shape")

    # Count each confusion-matrix cell once, directly from the boolean arrays.
    TP = int(np.sum(actual & predicted))
    FP = int(np.sum(~actual & predicted))
    FN = int(np.sum(actual & ~predicted))

    # Guard the divisions so an empty denominator yields nan instead of an error.
    precision = TP / (TP + FP) if (TP + FP) else float("nan")
    recall = TP / (TP + FN) if (TP + FN) else float("nan")
    return "Precision = {val1} , Recall = {val2} ".format(val1 = precision, val2 = recall)


# create a function that calculate the MSE, MAE and SSE for the regression heuristic model
# for the 'PRCP' column    
def regression_model_evaluation(true_val, pred_val):
    """
    Report SSE, MSE and MAE of the predictions against the true values.

    Both arguments must support element-wise arithmetic (e.g. pandas Series);
    the result is returned as a single formatted string.
    """
    # Residuals are computed once and reused for both error totals.
    errors = pred_val - true_val
    inverse_count = 1/len(true_val)

    SSE = sum(errors**2)
    MSE = inverse_count * SSE
    MAE = inverse_count * sum(abs(errors))
    return f"Sum Of Squared Error = {SSE} , Mean Squared Error = {MSE}, Mean Absolute Error = {MAE} "

In [17]:
# passing the actual values 'RAIN' column and the predicted values 'preds_rain' column
# in classification_model_evaluation to show the Precision and Recall
classification_model_evaluation(training['RAIN'],training['preds_rain'])

'Precision = 0.6675471698113208 , Recall = 0.4064797794117647 '

In [18]:
# passing the actual values 'PRCP' column and the predicted values 'preds_prcp' column
# in regression_model_evaluation to show the SSE, MSE and MAE
regression_model_evaluation(training['PRCP'],training['preds_prcp'])

'Sum Of Squared Error = 1118.8152000000161 , Mean Squared Error = 0.05474191212447481, Mean Absolute Error = 0.08843624620803993 '

In [22]:
# Metrics recorded by hand from 5 separate runs of the cells above
# (one entry per run, in run order).

# Precision of the classification heuristic
Precision = [
    0.6659719590754074,
    0.6661648475724501,
    0.6701590271281571,
    0.6664799253034547,
    0.6675471698113208,
]

# Recall of the classification heuristic
Recall = [
    0.4048140043763676,
    0.405730659025788,
    0.40960548885077186,
    0.40681636840305485,
    0.4064797794117647,
]

# Sum of squared errors of the regression heuristic
SSE = [
    1143.175100000017,
    1133.8321000000171,
    1075.4262000000153,
    1148.6816000000147,
    1118.8152000000161,
]

# Mean squared error of the regression heuristic
MSE = [
    0.05593380467756224,
    0.055476666014287954,
    0.05261895488795457,
    0.056203229278795126,
    0.05474191212447481,
]

# Mean absolute error of the regression heuristic
MAE = [
    0.08890155592523351,
    0.0885649280751503,
    0.08718563460220793,
    0.08988648595752653,
    0.08843624620803993,
]

In [23]:
# create a function to calculate the average of a list
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

In [33]:
# Print the average of each metric across the recorded runs.
# (The stray trailing commas after each print() call were removed: they turned
# every statement into a throw-away one-element tuple.)
print("The Precision average score is: ",Average(Precision))
print("The Recall average score is: ",Average(Recall))
print("The SSE average score is: ",Average(SSE))
print("The MSE average score is: ",Average(MSE))
print("The MAE average score is: ",Average(MAE))

The Precision average score is:  0.6672645857781581
The Recall average score is:  0.40668926001354944
The SSE average score is:  1123.9860400000161
The MSE average score is:  0.05499491339661494
The MAE average score is:  0.08859497015363163
