# Heuristic Models For Seattle Weather

In [1]:
# import packges 
import pandas as pd
import numpy as np
%matplotlib inline
import time
import random
from random import randint
import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
# Load Data
df = pd.read_csv('https://raw.githubusercontent.com/Yasmeenmad/data_science_bootcamp/main/Week10/Intro_Machine_Learning1/seattle_weather_1948-2017.csv')

In [3]:
df.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True


In [4]:
# droop the NaN values
df = df.dropna()
# set the type of 'RAIN' column to bool 
df["RAIN"] = df["RAIN"].astype(bool)

In [5]:
# Separate your data set into training and testing. (80/20 split)
# Split into training and test sets

training, testing = train_test_split(
    df, 
    train_size=0.8, # 80% of data to train
    test_size=0.2, # 20% of data to test
)

In [6]:
# our baseline model
df["RAIN"].value_counts(normalize=True)

False    0.573352
True     0.426648
Name: RAIN, dtype: float64

In [7]:
# Make sure we see the same class distribution in our test data labels
# Check class labels distribution in training
training["RAIN"].value_counts(normalize=True)

False    0.577258
True     0.422742
Name: RAIN, dtype: float64

In [8]:
# Check class labels distribution in testing
testing["RAIN"].value_counts(normalize=True)

False    0.55773
True     0.44227
Name: RAIN, dtype: float64

### Classification Heuristic Model

In [9]:
# Create function to perform our heuristic
def heuristic_rain(df):
    
    """
    Simple heuristic:
    
    If it rained yesterday or the day before, if max temperature from 30 to 70 for yesterday or the day before, if min 
    temperature from 25 to 60 for yesterday or the day before and if precipitation of rain range from 0.01 to 5.02 
    for yesterday or the day before then it will rain else predict no rain
    
    Frist two rows are predicted false be default
    """
    
    preds = []
    for x in range(len(df)):
        # If first two rows then predict false
        if x <2:
            preds.append(False)
        else:

            if ((df.iloc[x-1]["RAIN"] == True) & (df.iloc[x-2]["RAIN"] == True) &
                ((df.iloc[x-1]["TMAX"] <= 70) & (df.iloc[x-1]["TMAX"] >= 30)) &
                ((df.iloc[x-2]["TMAX"] <= 70) & (df.iloc[x-2]["TMAX"] >= 30)) &
                ((df.iloc[x-1]["TMIN"] <= 60) & (df.iloc[x-1]["TMIN"] >= 25)) &
                ((df.iloc[x-2]["TMIN"] <= 60) & (df.iloc[x-2]["TMIN"] >= 25)) &
                ((df.iloc[x-1]["PRCP"] <= 5.02) & (df.iloc[x-1]["PRCP"] >= 0.01)) &
                ((df.iloc[x-2]["PRCP"] <= 5.02) & (df.iloc[x-2]["PRCP"] >= 0.01))):
                preds.append(True)

                
            else:
                # Predict false if the above is not true
                preds.append(False)
    return preds

In [10]:
# sort index
testing = testing.sort_index()

In [11]:
# add the rain prediction column to the dataframe
testing['preds_rain'] = heuristic_rain(testing)

In [12]:
# show the dataframe
testing

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds_rain
5,1948-01-06,0.44,48,39,True,False
27,1948-01-28,0.00,53,25,False,False
30,1948-01-31,0.21,35,27,True,False
33,1948-02-03,0.20,38,26,True,False
36,1948-02-06,0.00,43,20,False,True
...,...,...,...,...,...,...
25519,2017-11-13,0.81,55,45,True,True
25528,2017-11-22,0.52,68,56,True,True
25530,2017-11-24,0.01,52,46,True,True
25534,2017-11-28,0.68,47,43,True,True


### Regression Heuristic Model

In [13]:
# Create function to perform our heuristic
def heuristic_prcp(df):
    
    """
    Simple heuristic:
    If it rained yesterday and today is raining then will predict a rain precipitation from 0.02 to 0.08
    If If it didn't rain yesterday and today is raining then will predict a rain precipitation from 0.01 to 0.02
    
    """
    
    preds = []
    for x in range(len(df)):
        # If first two rows then predict false
        if x <2:
            preds.append(0)
        else:
            if ((df["RAIN"].iloc[x] == True) & (df["RAIN"].iloc[x-1] == False)):
                precipitation = round((float(random.randint(100,200)) / 10000),2)
                preds.append(precipitation)
            
            elif ((df["RAIN"].iloc[x] == True) & (df["RAIN"].iloc[x-1] == True) & ( df["RAIN"].iloc[x-2] == True)):
                precipitation = round((float(random.randint(1000,1500)) / 10000),2)
                preds.append(precipitation)
            
            elif ((df["RAIN"].iloc[x] == True) & (df["RAIN"].iloc[x-1] == True)):
                precipitation = round((float(random.randint(200,800)) / 10000),2)
                preds.append(precipitation)
            
   
            else:
                # Predict 0 if the above is not true
                preds.append(0)
    return preds

In [14]:
# add the prcp prediction column to the dataframe
testing['preds_prcp'] = heuristic_prcp(testing)

In [15]:
# show the dataframe
testing

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds_rain,preds_prcp
5,1948-01-06,0.44,48,39,True,False,0.00
27,1948-01-28,0.00,53,25,False,False,0.00
30,1948-01-31,0.21,35,27,True,False,0.01
33,1948-02-03,0.20,38,26,True,False,0.07
36,1948-02-06,0.00,43,20,False,True,0.00
...,...,...,...,...,...,...,...
25519,2017-11-13,0.81,55,45,True,True,0.12
25528,2017-11-22,0.52,68,56,True,True,0.13
25530,2017-11-24,0.01,52,46,True,True,0.10
25534,2017-11-28,0.68,47,43,True,True,0.10


### Model Evaluation Functions

In [16]:
# create a function that calculate the the Precision and Recall for the classification heuristic model
# for the 'RAIN' column
def classification_model_evaluation(true_val, pred_val):
    # calculate the True Positive, False Positive, False Negative using the confusion_matrix
    # to calculate Precision and Recall
    TP = confusion_matrix(testing['RAIN'], testing['preds_rain'])[1][1]
    FP = confusion_matrix(testing['RAIN'], testing['preds_rain'])[0][1]
    FN = confusion_matrix(testing['RAIN'], testing['preds_rain'])[1][0]
    
    # calculate Precision and Recall
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    return "Precision = {val1} , Recall = {val2} ".format(val1 = precision, val2 = recall)


# create a function that calculate the MSE, MAE and SSE for the regression heuristic model
# for the 'PRCP' column    
def regression_model_evaluation(true_val, pred_val):
    # calculate the MSE, MAE, SSE
    SSE = sum((pred_val - true_val)**2)
    MSE = (1/len(true_val))* SSE
    MAE = (1/len(true_val))* sum(abs(pred_val - true_val))
    return "Sum Of Squared Error = {val1} , Mean Squared Error = {val2}, Mean Absolute Error = {val3} ".format(val1 = SSE, val2 = MSE, val3 = MAE)

In [17]:
# passing the actual values 'RAIN' column and the predicted values 'preds_rain' column
# in classification_model_evaluation to show the Precision and Recall
classification_model_evaluation(testing['RAIN'],testing['preds_rain'])

'Precision = 0.5806182121971596 , Recall = 0.3075221238938053 '

In [18]:
# passing the actual values 'PRCP' column and the predicted values 'preds_prcp' column
# in regression_model_evaluation to show the SSE, MSE and MAE
regression_model_evaluation(testing['PRCP'], testing['preds_prcp'])

'Sum Of Squared Error = 310.52300000000105 , Mean Squared Error = 0.06076771037182017, Mean Absolute Error = 0.09588649706457895 '

In [22]:
# we have run the code 5 times and saved the results
# the Precision results for each run
Precision = [0.5924978687127025 , 0.5641263940520446, 0.581981981981982,
             0.5943877551020408, 0.5806182121971596] 

# the Recall results for each run
Recall = [0.31533575317604357, 0.2836448598130841, 0.29962894248608535, 
          0.3207893529141808, 0.3075221238938053] 

# the SSE results for each run
SSE = [273.46180000000106 , 316.4404000000003, 282.87520000000103,
       374.83390000000134, 310.52300000000105] 

# the MSE results for each run
MSE = [0.05351502935420765, 0.06192571428571435, 0.05535718199608631, 
       0.0733530136986304, 0.06076771037182017] 

# the MAE results for each run
MAE = [0.09019569471624228, 0.08919765166340479, 0.09038747553816023, 
       0.09914872798434399, 0.09588649706457895] 

In [20]:
# create a function to calculate the average of a list
def Average(lst):
    return sum(lst) / len(lst)

In [23]:
# print the results
print("The Precision average score is: ",Average(Precision)), 
print("The Recall average score is: ",Average(Recall)),
print("The SSE average score is: ",Average(SSE)),
print("The MSE average score is: ",Average(MSE)), 
print("The MAE average score is: ",Average(MAE))

The Precision average score is:  0.5827224424091859
The Recall average score is:  0.30538420645663983
The SSE average score is:  311.62686000000093
The MSE average score is:  0.060983729941291774
The MAE average score is:  0.09296320939334604
