In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time
import datetime

# Load Data
df = pd.read_csv('seattle_weather_1948-2017.csv')

In [2]:
df.head()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,1948-01-01,0.47,51,42,True
1,1948-01-02,0.59,45,36,True
2,1948-01-03,0.42,45,35,True
3,1948-01-04,0.31,45,34,True
4,1948-01-05,0.17,45,32,True


### Create functions to handle NaN values

In [3]:
def RAIN_INSERTION(cols):
    """
    Impute False where NaN values are present
    """
    
    RAIN=cols[0]
    if pd.isnull(RAIN):
        return False
    else:
        return RAIN

def PRCP_INSERTION(col):
    """
    Insert the Mean of PRCP where NaN values are present
    """
    PRCP=col[0]
    if pd.isnull(PRCP):
        return df['PRCP'].mean()
    else:
        return PRCP

In [4]:
# Apply the Functions
df["RAIN"] = df[["RAIN"]].apply(RAIN_INSERTION, axis=1) # axis =1 means apply to columns 
df['PRCP']=df[['PRCP']].apply(PRCP_INSERTION,axis=1)


# First heuristic

- If it rained yesterday or last week then it will rain today

In [27]:
def heuristic(df):
    """
    Simple heuristic:
    
    If it rained the past two days then predict rain else predict no rain
    
    Frist two rows are predicted false be default
    """
    
    preds = []
    
    for x in range(len(df)):
        # If first two rows then predict false
        if x<2:
            preds.append(False)
        else:
            # If either of last two days == True then predict True
            if (df.iloc[x-1]["RAIN"] == True) | (df.iloc[x-7]["RAIN"] == True):
                preds.append(True)
            else:
                preds.append(False)
    return preds

In [28]:
df["preds"] = heuristic(df)

In [29]:
df.tail()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
25546,2017-12-10,0.0,49,34,False,True,0.0,0.0,0.0,1.0
25547,2017-12-11,0.0,49,29,False,False,0.0,0.0,0.0,1.0
25548,2017-12-12,0.0,46,32,False,False,0.0,0.0,0.0,1.0
25549,2017-12-13,0.0,48,34,False,False,0.0,0.0,0.0,1.0
25550,2017-12-14,0.0,50,36,False,False,0.0,0.0,0.0,1.0


In [30]:
# Determine Accuracy

# Create function to to find values

def calc_confuse(df):
    
    "Calculate all possible results of a confusion matrix"

    # Hold all possible values and set to zero
    FP = np.zeros(len(df))
    TP = np.zeros(len(df))
    FN = np.zeros(len(df))
    TN = np.zeros(len(df))
    
    for x in range(len(df)):
        
        # True Positive
        if (df["RAIN"].iloc[x] == True) & (df["preds"].iloc[x] == True):
            TP[x] = 1
        # True Negative
        elif (df["RAIN"].iloc[x] == False) & (df["preds"].iloc[x] == False):
            TN[x] = 1
        # False Negative
        elif (df["RAIN"].iloc[x] == True) & (df["preds"].iloc[x] == False):
            FN[x] = 1
        # False Positive
        else:
            FP[x] = 1
    
    return FP, TP, FN, TN

In [31]:
np.zeros(len(df))

array([0., 0., 0., ..., 0., 0., 0.])

In [32]:
# Extract results and create columns for each
w,x,y,z = calc_confuse(df)

df["FP"] = w
df["TP"] = x
df["FN"] = y
df["TN"] = z

# Look at 10 random rows to determin accuracy
df.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
13850,1985-12-02,0.91,42,25,True,True,0.0,1.0,0.0,0.0
2126,1953-10-27,0.0,62,41,False,True,1.0,0.0,0.0,0.0
7517,1968-07-31,0.0,91,59,False,False,0.0,0.0,0.0,1.0
20544,2004-03-31,0.0,51,38,False,True,1.0,0.0,0.0,0.0
8113,1970-03-19,0.0,58,34,False,True,1.0,0.0,0.0,0.0
17735,1996-07-22,0.0,83,58,False,False,0.0,0.0,0.0,1.0
21613,2007-03-05,0.06,59,46,True,True,0.0,1.0,0.0,0.0
4347,1959-11-26,0.0,48,36,False,True,1.0,0.0,0.0,0.0
8869,1972-04-13,0.0,55,40,False,True,1.0,0.0,0.0,0.0
12299,1981-09-03,0.0,73,53,False,False,0.0,0.0,0.0,1.0


In [33]:
# Heuristic Model
(sum(df["TP"]) + sum(df["TN"]))/ len(df)

0.6249853234707057

In [34]:
df["RAIN"].value_counts(normalize=True)

False    0.573402
True     0.426598
Name: RAIN, dtype: float64

# Second Heuristic

- If it TMIN yesterday and last week Equal or less than 30 then it will rain today

In [5]:
def heuristic(df):
    """
    Simple heuristic:
    
    If it rained the past two days then predict rain else predict no rain
    
    Frist two rows are predicted false be default
    """
    
    preds = []
    
    for x in range(len(df)):
        # If first two rows then predict false
        if x<2:
            preds.append(False)
        else:
            # If either of last two days == True then predict True
            if (df.iloc[x-1]["TMIN"] <= 30) & (df.iloc[x-7]["TMIN"] <= 30 ):
                preds.append(True)
            else:
                preds.append(False)
    return preds

In [6]:
df["preds"] = heuristic(df)

In [41]:
df.tail()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
25546,2017-12-10,0.0,49,34,False,False,1.0,0.0,0.0,0.0
25547,2017-12-11,0.0,49,29,False,False,0.0,0.0,0.0,1.0
25548,2017-12-12,0.0,46,32,False,False,0.0,0.0,0.0,1.0
25549,2017-12-13,0.0,48,34,False,False,0.0,0.0,0.0,1.0
25550,2017-12-14,0.0,50,36,False,False,0.0,0.0,0.0,1.0


In [7]:
def calc_confuse(df):
    
    "Calculate all possible results of a confusion matrix"

    # Hold all possible values and set to zero
    FP = np.zeros(len(df))
    TP = np.zeros(len(df))
    FN = np.zeros(len(df))
    TN = np.zeros(len(df))
    
    for x in range(len(df)):
        
        # True Positive
        if (df["TMIN"].iloc[x] <= 30) & (df["preds"].iloc[x] == True):
            TP[x] = 1
        # True Negative
        elif (df["TMIN"].iloc[x] > 30) & (df["preds"].iloc[x] == False):
            TN[x] = 1
        # False Negative
        elif (df["TMIN"].iloc[x] <= 30) & (df["preds"].iloc[x] == False):
            FN[x] = 1
        # False Positive
        else:
            FP[x] = 1
    
    return FP, TP, FN, TN

In [43]:
np.zeros(len(df))

array([0., 0., 0., ..., 0., 0., 0.])

In [8]:
# Extract results and create columns for each
w,x,y,z = calc_confuse(df)

df["FP"] = w
df["TP"] = x
df["FN"] = y
df["TN"] = z

# Look at 10 random rows to determin accuracy
df.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
24443,2014-12-03,0.0,50,32,False,False,0.0,0.0,0.0,1.0
18226,1997-11-25,0.11,51,39,True,False,0.0,0.0,0.0,1.0
21128,2005-11-05,0.79,45,41,True,False,0.0,0.0,0.0,1.0
8817,1972-02-21,0.0,50,36,False,False,0.0,0.0,0.0,1.0
4700,1960-11-13,0.78,50,39,True,False,0.0,0.0,0.0,1.0
974,1950-09-01,0.0,83,51,False,False,0.0,0.0,0.0,1.0
23669,2012-10-20,0.02,52,43,True,False,0.0,0.0,0.0,1.0
18028,1997-05-11,0.0,81,53,False,False,0.0,0.0,0.0,1.0
4375,1959-12-24,0.42,46,38,True,False,0.0,0.0,0.0,1.0
16176,1992-04-15,0.05,65,49,True,False,0.0,0.0,0.0,1.0


In [45]:
# Heuristic Model
(sum(df["TP"]) + sum(df["TN"]))/ len(df)

0.9490430902900082

In [10]:
# Create a Mean Squared Error Function
def mse(y_true, y_pred):
    mse1=np.square(np.subtract(y_true,y_pred)).mean()
    return mse1

#mse(y_test_r, y_pred_r)