In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import time
import datetime

# Load Data
df = pd.read_csv('seattle_weather_1948-2017.csv')

### Create functions to handle NaN values

In [2]:
def RAIN_INSERTION(cols):
    """
    Impute False where NaN values are present
    """
    
    RAIN=cols[0]
    if pd.isnull(RAIN):
        return False
    else:
        return RAIN

def PRCP_INSERTION(col):
    """
    Insert the Mean of PRCP where NaN values are present
    """
    PRCP=col[0]
    if pd.isnull(PRCP):
        return df['PRCP'].mean()
    else:
        return PRCP

In [3]:
# Apply the Functions
df["RAIN"] = df[["RAIN"]].apply(RAIN_INSERTION, axis=1) # axis =1 means apply to columns 
df['PRCP']=df[['PRCP']].apply(PRCP_INSERTION,axis=1)

## The Hueristic

- If it rained yesterday or last week then it will rain today

In [12]:
def heuristic(df):
    """
    Simple heuristic:
    
    If it rained the past two days then predict rain else predict no rain
    
    Frist two rows are predicted false be default
    """
    
    preds = []
    
    for x in range(len(df)):
        # If first two rows then predict false
        if x<2:
            preds.append(False)
        else:
            # If either of last two days == True then predict True
            if (df.iloc[x-1]["TMIN"] <= 40) | (df.iloc[x-7]["TMIN"] <= True):
                preds.append(True)
            else:
                preds.append(False)
    return preds

In [13]:
df["preds"] = heuristic(df)

In [14]:
df.tail()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
25546,2017-12-10,0.0,49,34,False,True,1.0,0.0,0.0,0.0
25547,2017-12-11,0.0,49,29,False,True,0.0,0.0,0.0,1.0
25548,2017-12-12,0.0,46,32,False,True,0.0,0.0,0.0,1.0
25549,2017-12-13,0.0,48,34,False,True,0.0,0.0,0.0,1.0
25550,2017-12-14,0.0,50,36,False,True,0.0,0.0,0.0,1.0


In [15]:
# Determine Accuracy

# Create function to to find values

def calc_confuse(df):
    
    "Calculate all possible results of a confusion matrix"

    # Hold all possible values and set to zero
    FP = np.zeros(len(df))
    TP = np.zeros(len(df))
    FN = np.zeros(len(df))
    TN = np.zeros(len(df))
    
    for x in range(len(df)):
        
        # True Positive
        if (df["TMIN"].iloc[x] <= 40) & (df["preds"].iloc[x] == True):
            TP[x] = 1
        # True Negative
        elif (df["TMIN"].iloc[x] > 40) & (df["preds"].iloc[x] == False):
            TN[x] = 1
        # False Negative
        elif (df["TMIN"].iloc[x] <= 40) & (df["preds"].iloc[x] == False):
            FN[x] = 1
        # False Positive
        else:
            FP[x] = 1
    
    return FP, TP, FN, TN

In [16]:
# Extract results and create columns for each
w,x,y,z = calc_confuse(df)

df["FP"] = w
df["TP"] = x
df["FN"] = y
df["TN"] = z

# Look at 10 random rows to determin accuracy
df.sample(10)

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN,preds,FP,TP,FN,TN
1452,1951-12-23,0.0,38,26,False,True,0.0,1.0,0.0,0.0
24361,2014-09-12,0.0,76,55,False,False,0.0,0.0,0.0,1.0
22985,2010-12-06,0.01,49,38,True,True,0.0,1.0,0.0,0.0
2477,1954-10-13,0.0,55,35,False,False,0.0,0.0,1.0,0.0
1335,1951-08-28,0.2,57,48,True,False,0.0,0.0,0.0,1.0
8973,1972-07-26,0.0,73,55,False,False,0.0,0.0,0.0,1.0
19087,2000-04-04,0.0,54,39,False,False,0.0,0.0,1.0,0.0
18171,1997-10-01,0.5,62,51,True,False,0.0,0.0,0.0,1.0
8575,1971-06-24,0.86,54,47,True,False,0.0,0.0,0.0,1.0
14687,1988-03-18,0.0,66,36,False,True,0.0,1.0,0.0,0.0


In [17]:
# Heuristic Model
(sum(df["TP"]) + sum(df["TN"]))/ len(df)

0.8734687487769559

In [18]:
df["RAIN"].value_counts(normalize=True)

False    0.573402
True     0.426598
Name: RAIN, dtype: float64