# Project

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import classification_report, plot_confusion_matrix

In [2]:
df = pd.read_csv('data/weatherAUS.csv')

In [3]:
df["Date"] = pd.to_datetime(df["Date"])   # Converting into DateTime
df["Month"] = df["Date"].dt.month         # Abstracting month in new column 

In [4]:
df.drop('Evaporation', axis=1, inplace=True)    # Dropping 'Evaporation' because there are too many null values
df.drop('Date', axis=1, inplace=True)           # Dropping 'Date' because we will only use 'Month'

In [5]:
sunshine = {
    'Albury': {1:12.1, 2:11.2, 3:9.7, 4: 8.1, 5:5.9, 6:4.9, 7: 4.6, 8:5.9, 9:7.5, 10:9.5, 11:10.9, 12:12},
    'Ballarat': {1:10.2, 2:9.3, 3:7.9, 4:6.4, 5:4.6, 6:3.8, 7: 3.9, 8:4.6, 9:5.9, 10:7.2, 11:8.4, 12:9.6},
    'Bendigo': {1:11.9, 2:11.1, 3:9.5, 4:8, 5:5.8, 6:4.9, 7:4.8, 8:5.7, 9:7.2, 10:8.9, 11:10.4, 12:11.6},
    'GoldCoast': {1:9.1, 2:8.8, 3:8.4, 4:8.4, 5:8.3, 6:7.8, 7:8.3, 8:8.8, 9:9.2, 10:9.1, 11:9.5, 12:9.6},
    'Katherine': {1:8.7, 2:8.6, 3:9.1, 4:9.7, 5:9.9, 6:9.9, 7:10.1, 8:10.4, 9:10.7, 10:11, 11:11, 12:10.1},
    'Launceston': {1:9.6, 2:8, 3:6.9, 4:5.6, 5:4.9, 6:4.3, 7:4.4, 8:4.9, 9:5.9, 10:7, 11:7.9, 12:9.2},
    'Newcastle': {1:9.5, 2:9.2, 3:8.6, 4:8.1, 5:8, 6:7.4, 7:7.8, 8:8.6, 9:9.4, 10:9.4, 11:9.7, 12:10.1},
    'Nhil': {1:11.6, 2:10.8, 3:9.1, 4:7.6, 5:5.7, 6:5.1, 7:5.1, 8:6, 9:7, 10:8.5, 11:10, 12:11},
    'Penrith': {1:8.1, 2:7.3, 3:7.1, 4:7.4, 5:7.7, 6:7, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.8},
    'Richmond': {1:8.1, 2:7.4, 3:7.2, 4:7.3, 5:7.6, 6:6.9, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.9},
    'SalmonGums': {1:9.9, 2:8.8, 3:7.9, 4:7.4, 5:6.4, 6:6.2, 7:5.8, 8:6.5, 9:7.7, 10:8.5, 11:9.5, 12:10.2},
    'Tuggeranong': {1:9.9, 2:8.6, 3:8, 4:7, 5:6.2, 6:5, 7:5.4, 8:6.4, 9:7.7, 10:8.6, 11:9.3, 12:10.2},
    'Uluru': {1:9.2, 2:8.6, 3:8.6, 4:8.1, 5:7.2, 6:6.3, 7:7.3, 8:8, 9:8.6, 10:9.4, 11:9.1, 12:9.4},
    'Witchcliffe': {1:9.6, 2:9.1, 3:7.9, 4:6.9, 5:6, 6:5.8, 7:5.5, 8:5.8, 9:5.9, 10:6.7, 11:8.6, 12:9.1},
    'Wollongong': {1:8.9, 2:8.3, 3:8, 4:8, 5:8.1, 6:7.4, 7:7.9, 8:8.5, 9:9.1, 10:9.1, 11:9.2, 12:9.5}
}

albury = {1:12.1, 2:11.2, 3:9.7, 4: 8.1, 5:5.9, 6:4.9, 7: 4.6, 8:5.9, 9:7.5, 10:9.5, 11:10.9, 12:12}
ballarat = {1:10.2, 2:9.3, 3:7.9, 4:6.4, 5:4.6, 6:3.8, 7: 3.9, 8:4.6, 9:5.9, 10:7.2, 11:8.4, 12:9.6}
bendigo = {1:11.9, 2:11.1, 3:9.5, 4:8, 5:5.8, 6:4.9, 7:4.8, 8:5.7, 9:7.2, 10:8.9, 11:10.4, 12:11.6}
goldcoast = {1:9.1, 2:8.8, 3:8.4, 4:8.4, 5:8.3, 6:7.8, 7:8.3, 8:8.8, 9:9.2, 10:9.1, 11:9.5, 12:9.6}
katherine = {1:8.7, 2:8.6, 3:9.1, 4:9.7, 5:9.9, 6:9.9, 7:10.1, 8:10.4, 9:10.7, 10:11, 11:11, 12:10.1}
launceston = {1:9.6, 2:8, 3:6.9, 4:5.6, 5:4.9, 6:4.3, 7:4.4, 8:4.9, 9:5.9, 10:7, 11:7.9, 12:9.2}
newcastle = {1:9.5, 2:9.2, 3:8.6, 4:8.1, 5:8, 6:7.4, 7:7.8, 8:8.6, 9:9.4, 10:9.4, 11:9.7, 12:10.1}
nhil = {1:11.6, 2:10.8, 3:9.1, 4:7.6, 5:5.7, 6:5.1, 7:5.1, 8:6, 9:7, 10:8.5, 11:10, 12:11}
penrith = {1:8.1, 2:7.3, 3:7.1, 4:7.4, 5:7.7, 6:7, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.8}
richmond = {1:8.1, 2:7.4, 3:7.2, 4:7.3, 5:7.6, 6:6.9, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.9}
salmongums = {1:9.9, 2:8.8, 3:7.9, 4:7.4, 5:6.4, 6:6.2, 7:5.8, 8:6.5, 9:7.7, 10:8.5, 11:9.5, 12:10.2}
tuggeranong = {1:9.9, 2:8.6, 3:8, 4:7, 5:6.2, 6:5, 7:5.4, 8:6.4, 9:7.7, 10:8.6, 11:9.3, 12:10.2}
uluru = {1:9.2, 2:8.6, 3:8.6, 4:8.1, 5:7.2, 6:6.3, 7:7.3, 8:8, 9:8.6, 10:9.4, 11:9.1, 12:9.4}
witchcliffe = {1:9.6, 2:9.1, 3:7.9, 4:6.9, 5:6, 6:5.8, 7:5.5, 8:5.8, 9:5.9, 10:6.7, 11:8.6, 12:9.1}
wollongong = {1:8.9, 2:8.3, 3:8, 4:8, 5:8.1, 6:7.4, 7:7.9, 8:8.5, 9:9.1, 10:9.1, 11:9.2, 12:9.5}

In [6]:
# for null Sunshine in each Location based on Month, change to average values
# for location in df.groupby('location'):
    # if sunshine==nan:
    # replace nan with value in sunshine[location][month]
    
    
    # df[(df['Location']=='Albury') & (df['Sunshine'].isna())]

# groupby location & month, change sunshine null to average based on month

#if list(df['Sunshine'].isna()):
    #df.groupby(['Location'])
    #for loc, sun in sunshine.items():
        #for month, hours in sun.items():
            #df['Sunshine'] = df['Month'].map(sunshine)

#if 'Albury' in df['Location']:
    #if list(df['Sunshine'].isna()):
        #df['Sunshine'] = df['Month'].map(albury)

In [35]:
df.loc[df['Location'].isin(sunshine.keys()), 'Sunshine'] = df['Month'].map(sunshine)

In [36]:
df[df['Location']=='Albury']

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Month
0,Albury,13.4,22.9,0.6,,W,44.0,W,WNW,20.0,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No,12
1,Albury,7.4,25.1,0.0,,WNW,44.0,NNW,WSW,4.0,...,25.0,1010.6,1007.8,,,17.2,24.3,No,No,12
2,Albury,12.9,25.7,0.0,,WSW,46.0,W,WSW,19.0,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No,12
3,Albury,9.2,28.0,0.0,,NE,24.0,SE,E,11.0,...,16.0,1017.6,1012.8,,,18.1,26.5,No,No,12
4,Albury,17.5,32.3,1.0,,W,41.0,ENE,NW,7.0,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,Albury,1.2,15.2,0.4,,ENE,15.0,,NNE,0.0,...,62.0,1029.4,1026.7,8.0,,2.9,14.3,No,No,6
3036,Albury,0.8,13.4,0.0,,W,17.0,S,,6.0,...,66.0,1029.4,1025.9,8.0,1.0,3.6,13.3,No,No,6
3037,Albury,1.1,11.9,0.0,,SE,44.0,SSE,SSE,9.0,...,81.0,1022.3,1017.7,8.0,1.0,2.7,10.2,No,No,6
3038,Albury,1.1,14.1,0.2,,WSW,28.0,SW,W,4.0,...,49.0,1018.8,1017.2,7.0,6.0,3.9,13.1,No,No,6


In [None]:
df[df['Location']=='Bendigo']

In [None]:
if 'Ballarat' in df['Location']:
    if list(df['Sunshine'].isna()):
        df['Sunshine'] = df['Month'].map(ballarat)

In [None]:
if list(df['Sunshine'].isna()):
    df['Sunshine'] = df['Month'].map(albury)

In [None]:
df.groupby(['Location', 'Month'])

In [None]:
for location in df.groupby('Location'):
    if (location == 'Albury') and df['Sunshine'].isna():
        df['Sunshine'].fillna(albury, inplace=True)