# Project

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import plot_confusion_matrix
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

In [2]:
df = pd.read_csv('data/weatherAUS.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
df.columns

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')

In [5]:
df["Date"] = pd.to_datetime(df["Date"])   #Converting into Date
df["Year"] = df["Date"].dt.year           #abstracting year in different column 
df["Month"] = df["Date"].dt.month         #abstracting month in diffrent column  
df["Day"] = df["Date"].dt.day

In [6]:
df['Month']

0         12
1         12
2         12
3         12
4         12
          ..
145455     6
145456     6
145457     6
145458     6
145459     6
Name: Month, Length: 145460, dtype: int64

In [7]:
df.drop('Evaporation', axis=1, inplace=True)

In [8]:
df.drop('Date', axis=1, inplace=True)

In [9]:
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,W,44.0,W,WNW,20.0,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,WNW,44.0,NNW,WSW,4.0,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,WSW,46.0,W,WSW,19.0,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,NE,24.0,SE,E,11.0,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,W,41.0,ENE,NW,7.0,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,2.8,23.4,0.0,,E,31.0,SE,ENE,13.0,...,1020.3,,,10.1,22.4,No,No,2017,6,21
145456,Uluru,3.6,25.3,0.0,,NNW,22.0,SE,N,13.0,...,1019.1,,,10.9,24.5,No,No,2017,6,22
145457,Uluru,5.4,26.9,0.0,,N,37.0,SE,WNW,9.0,...,1016.8,,,12.5,26.1,No,No,2017,6,23
145458,Uluru,7.8,27.0,0.0,,SE,28.0,SSE,N,13.0,...,1016.5,3.0,2.0,15.1,26.0,No,No,2017,6,24


In [10]:
df[(df['Month']==1) & (df['Location']=='Albury')]

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
31,Albury,11.3,26.5,0.0,,WNW,56.0,W,WNW,19.0,...,1003.2,,,19.7,25.7,No,No,2009,1,1
32,Albury,9.6,23.9,0.0,,W,41.0,WSW,SSW,19.0,...,1013.1,,,14.9,22.1,No,No,2009,1,2
33,Albury,10.5,28.8,0.0,,SSE,26.0,SSE,E,11.0,...,1014.8,,,17.1,26.5,No,No,2009,1,3
34,Albury,12.3,34.6,0.0,,WNW,37.0,SSE,NW,6.0,...,1010.3,,,20.7,33.9,No,No,2009,1,4
35,Albury,12.9,35.8,0.0,,WNW,41.0,ENE,NW,6.0,...,1009.2,,,22.4,34.4,No,No,2009,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2890,Albury,17.4,35.9,0.0,,SE,28.0,SE,NNE,9.0,...,1011.5,,1.0,23.5,34.4,No,No,2017,1,27
2891,Albury,17.5,36.9,0.0,,WNW,39.0,SE,W,7.0,...,1011.2,,,23.6,35.7,No,No,2017,1,28
2892,Albury,16.8,38.5,0.0,,WNW,39.0,ESE,W,7.0,...,1012.3,,,24.3,37.1,No,No,2017,1,29
2893,Albury,16.4,42.5,0.0,,WSW,72.0,SSE,W,2.0,...,1004.7,,,23.5,40.9,No,No,2017,1,30


In [11]:
df[(df['Sunshine'].isna()) & (df['Location']=='Albury')]

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,W,44.0,W,WNW,20.0,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,WNW,44.0,NNW,WSW,4.0,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,WSW,46.0,W,WSW,19.0,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,NE,24.0,SE,E,11.0,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,W,41.0,ENE,NW,7.0,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,Albury,1.2,15.2,0.4,,ENE,15.0,,NNE,0.0,...,1026.7,8.0,,2.9,14.3,No,No,2017,6,21
3036,Albury,0.8,13.4,0.0,,W,17.0,S,,6.0,...,1025.9,8.0,1.0,3.6,13.3,No,No,2017,6,22
3037,Albury,1.1,11.9,0.0,,SE,44.0,SSE,SSE,9.0,...,1017.7,8.0,1.0,2.7,10.2,No,No,2017,6,23
3038,Albury,1.1,14.1,0.2,,WSW,28.0,SW,W,4.0,...,1017.2,7.0,6.0,3.9,13.1,No,No,2017,6,24


In [12]:
sunshine = {
    'Albury': {1:12.1, 2:11.2, 3:9.7, 4: 8.1, 5:5.9, 6:4.9, 7: 4.6, 8:5.9, 9:7.5, 10:9.5, 11:10.9, 12:12},
    'Ballarat': {1:10.2, 2:9.3, 3:7.9, 4:6.4, 5:4.6, 6:3.8, 7: 3.9, 8:4.6, 9:5.9, 10:7.2, 11:8.4, 12:9.6},
    'Bendigo': {1:11.9, 2:11.1, 3:9.5, 4:8, 5:5.8, 6:4.9, 7:4.8, 8:5.7, 9:7.2, 10:8.9, 11:10.4, 12:11.6},
    'GoldCoast': {1:9.1, 2:8.8, 3:8.4, 4:8.4, 5:8.3, 6:7.8, 7:8.3, 8:8.8, 9:9.2, 10:9.1, 11:9.5, 12:9.6},
    'Katherine': {1:8.7, 2:8.6, 3:9.1, 4:9.7, 5:9.9, 6:9.9, 7:10.1, 8:10.4, 9:10.7, 10:11, 11:11, 12:10.1},
    'Launceston': {1:9.6, 2:8, 3:6.9, 4:5.6, 5:4.9, 6:4.3, 7:4.4, 8:4.9, 9:5.9, 10:7, 11:7.9, 12:9.2},
    'Newcastle':{1:9.5, 2:9.2, 3:8.6, 4:8.1, 5:8, 6:7.4, 7:7.8, 8:8.6, 9:9.4, 10:9.4, 11:9.7, 12:10.1},
    'Nhil': {1:11.6, 2:10.8, 3:9.1, 4:7.6, 5:5.7, 6:5.1, 7:5.1, 8:6, 9:7, 10:8.5, 11:10, 12:11},
    'Penrith': {1:8.1, 2:7.3, 3:7.1, 4:7.4, 5:7.7, 6:7, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.8},
    'Richmond': {1:8.1, 2:7.4, 3:7.2, 4:7.3, 5:7.6, 6:6.9, 7:7.6, 8:8.4, 9:8.9, 10:8.7, 11:8.5, 12:8.9},
    'SalmonGums': {1:9.9, 2:8.8, 3:7.9, 4:7.4, 5:6.4, 6:6.2, 7:5.8, 8:6.5, 9:7.7, 10:8.5, 11:9.5, 12:10.2},
    'Tuggeranong': {1:9.9, 2:8.6, 3:8, 4:7, 5:6.2, 6:5, 7:5.4, 8:6.4, 9:7.7, 10:8.6, 11:9.3, 12:10.2},
    'Uluru': {1:9.2, 2:8.6, 3:8.6, 4:8.1, 5:7.2, 6:6.3, 7:7.3, 8:8, 9:8.6, 10:9.4, 11:9.1, 12:9.4},
    'Witchcliffe': {1:9.6, 2:9.1, 3:7.9, 4:6.9, 5:6, 6:5.8, 7:5.5, 8:5.8, 9:5.9, 10:6.7, 11:8.6, 12:9.1},
    'Wollongong': {1:8.9, 2:8.3, 3:8, 4:8, 5:8.1, 6:7.4, 7:7.9, 8:8.5, 9:9.1, 10:9.1, 11:9.2, 12:9.5}
}

In [32]:
sunshine['Albury']

{1: 12.1,
 2: 11.2,
 3: 9.7,
 4: 8.1,
 5: 5.9,
 6: 4.9,
 7: 4.6,
 8: 5.9,
 9: 7.5,
 10: 9.5,
 11: 10.9,
 12: 12}

In [48]:
df['Sunshine'] = df['Sunshine'].fillna(sunshine)

In [52]:
sunshine.keys()

dict_keys(['Albury', 'Ballarat', 'Bendigo', 'GoldCoast', 'Katherine', 'Launceston', 'Newcastle', 'Nhil', 'Penrith', 'Richmond', 'SalmonGums', 'Tuggeranong', 'Uluru', 'Witchcliffe', 'Wollongong'])

In [47]:
df

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,W,44.0,W,WNW,20.0,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,WNW,44.0,NNW,WSW,4.0,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,WSW,46.0,W,WSW,19.0,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,NE,24.0,SE,E,11.0,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,W,41.0,ENE,NW,7.0,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,Uluru,2.8,23.4,0.0,,E,31.0,SE,ENE,13.0,...,1020.3,,,10.1,22.4,No,No,2017,6,21
145456,Uluru,3.6,25.3,0.0,,NNW,22.0,SE,N,13.0,...,1019.1,,,10.9,24.5,No,No,2017,6,22
145457,Uluru,5.4,26.9,0.0,,N,37.0,SE,WNW,9.0,...,1016.8,,,12.5,26.1,No,No,2017,6,23
145458,Uluru,7.8,27.0,0.0,,SE,28.0,SSE,N,13.0,...,1016.5,3.0,2.0,15.1,26.0,No,No,2017,6,24


In [56]:
# for null Sunshine in each Location based on Month, change to average values
# for location in df.groupby('location'):
    # if sunshine==nan:
    # replace nan with value in location[key]

for location in df.groupby('Location'):
    if location in sunshine.keys():
        for month, hours in sunshine[location]:
            df['Sunshine'].fillna(hours)
        

TypeError: 'DataFrame' objects are mutable, thus they cannot be hashed

In [14]:
df[df['Location']=='Albury']

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,Year,Month,Day
0,Albury,13.4,22.9,0.6,,W,44.0,W,WNW,20.0,...,1007.1,8.0,,16.9,21.8,No,No,2008,12,1
1,Albury,7.4,25.1,0.0,,WNW,44.0,NNW,WSW,4.0,...,1007.8,,,17.2,24.3,No,No,2008,12,2
2,Albury,12.9,25.7,0.0,,WSW,46.0,W,WSW,19.0,...,1008.7,,2.0,21.0,23.2,No,No,2008,12,3
3,Albury,9.2,28.0,0.0,,NE,24.0,SE,E,11.0,...,1012.8,,,18.1,26.5,No,No,2008,12,4
4,Albury,17.5,32.3,1.0,,W,41.0,ENE,NW,7.0,...,1006.0,7.0,8.0,17.8,29.7,No,No,2008,12,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3035,Albury,1.2,15.2,0.4,,ENE,15.0,,NNE,0.0,...,1026.7,8.0,,2.9,14.3,No,No,2017,6,21
3036,Albury,0.8,13.4,0.0,,W,17.0,S,,6.0,...,1025.9,8.0,1.0,3.6,13.3,No,No,2017,6,22
3037,Albury,1.1,11.9,0.0,,SE,44.0,SSE,SSE,9.0,...,1017.7,8.0,1.0,2.7,10.2,No,No,2017,6,23
3038,Albury,1.1,14.1,0.2,,WSW,28.0,SW,W,4.0,...,1017.2,7.0,6.0,3.9,13.1,No,No,2017,6,24
