## 1. Introduction to Logistic Regression 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

import os 
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.
# Consider narrowing the filter to the specific category that is noisy.
warnings.filterwarnings('ignore')

# Import dataset

In [3]:
# Load the weather observations; the path is relative to the notebook's
# working directory, so the kernel must be started from this folder.
df = pd.read_csv('../data/weatherAUS.csv')
# Preview the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


# Exploratory data analysis

In [5]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(259908, 24)

In [8]:
# For each column: does it contain at least one missing value?
df.isna().any()

Date             False
Location         False
MinTemp           True
MaxTemp           True
Rainfall          True
Evaporation       True
Sunshine          True
WindGustDir       True
WindGustSpeed     True
WindDir9am        True
WindDir3pm        True
WindSpeed9am      True
WindSpeed3pm      True
Humidity9am       True
Humidity3pm       True
Pressure9am       True
Pressure3pm       True
Cloud9am          True
Cloud3pm          True
Temp9am           True
Temp3pm           True
RainToday         True
RISK_MM           True
RainTomorrow      True
dtype: bool

In [7]:
# Keep a reference to the column labels and display them.
col_names = df.columns 
col_names

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RISK_MM', 'RainTomorrow'],
      dtype='object')


# Drop RISK_MM variable

The dataset description states that the `RISK_MM` feature variable should be dropped, so we remove it as follows:

In [9]:
# Remove the RISK_MM column from the working frame.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# makes this cell idempotent: running it again after the column is already
# gone no longer raises KeyError.
df = df.drop(columns=['RISK_MM'], errors='ignore')


In [10]:
# Per-column non-null counts and dtypes of the frame after RISK_MM was dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 259908 entries, 0 to 259907
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           259908 non-null  object 
 1   Location       259908 non-null  object 
 2   MinTemp        255308 non-null  float64
 3   MaxTemp        255373 non-null  float64
 4   Rainfall       251949 non-null  float64
 5   Evaporation    111743 non-null  float64
 6   Sunshine       101224 non-null  float64
 7   WindGustDir    240668 non-null  object 
 8   WindGustSpeed  240862 non-null  float64
 9   WindDir9am     239650 non-null  object 
 10  WindDir3pm     248385 non-null  object 
 11  WindSpeed9am   254453 non-null  float64
 12  WindSpeed3pm   250031 non-null  float64
 13  Humidity9am    253840 non-null  float64
 14  Humidity3pm    248930 non-null  float64
 15  Pressure9am    231248 non-null  float64
 16  Pressure3pm    231270 non-null  float64
 17  Cloud9am       142142 non-nul

# Types of variables

Columns stored with the `object` dtype are treated as categorical variables.

In [None]:
# A column counts as categorical when pandas stores it with the object
# dtype ('O'); select those labels with a boolean mask over df.dtypes,
# which keeps them in their original column order.
categorical = df.dtypes[df.dtypes == 'O'].index.tolist()

print('There are {} categorical variables\n'.format(len(categorical)))

print('The categorical variables are :', categorical)

There are 7 categorical variables

The categorical variables are : ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [13]:
# Preview the first five rows, restricted to the categorical columns.
df.loc[:, categorical].head(5)

Unnamed: 0,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,W,W,WNW,No,No
1,2008-12-02,Albury,WNW,NNW,WSW,No,No
2,2008-12-03,Albury,WSW,W,WSW,No,No
3,2008-12-04,Albury,NE,SE,E,No,No
4,2008-12-05,Albury,W,ENE,NW,No,No


In [14]:
# Categorical columns that contain at least one missing value.
cat1 = [var for var in categorical if df[var].isna().any()]

# Number of missing values in each of those columns.
df[cat1].isna().sum()

WindGustDir     19240
WindDir9am      20258
WindDir3pm      11523
RainToday        7959
RainTomorrow     7959
dtype: int64

# Frequency counts of categorical variables

In [16]:
# Print how often each distinct value occurs in every categorical column.
# value_counts() is called with its defaults, so (per the pandas docs)
# missing values are not counted and rows are sorted by descending frequency.
# NOTE(review): 'Date' has thousands of distinct values, so its table is
# mostly noise; pandas truncates the printed Series.
for var in categorical:
    print(df[var].value_counts())

Date
2020-07-03    49
2020-06-18    49
2020-06-19    49
2020-06-20    49
2020-06-21    49
              ..
2008-01-11     1
2008-01-12     1
2008-01-13     1
2008-01-14     1
2008-01-31     1
Name: count, Length: 5841, dtype: int64
Location
Canberra            5841
Sydney              5749
Hobart              5598
Brisbane            5598
Melbourne           5598
Darwin              5598
Perth               5597
Wollongong          5445
Albury              5445
GoldCoast           5445
Townsville          5445
Cairns              5445
AliceSprings        5445
Launceston          5445
MountGambier        5444
Tuggeranong         5444
Penrith             5444
Albany              5444
Ballarat            5443
Bendigo             5436
MountGinini         5433
Sale                5414
MelbourneAirport    5414
Watsonia            5414
WaggaWagga          5414
CoffsHarbour        5414
Moree               5414
Cobar               5414
Williamtown         5414
NorfolkIsland       5414
Richmond 

In [17]:
# Print each value's share of ALL rows for every categorical column.
# The denominator is len(df), so rows where the column is missing still
# count towards the total and the shares of a column with gaps sum to < 1.
for var in categorical:
    print(df[var].value_counts().div(len(df)))

Date
2020-07-03    0.000189
2020-06-18    0.000189
2020-06-19    0.000189
2020-06-20    0.000189
2020-06-21    0.000189
                ...   
2008-01-11    0.000004
2008-01-12    0.000004
2008-01-13    0.000004
2008-01-14    0.000004
2008-01-31    0.000004
Name: count, Length: 5841, dtype: float64
Location
Canberra            0.022473
Sydney              0.022119
Hobart              0.021538
Brisbane            0.021538
Melbourne           0.021538
Darwin              0.021538
Perth               0.021535
Wollongong          0.020950
Albury              0.020950
GoldCoast           0.020950
Townsville          0.020950
Cairns              0.020950
AliceSprings        0.020950
Launceston          0.020950
MountGambier        0.020946
Tuggeranong         0.020946
Penrith             0.020946
Albany              0.020946
Ballarat            0.020942
Bendigo             0.020915
MountGinini         0.020904
Sale                0.020830
MelbourneAirport    0.020830
Watsonia            0.02