In [38]:
#  implementing a predictive model on Rain Dataset to predict whether or not it will rain tomorrow in Australia

# Dataset contains about 10 years of daily weather observations of different locations in Australia.

#  Problem Statement: Design a predictive model with the use of machine learning algorithms 
#                       to forecast whether or not it will rain tomorrow in Australia.

#  Data Source: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package:

#   Dataset Description:
#    Number of columns: 23
#    Number of rows: 145460
#    Number of Independent Columns: 22
#    Number of Dependent Column: 1



#  Data Preprocessing
#  Finding categorical and Numerical features in Dataset
#  Cardinality check for categorical features
#  Handling Missing values
#  Outlier detection and treatment
#  Exploratory Data Analysis
#  Encoding categorical features
#  Correlation
#  Feature Importance
#  Splitting Data into Training and Testing sets
#  Feature Scaling
#  Model Building and Evaluation
#  Results and Conclusion
#  Save Model and Scaling object with Pickle

In [39]:
#  Importing Necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Loading the Dataset
dataset = 'weatherAUS.csv'
rain = pd.read_csv(dataset)

In [40]:
# Checking the Column Names 
print (rain.head())

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [41]:
# Checking the Dimension of the Dataset - 
#  As we can see from the results that the dataset has 23 variables and 145460 records
print(rain.shape)

(145460, 23)


In [42]:
# Data Preprocessing 
# Real-world data is often messy, incomplete, unstructured, inconsistent, redundant, sprinkled with wacky values. So, without deploying any Data Preprocessing techniques, it is almost impossible to gain insights from raw data.

# What exactly is Data Preprocessing?

#  Data preprocessing is a process of converting raw data to a suitable format to extract insights. 
#  It is the first and foremost step in the Data Science life cycle. 
#  Data Preprocessing makes sure that data is clean, organize and read-to-feed to the Machine Learning model.

# Consise Summary of the Dataset
print(rain.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [43]:
# We can see that except for the Date and The Location columns - every dataset has a missing value 
# We would now be generating "DESCRIPTIVE STATISTICS" of the dataset using the describe () function in pandas 
# The data values that are 'object' will be ommitted through the exclude = object arguement within describe() function
rain.describe(exclude=object).transpose()


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MinTemp,143975.0,12.194034,6.398495,-8.5,7.6,12.0,16.9,33.9
MaxTemp,144199.0,23.221348,7.119049,-4.8,17.9,22.6,28.2,48.1
Rainfall,142199.0,2.360918,8.47806,0.0,0.0,0.0,0.8,371.0
Evaporation,82670.0,5.468232,4.193704,0.0,2.6,4.8,7.4,145.0
Sunshine,75625.0,7.611178,3.785483,0.0,4.8,8.4,10.6,14.5
WindGustSpeed,135197.0,40.03523,13.607062,6.0,31.0,39.0,48.0,135.0
WindSpeed9am,143693.0,14.043426,8.915375,0.0,7.0,13.0,19.0,130.0
WindSpeed3pm,142398.0,18.662657,8.8098,0.0,13.0,19.0,24.0,87.0
Humidity9am,142806.0,68.880831,19.029164,0.0,57.0,70.0,83.0,100.0
Humidity3pm,140953.0,51.539116,20.795902,0.0,37.0,52.0,66.0,100.0


In [44]:
rain.describe(include=object).transpose()

Unnamed: 0,count,unique,top,freq
Date,145460,3436,2013-07-18,49
Location,145460,49,Canberra,3436
WindGustDir,135134,16,W,9915
WindDir9am,134894,16,N,11758
WindDir3pm,141232,16,SE,10838
RainToday,142199,2,No,110319
RainTomorrow,142193,2,No,110316


In [45]:
# Finding Categorical and Numerical Features in the Dataset

# Categorical Features in the Dataset

categorical_features = [column_name for column_name in rain.columns if rain[column_name].dtype == 'O']

print("Number of Categorical Features: {}".format(len(categorical_features))) # Number of Categorical Features
print("Categorical Features:", categorical_features) # Names of Categorical Features

Number of Categorical Features: 7
Categorical Features: ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [46]:
# Numerical Feature in the Dataset

numerical_features = [column_name for column_name in rain.columns if rain[column_name].dtype !='O']
print("Number of Numerical Features: {}".format(len(numerical_features)))
print("Numerical Features: ",numerical_features)

Number of Numerical Features: 16
Numerical Features:  ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm']


In [47]:
# Cardinality check for Categorical features:

# The accuracy, performance of a classifier not only depends on the model that we use, 
# but also depends on how we preprocess data, and what kind of data you’re feeding to the classifier to learn.

# Many Machine learning algorithms like Linear Regression, Logistic Regression, k-nearest neighbors, etc. 
# can handle only numerical data, so encoding categorical data to numeric becomes a necessary step.  
# But before jumping into encoding, check the cardinality of each categorical feature.


# Cardinality: The number of unique values in each categorical feature is known as cardinality.


# A feature with a high number of distinct/ unique values is a high cardinality feature. 

# A categorical feature with hundreds of zip codes is the best example of a high cardinality feature.
#    This high cardinality feature poses many serious problems: 
#    Like it will increase the number of dimensions of data when that feature is encoded, which is not good for the model.
#    There are many ways to handle high cardinality, one would be feature engineering
#    The other is simply dropping that feature if it doesn’t add any value to the model.
#    Let’s find the cardinality for Categorical features:

In [48]:
for each_feature in categorical_features:
    unique_values = len(rain[each_feature].unique())
    print ("Cardinality(no. of unique values) of {} are: {}".format(each_feature, unique_values))

Cardinality(no. of unique values) of Date are: 3436
Cardinality(no. of unique values) of Location are: 49
Cardinality(no. of unique values) of WindGustDir are: 17
Cardinality(no. of unique values) of WindDir9am are: 17
Cardinality(no. of unique values) of WindDir3pm are: 17
Cardinality(no. of unique values) of RainToday are: 3
Cardinality(no. of unique values) of RainTomorrow are: 3


In [49]:
# Date column has high cardinality which poses several problems to the model in terms of efficiency 
#   and also dimensions of data increase when encoded to numerical data.

In [50]:
# Feature Engineering of Date Column to decrease high Cardinality

rain['Date'] = pd.to_datetime(rain['Date'])

rain['year'] = rain['Date'].dt.year
rain['month'] = rain['Date'].dt.month
rain['day'] = rain['Date'].dt.day

# Drop Date Column

rain.drop('month', axis = 1, inplace = True) # axis =1 (refers to column) 

rain.head()


Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow,year,day
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,1007.7,1007.1,8.0,,16.9,21.8,No,No,2008,1
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,1010.6,1007.8,,,17.2,24.3,No,No,2008,2
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,1007.6,1008.7,,2.0,21.0,23.2,No,No,2008,3
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,1017.6,1012.8,,,18.1,26.5,No,No,2008,4
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No,2008,5


In [51]:
# Handling Missing Values in Categorical Features 

categorical_features = [column_name for column_name in rain.columns if rain[column_name].dtype == 'O']

rain[categorical_features].isnull().sum()



Location            0
WindGustDir     10326
WindDir9am      10566
WindDir3pm       4228
RainToday        3261
RainTomorrow     3267
dtype: int64

In [52]:
# Imputing the missing values in Categorical Variables Using the most frequent Value (MODE)
categorical_features_with_null = [feature for feature in categorical_features if rain[feature].isnull().sum()]


for each_feature in categorical_features_with_null:
    mode_val = rain[each_feature].mode()[0]
    rain[each_feature].fillna(mode_val, inplace = True)

In [53]:
rain.Location.value_counts()

Canberra            3436
Sydney              3344
Brisbane            3193
Hobart              3193
Darwin              3193
Melbourne           3193
Perth               3193
Adelaide            3193
Bendigo             3040
Townsville          3040
Albury              3040
MountGambier        3040
GoldCoast           3040
Wollongong          3040
Cairns              3040
Ballarat            3040
MountGinini         3040
Albany              3040
AliceSprings        3040
Launceston          3040
Penrith             3039
Newcastle           3039
Tuggeranong         3039
Mildura             3009
NorfolkIsland       3009
Watsonia            3009
WaggaWagga          3009
Moree               3009
CoffsHarbour        3009
Richmond            3009
MelbourneAirport    3009
Williamtown         3009
Witchcliffe         3009
SydneyAirport       3009
Nuriootpa           3009
PearceRAAF          3009
PerthAirport        3009
BadgerysCreek       3009
Woomera             3009
Portland            3009


In [54]:
# Handling Missing Values in Numerical Features

numerical_features = [column_name for column_name in rain.columns if rain[column_name].dtype != 'O']
rain[numerical_features].isnull().sum()

Date                 0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustSpeed    10263
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
year                 0
day                  0
dtype: int64

In [55]:
# Missing Value in the Numerical Features can be Imputed as Mean or Median.
# However, for this the outliers have to be addressed. 

In [63]:
# OUTLIER TREATMENT WITHIN NUMERICAL FEATURES


features_with_outliers = ['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustSpeed','WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 
                          'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']

# Using the Inter Quartile Range Rule to Address Outliers


for feature in features_with_outliers:
    q1 = rain[feature].quantile(0.25) #Calculating 1st Quartile / 25th percentile of the feature values
    q3 = rain[feature].quantile(0.75) #Calculating 3rd Quatile / 75th percentile of the feature values
    IQR = q3-q1 #Calculating Inter Quartile Range - Shows how the data data is spread across the Median value
    
    # Multiply IQR by 1.5 - which is the constant to discern outliers 
    lower_limit = q1-(IQR*1.5) #  Any number less than this is an outlier
    upper_limit = q3+(IQR*1.5) #  Any number more than this is an outlier

    #Extracts rows of data which are lower than lower_limit and assigns them the value of lower limit
    rain.loc[rain[feature]<lower_limit, feature] = lower_limit 
    
    #Extracts rows of data which are higher than upper_limit and assigns them the value of upper limit
    rain.loc[rain[feature]>upper_limit, feature] = upper_limit
    

In [62]:
# The Numerical Values are free from Outliers - Imputing the misisng vaues with numerical Mean

numerical_features_with_null = [feature for feature in numerical_features if rain[feature].isnull().sum()]

for feature in numerical_features_with_null:
    mean_value = rain[feature].mean()
    
    rain[feature].fillna(mean_value, inplace = True)

In [64]:
# Its now time to do some Exploratory Data Analysis