In [1]:
#  Implementing a predictive model on the Rain dataset to predict whether or not it will rain tomorrow in Australia

# Dataset contains about 10 years of daily weather observations of different locations in Australia.

#  Problem Statement: Design a predictive model with the use of machine learning algorithms 
#                       to forecast whether or not it will rain tomorrow in Australia.

#  Data Source: https://www.kaggle.com/jsphyg/weather-dataset-rattle-package

#   Dataset Description:
#    Number of columns: 23
#    Number of rows: 145460
#    Number of Independent Columns: 22
#    Number of Dependent Column: 1



#  Data Preprocessing
#  Finding categorical and Numerical features in Dataset
#  Cardinality check for categorical features
#  Handling Missing values
#  Outlier detection and treatment
#  Exploratory Data Analysis
#  Encoding categorical features
#  Correlation
#  Feature Importance
#  Splitting Data into Training and Testing sets
#  Feature Scaling
#  Model Building and Evaluation
#  Results and Conclusion
#  Save Model and Scaling object with Pickle

In [7]:
#  Importing Necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Read the weather observations CSV into a DataFrame
dataset = 'weatherAUS.csv'
rain = pd.read_csv(filepath_or_buffer=dataset)

In [9]:
# Previewing the first five rows of the dataset (this also shows the column names)
print(rain.head(n=5))

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [15]:
# Checking the dimensions of the dataset -
#  the output below shows that the dataset has 145460 records (rows) and 23 variables (columns)
print((len(rain.index), len(rain.columns)))

(145460, 23)


In [17]:
# Data Preprocessing 
# Real-world data is often messy, incomplete, unstructured, inconsistent, redundant, sprinkled with wacky values. So, without deploying any Data Preprocessing techniques, it is almost impossible to gain insights from raw data.

# What exactly is Data Preprocessing?

#  Data preprocessing is a process of converting raw data to a suitable format to extract insights. 
#  It is the first and foremost step in the Data Science life cycle. 
#  Data Preprocessing makes sure that data is clean, organized, and ready to feed to the Machine Learning model.

# Concise summary of the dataset.
# DataFrame.info() writes the summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line to the output.
rain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [26]:
# We can see that, except for the Date and Location columns, every column has missing values.
# We now generate descriptive statistics of the dataset using the describe() function in pandas.
# Columns of 'object' dtype are omitted through the exclude=object argument of describe().
(
    rain
    .describe(exclude=object)
    .T
)


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MinTemp,143975.0,12.194034,6.398495,-8.5,7.6,12.0,16.9,33.9
MaxTemp,144199.0,23.221348,7.119049,-4.8,17.9,22.6,28.2,48.1
Rainfall,142199.0,2.360918,8.47806,0.0,0.0,0.0,0.8,371.0
Evaporation,82670.0,5.468232,4.193704,0.0,2.6,4.8,7.4,145.0
Sunshine,75625.0,7.611178,3.785483,0.0,4.8,8.4,10.6,14.5
WindGustSpeed,135197.0,40.03523,13.607062,6.0,31.0,39.0,48.0,135.0
WindSpeed9am,143693.0,14.043426,8.915375,0.0,7.0,13.0,19.0,130.0
WindSpeed3pm,142398.0,18.662657,8.8098,0.0,13.0,19.0,24.0,87.0
Humidity9am,142806.0,68.880831,19.029164,0.0,57.0,70.0,83.0,100.0
Humidity3pm,140953.0,51.539116,20.795902,0.0,37.0,52.0,66.0,100.0


In [28]:
# Descriptive statistics restricted to the 'object' dtype columns
rain.describe(include=object).T

Unnamed: 0,count,unique,top,freq
Date,145460,3436,2017-02-20,49
Location,145460,49,Canberra,3436
WindGustDir,135134,16,W,9915
WindDir9am,134894,16,N,11758
WindDir3pm,141232,16,SE,10838
RainToday,142199,2,No,110319
RainTomorrow,142193,2,No,110316


In [32]:
# Finding categorical and numerical features in the dataset

# Categorical features: every column whose dtype is 'O' (object)

categorical_features = [
    name
    for name, column_dtype in rain.dtypes.items()
    if column_dtype == 'O'
]

# Report how many categorical features were found, then list their names
print("Number of Categorical Features: {}".format(len(categorical_features)))
print("Categorical Features:", categorical_features)

Number of Categorical Features: 7
Categorical Features: ['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']
