# Project: Weather Prediction using Logistic Regression

Predicting whether it will rain tomorrow using today's weather data

## Getting Dataset

In [1]:
## downloaded in file

In [2]:
import pandas as pd

In [3]:
raw_df=pd.read_csv('weatherAUS.csv')

In [4]:
raw_df

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145455,2017-06-21,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
145456,2017-06-22,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
145457,2017-06-23,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No
145458,2017-06-24,Uluru,7.8,27.0,0.0,,,SE,28.0,SSE,...,51.0,24.0,1019.4,1016.5,3.0,2.0,15.1,26.0,No,No


## Identifying input and target columns

In [5]:
input_cols=list(raw_df.columns)[1:-1]  # Excluding last column by range [1:-1]. Python range works as like [ , )
input_cols

['Location',
 'MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustDir',
 'WindGustSpeed',
 'WindDir9am',
 'WindDir3pm',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm',
 'RainToday']

In [6]:
target_cols=list(raw_df.columns)[-1]
target_cols

'RainTomorrow'

## Data Preprocessing

### Remove row where target columns is empty

In [7]:
raw_df[target_cols].unique()

array(['No', 'Yes', nan], dtype=object)

See there is nan value

In [8]:
raw_df.dropna(subset=['RainToday', 'RainTomorrow'], inplace=True)

In [9]:
raw_df[target_cols].unique()

array(['No', 'Yes'], dtype=object)

Now there is no none value

### Spliting Dataset

three parts: <br>
<b>Training Set:</b> Train model, compute loss, execute optimization <br>
<b>Validation Set:</b> Pick best verson of model <br>
<b>Test Set:</b> Compare different models <br>

<b>Explaination:</b>

Split raw dataset into <b>traing validation set</b> and <b>test set</b> in ratio 7:3 .<br>
From traing validation set,split into <b>training set</b> and <b>validation set</b> in ration 7:3<br>
Split traing set into <b>traing input set</b> and <b>traning target set</b> <br>

Note: Here,<br>
traing input set is Training Set <br>
validation set is Validation set <br>
test Set is Test Set <br>


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_val_df, test_df = train_test_split(raw_df,test_size=0.3,random_state=42)

In [12]:
train_df, val_df = train_test_split(train_val_df,test_size=0.3,random_state=42)

In [13]:
train_inputs=train_df[input_cols].copy()

In [14]:
train_targets=train_df[target_cols].copy()

### Identify Numeric & Categorical Column

In [15]:
import numpy as np

In [16]:
numeric_cols=train_inputs.select_dtypes(include=np.number).columns.tolist()

In [17]:
categorical_cols=train_inputs.select_dtypes('object').columns.tolist()

### Observing input columns

In [18]:
train_inputs[numeric_cols].describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm
count,68740.0,68846.0,68985.0,39555.0,36105.0,64493.0,68465.0,67733.0,68262.0,67263.0,62151.0,62139.0,43101.0,41473.0,68669.0,67676.0
mean,12.187416,23.21404,2.405229,5.467337,7.636305,39.94235,13.985146,18.597168,68.837904,51.519216,1017.667463,1015.277972,4.430454,4.495937,16.981629,21.669772
std,6.400621,7.13213,8.757592,4.199693,3.780028,13.572923,8.885652,8.772176,19.082213,20.860562,7.132095,7.067183,2.888439,2.72323,6.497755,6.95282
min,-8.2,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,1.0,0.0,982.0,977.1,0.0,0.0,-7.0,-5.1
25%,7.6,17.9,0.0,2.6,4.9,31.0,7.0,13.0,57.0,37.0,1012.9,1010.4,1.0,2.0,12.3,16.6
50%,12.0,22.6,0.0,4.8,8.5,39.0,13.0,19.0,70.0,52.0,1017.7,1015.3,5.0,5.0,16.7,21.1
75%,16.8,28.2,0.8,7.4,10.7,48.0,19.0,24.0,83.0,66.0,1022.4,1020.0,7.0,7.0,21.6,26.4
max,33.9,48.1,367.6,145.0,14.5,135.0,130.0,83.0,100.0,100.0,1040.9,1038.5,9.0,9.0,39.4,46.7


In [19]:
train_inputs[categorical_cols].nunique()  # always use nunique() in categorical column

Location       49
WindGustDir    16
WindDir9am     16
WindDir3pm     16
RainToday       2
dtype: int64

### Cleaning Numeric Columns

#### Imputation

Model can't work with missing numerical data. The process of filling missing values is called imputation.

In [20]:
# Looking is there missing values
train_inputs[numeric_cols].isna().sum()  # isna() shows all missing data

MinTemp            245
MaxTemp            139
Rainfall             0
Evaporation      29430
Sunshine         32880
WindGustSpeed     4492
WindSpeed9am       520
WindSpeed3pm      1252
Humidity9am        723
Humidity3pm       1722
Pressure9am       6834
Pressure3pm       6846
Cloud9am         25884
Cloud3pm         27512
Temp9am            316
Temp3pm           1309
dtype: int64

Yes. There is missing values

In [21]:
from sklearn.impute import SimpleImputer

In [22]:
imputer=SimpleImputer(strategy='mean') # reaplacing by mean value

In [23]:
imputer.fit(raw_df[numeric_cols])  # computing mean value from entire dataset.Beacause traing set, validation set, test set separated now. 
                                   # Stored is statistics_ 
                                   # You can see by imputer.statistics_

SimpleImputer()

In [24]:
train_inputs[numeric_cols]=imputer.transform(train_inputs[numeric_cols])

In [25]:
val_df[numeric_cols]=imputer.transform(val_df[numeric_cols])

In [26]:
test_df[numeric_cols]=imputer.transform(test_df[numeric_cols])

In [27]:
## checking again, is there missing value?
train_inputs[numeric_cols].isna().sum()

MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustSpeed    0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
dtype: int64

Now, There is no missing values <br>

<b>Imputation completed</b>

#### Scaling Values in range 0 to 1

In [28]:
from sklearn.preprocessing import MinMaxScaler

In [29]:
scaler=MinMaxScaler()

In [30]:
scaler.fit(raw_df[numeric_cols])

MinMaxScaler()

now you can see min,max value of all columns by scaler.data_min_ , scaler.data_max_

list(scaler.data_min_)

In [31]:
train_inputs[numeric_cols]=scaler.transform(train_inputs[numeric_cols])

In [32]:
val_df[numeric_cols]=scaler.transform(val_df[numeric_cols])

In [33]:
test_df[numeric_cols]=scaler.transform(test_df[numeric_cols])

Now all valuse scaled. <br>
You can check it by train_inputs[numeric_cols].describe()
<br>
<b>Scaling Done</b>

### Cleaning Categorical Columns

Converting Categorical data into number using encoder <br>
You can see no. of unique value of all columns by nunique()

In [34]:
from sklearn.preprocessing import OneHotEncoder

In [35]:
encoder=OneHotEncoder(sparse=False,handle_unknown='ignore')

In [36]:
encoder.fit(raw_df[categorical_cols].fillna('Unknowns')) # categorical_cols].fillna('Unknowns') replace missing values

OneHotEncoder(handle_unknown='ignore', sparse=False)

You can see: encoder.categories_

In [37]:
encoded_cols=list(encoder.get_feature_names(categorical_cols))  # getting encoded column names



Now we will create new columns in the dataset

In [38]:
train_inputs[encoded_cols]=encoder.transform(train_inputs[categorical_cols])

  self[col] = igetitem(value, i)


In [39]:
val_df[encoded_cols]=encoder.transform(val_df[categorical_cols])

  self[col] = igetitem(value, i)


In [40]:
test_df[encoded_cols]=encoder.transform(test_df[categorical_cols])

  self[col] = igetitem(value, i)


<b> Done </b>
<br>
You can see in the dataset

### Saving Preprocessing Data. Optional

In [41]:
pd.DataFrame(train_inputs).to_csv('train_inputs.csv')

In [42]:
pd.DataFrame(val_df).to_csv('val_df.csv')

In [43]:
pd.DataFrame(test_df).to_csv('test_df.csv')

Saved in file <br>
You can read by pd.read_csv('train_inputs.csv')

## Making & Training Model

In [44]:
from sklearn.linear_model import LogisticRegression 
model=LogisticRegression(solver='liblinear')   #making

In [45]:
model.fit(train_inputs[numeric_cols+encoded_cols],train_targets)  #  training

LogisticRegression(solver='liblinear')

## Making Prediction

In [46]:
X_train=train_inputs[numeric_cols+encoded_cols]

In [47]:
X_val=val_df[numeric_cols+encoded_cols]

In [48]:
X_test=test_df[numeric_cols+encoded_cols]

In [49]:
train_preds=model.predict(X_train)

In [50]:
# and the train target is train_targets

In [51]:
val_preds=model.predict(X_val)

In [52]:
test_preds=model.predict(X_test)

## Testing: Comparing traning prediction with target values

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
accuracy_score(train_targets,train_preds)

0.8524606798579402

In [55]:
# also you can see prediction matrix
from sklearn.metrics import confusion_matrix

In [56]:
confusion_matrix(train_targets,train_preds,normalize='true')

array([[0.94716574, 0.05283426],
       [0.48174895, 0.51825105]])

Do google to know about confusion matrix.<br>

Summering:<br>
Left top value is fraction of 'No' result, which macthed with target value <br>
Right bottom value is fraction of 'Yes' result, which macthed with target value <br>

In [57]:
# lets do of others
accuracy_score(val_df[target_cols],val_preds)

0.8489768307119905

In [58]:
confusion_matrix(val_df[target_cols],val_preds,normalize='true')

array([[0.94815974, 0.05184026],
       [0.4993895 , 0.5006105 ]])

In [59]:
accuracy_score(test_df[target_cols],test_preds)

0.8470061794161517

In [60]:
confusion_matrix(test_df[target_cols],test_preds,normalize='true')

array([[0.94585619, 0.05414381],
       [0.49750451, 0.50249549]])

## Prediction on single input

### Take Input

In [61]:
new_input = {'Date': '2021-06-19',
             'Location': 'Katherine',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

### Preprocess the input

In [62]:
new_input_df=pd.DataFrame([new_input])

In [63]:
new_input_df[numeric_cols]=imputer.transform(new_input_df[numeric_cols]) # imputing

In [64]:
new_input_df[numeric_cols]=scaler.transform(new_input_df[numeric_cols]) # scaling

In [65]:
new_input_df[encoded_cols]=encoder.transform(new_input_df[categorical_cols]) # encoding

  self[col] = igetitem(value, i)


### Predicting

In [66]:
X_new_input=new_input_df[numeric_cols+encoded_cols]

In [67]:
preidiction=model.predict(X_new_input)[0]
preidiction

'Yes'

In [68]:
probability=model.predict_proba(X_new_input)[0]
probability

array([0.31309278, 0.68690722])