# Prediction of RainFall

### Importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


### importing data

In [2]:
data=pd.read_csv("rain.csv")
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,01-12-2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,02-12-2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,03-12-2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,04-12-2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,05-12-2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142188,20-06-2017,Uluru,3.5,21.8,0.0,,,E,31.0,ESE,...,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,No
142189,21-06-2017,Uluru,2.8,23.4,0.0,,,E,31.0,SE,...,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,No
142190,22-06-2017,Uluru,3.6,25.3,0.0,,,NNW,22.0,SE,...,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,No
142191,23-06-2017,Uluru,5.4,26.9,0.0,,,N,37.0,SE,...,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,No


### Describing data

In [3]:
print(data.columns)
print(data.info())
print(data.describe())

Index(['Date', 'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation',
       'Sunshine', 'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
       'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
       'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
       'Temp3pm', 'RainToday', 'RainTomorrow'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 n

### Feature Engineering

        The aim of feature engineering is to prepare an input data set that best fits the machine learning algorithm as well as to enhance the performance of machine learning models.

In [4]:
dataNew=data[["MinTemp","MaxTemp","WindDir3pm","WindSpeed9am","Humidity9am","Pressure9am","RainToday","RainTomorrow"]].copy()
dataNew.columns=["MinTemp","MaxTemp","WindDir","WindSpeed","Humidity","Pressure","RainToday","RainTomorrow"]
dataNew

Unnamed: 0,MinTemp,MaxTemp,WindDir,WindSpeed,Humidity,Pressure,RainToday,RainTomorrow
0,13.4,22.9,WNW,20.0,71.0,1007.7,No,No
1,7.4,25.1,WSW,4.0,44.0,1010.6,No,No
2,12.9,25.7,WSW,19.0,38.0,1007.6,No,No
3,9.2,28.0,E,11.0,45.0,1017.6,No,No
4,17.5,32.3,NW,7.0,82.0,1010.8,No,No
...,...,...,...,...,...,...,...,...
142188,3.5,21.8,E,15.0,59.0,1024.7,No,No
142189,2.8,23.4,ENE,13.0,51.0,1024.6,No,No
142190,3.6,25.3,N,13.0,56.0,1023.5,No,No
142191,5.4,26.9,WNW,9.0,53.0,1021.0,No,No


### null synthesis

In [25]:
dataNew.isnull().sum()


MinTemp         0
MaxTemp         0
WindDir         0
WindSpeed       0
Humidity        0
Pressure        0
RainToday       0
RainTomorrow    0
dtype: int64

In [26]:
dataNew.dropna(inplace=True)

In [7]:
dataNew.isnull().sum()

MinTemp         0
MaxTemp         0
WindDir         0
WindSpeed       0
Humidity        0
Pressure        0
RainToday       0
RainTomorrow    0
dtype: int64

#### inputs and outputs

In [8]:
x=dataNew.iloc[:,:-1]
y=dataNew.iloc[:,-1]

In [9]:
x,y

(        MinTemp  MaxTemp WindDir  WindSpeed  Humidity  Pressure RainToday
 0          13.4     22.9     WNW       20.0      71.0    1007.7        No
 1           7.4     25.1     WSW        4.0      44.0    1010.6        No
 2          12.9     25.7     WSW       19.0      38.0    1007.6        No
 3           9.2     28.0       E       11.0      45.0    1017.6        No
 4          17.5     32.3      NW        7.0      82.0    1010.8        No
 ...         ...      ...     ...        ...       ...       ...       ...
 142188      3.5     21.8       E       15.0      59.0    1024.7        No
 142189      2.8     23.4     ENE       13.0      51.0    1024.6        No
 142190      3.6     25.3       N       13.0      56.0    1023.5        No
 142191      5.4     26.9     WNW        9.0      53.0    1021.0        No
 142192      7.8     27.0       N       13.0      51.0    1019.4        No
 
 [124205 rows x 7 columns],
 0         No
 1         No
 2         No
 3         No
 4         No


#### catogorical to numericals

In [10]:
from sklearn.preprocessing import LabelEncoder
l1=LabelEncoder()
l2=LabelEncoder()
l3=LabelEncoder()
x["WindDir"]=l1.fit_transform(x["WindDir"])
x["RainToday"]=l2.fit_transform(x["RainToday"])
y= l3.fit_transform(y)

In [11]:
k=0
print("Label encoding for WindDir")
for i in l1.classes_:
    print(k,"-->",i)
    k+=1
k=0
print("label encoding for Raintoday")
for i in l2.classes_:
    print(k,"-->",i)
    k+=1
k=0
print("label encoding for Raintomorrow")
for i in l3.classes_:
    print(k,"-->",i)
    k+=1

Label encoding for WindDir
0 --> E
1 --> ENE
2 --> ESE
3 --> N
4 --> NE
5 --> NNE
6 --> NNW
7 --> NW
8 --> S
9 --> SE
10 --> SSE
11 --> SSW
12 --> SW
13 --> W
14 --> WNW
15 --> WSW
label encoding for Raintoday
0 --> No
1 --> Yes
label encoding for Raintomorrow
0 --> No
1 --> Yes


### data Segregation

In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)

In [13]:
x_train.shape,x_test.shape

((93153, 7), (31052, 7))

## using Extra trees classifier

In [14]:
from sklearn.ensemble import ExtraTreesClassifier
model=ExtraTreesClassifier(max_depth=23,n_estimators=100,min_samples_split=2,criterion="entropy",class_weight="balanced",max_features='sqrt')
model.fit(x_train,y_train)

ExtraTreesClassifier(class_weight='balanced', criterion='entropy', max_depth=23,
                     max_features='sqrt')

#### predicting the test results

In [15]:
y_pred=model.predict(x_test)

#### visualize classfication report

In [16]:
from sklearn import metrics
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88     24180
           1       0.58      0.51      0.54      6872

    accuracy                           0.81     31052
   macro avg       0.72      0.70      0.71     31052
weighted avg       0.80      0.81      0.80     31052



#### visualize confusion matrix

In [17]:
print(pd.DataFrame(metrics.confusion_matrix(y_test,y_pred),columns=(0,1)))

       0     1
0  21588  2592
1   3364  3508


#### accuracy score

In [18]:
print(metrics.accuracy_score(y_test,y_pred)*100)

80.8192709004251


#### feature importances on output

In [19]:
pd.Series(model.feature_importances_,index=x.columns)

MinTemp      0.150469
MaxTemp      0.158771
WindDir      0.099614
WindSpeed    0.106114
Humidity     0.178205
Pressure     0.198687
RainToday    0.108140
dtype: float64

#### predict with real world data

In [20]:
a1=float(input("Enter the value for MinTemp:"))
a2=float(input("Enter the value for MaxTemp:"))
a3=input("Enter the value for WindDirection{}:".format(l1.classes_))
a3=l1.transform([a3])[0]
a4=float(input("Enter the value for WindSpeed:"))
a5=float(input("Enter the value for Humidity:"))
a6=float(input("Enter the value for Pressure:"))
a7=input("Enter value for RainToday{}:".format(l2.classes_))
a7=l2.transform([a7])[0]
datapoint=pd.DataFrame(np.array([[a1,a2,a3,a4,a5,a6,a7]]),columns=x.columns)
datapoint

Enter the value for MinTemp:21
Enter the value for MaxTemp:35
Enter the value for WindDirection['E' 'ENE' 'ESE' 'N' 'NE' 'NNE' 'NNW' 'NW' 'S' 'SE' 'SSE' 'SSW' 'SW' 'W'
 'WNW' 'WSW']:N
Enter the value for WindSpeed:12
Enter the value for Humidity:17
Enter the value for Pressure:11
Enter value for RainToday['No' 'Yes']:Yes


Unnamed: 0,MinTemp,MaxTemp,WindDir,WindSpeed,Humidity,Pressure,RainToday
0,21.0,35.0,3.0,12.0,17.0,11.0,1.0


In [21]:

res=model.predict(datapoint)
if l3.inverse_transform(res)[0] =="No":
    print(" there will be No RainFall Tomorrow")
else:
    print(" there will be Rainfall Tomorrow")
print("Prediction probability:")
pd.DataFrame(model.predict_proba(datapoint),columns=["No","Yes"])

 there will be No RainFall Tomorrow
Prediction probability:


Unnamed: 0,No,Yes
0,0.67,0.33
