In [52]:
import numpy as np

In [53]:
from sklearn.utils import resample
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [54]:
# Load the rainfall dataset from 'rainfall.csv' (path is relative to the working directory).
rain_data = pd.read_csv('rainfall.csv')

In [55]:
# Preview the first five rows of the raw frame.
rain_data.head()

Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,2,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,3,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,4,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,5,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7


In [56]:
# number of rows and columns in the dataset, as a (rows, columns) tuple
rain_data.shape

(366, 12)

In [57]:
# Remove leading/trailing whitespace from every column name.
rain_data.columns = [column_name.strip() for column_name in rain_data.columns]

In [58]:
# List the column names as they now stand.
rain_data.columns

Index(['day', 'pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint',
       'humidity', 'cloud', 'rainfall', 'sunshine', 'winddirection',
       'windspeed'],
      dtype='object')

In [59]:
# Per-column dtypes and non-null counts.
print("Data Info:")
rain_data.info()

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            366 non-null    int64  
 1   pressure       366 non-null    float64
 2   maxtemp        366 non-null    float64
 3   temparature    366 non-null    float64
 4   mintemp        366 non-null    float64
 5   dewpoint       366 non-null    float64
 6   humidity       366 non-null    int64  
 7   cloud          366 non-null    int64  
 8   rainfall       366 non-null    object 
 9   sunshine       366 non-null    float64
 10  winddirection  365 non-null    float64
 11  windspeed      365 non-null    float64
dtypes: float64(8), int64(3), object(1)
memory usage: 34.4+ KB


In [60]:
# Discard the `day` column; the remaining columns are kept unchanged.
rain_data = rain_data.drop(labels=["day"], axis="columns")

In [61]:
# Preview the frame now that the `day` column has been dropped.
rain_data.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1025.9,19.9,18.3,16.8,13.1,72,49,yes,9.3,80.0,26.3
1,1022.0,21.7,18.9,17.2,15.6,81,83,yes,0.6,50.0,15.3
2,1019.7,20.3,19.3,18.0,18.4,95,91,yes,0.0,40.0,14.2
3,1018.9,22.3,20.6,19.1,18.8,90,88,yes,1.0,50.0,16.9
4,1015.9,21.3,20.7,20.2,19.9,95,81,yes,0.0,40.0,13.7


In [62]:
# count the missing values in each column
print(rain_data.isnull().sum())

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    1
windspeed        1
dtype: int64


In [63]:
# Distinct wind-direction values; NaN shows up here while the gap is still unfilled.
rain_data["winddirection"].unique()

array([ 80.,  50.,  40.,  20.,  30.,  60.,  70.,  10., 200., 220., 120.,
       190., 210., 300., 240., 180., 230.,  90., 170., 150., 100., 130.,
        nan, 160., 270., 280., 250., 260., 290., 350., 110., 140.])

In [64]:
# Fill the gaps in the wind columns:
#   winddirection -> most frequent value (first mode)
#   windspeed     -> median
winddirection_fill = rain_data["winddirection"].mode()[0]
windspeed_fill = rain_data["windspeed"].median()
rain_data["winddirection"] = rain_data["winddirection"].fillna(winddirection_fill)
rain_data["windspeed"] = rain_data["windspeed"].fillna(windspeed_fill)

In [65]:
# re-count the missing values in each column after imputation
print(rain_data.isnull().sum())

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    0
windspeed        0
dtype: int64


In [66]:
# Distinct labels present in the target column.
rain_data["rainfall"].unique()

array(['yes', 'no'], dtype=object)

In [67]:
# Re-inspect dtypes and non-null counts.
rain_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pressure       366 non-null    float64
 1   maxtemp        366 non-null    float64
 2   temparature    366 non-null    float64
 3   mintemp        366 non-null    float64
 4   dewpoint       366 non-null    float64
 5   humidity       366 non-null    int64  
 6   cloud          366 non-null    int64  
 7   rainfall       366 non-null    object 
 8   sunshine       366 non-null    float64
 9   winddirection  366 non-null    float64
 10  windspeed      366 non-null    float64
dtypes: float64(8), int64(2), object(1)
memory usage: 31.6+ KB


In [68]:
# Missing-value count per column.
rain_data.isnull().sum()

pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
rainfall         0
sunshine         0
winddirection    0
windspeed        0
dtype: int64

In [69]:
# Summary statistics for the numeric columns.
rain_data.describe()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
count,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0,366.0
mean,1013.742623,26.191257,23.747268,21.894536,19.989071,80.177596,71.128415,4.419399,101.284153,21.534153
std,6.414776,5.978343,5.632813,5.594153,5.997021,10.06247,21.798012,3.934398,81.722827,10.056054
min,998.5,7.1,4.9,3.1,-0.4,36.0,0.0,0.0,10.0,4.4
25%,1008.5,21.2,18.825,17.125,16.125,75.0,58.0,0.5,40.0,13.725
50%,1013.0,27.75,25.45,23.7,21.95,80.5,80.0,3.5,70.0,20.5
75%,1018.1,31.2,28.6,26.575,25.0,87.0,88.0,8.2,190.0,27.825
max,1034.6,36.3,32.4,30.0,26.7,98.0,100.0,12.1,350.0,59.5


In [70]:
# Convert the target labels: "yes" -> 1, "no" -> 0.
rainfall_codes = {"yes": 1, "no": 0}

# Series.map turns every value missing from the mapping into NaN, so re-running
# this cell on an already-encoded column would silently wipe the target.
# Skip the work when the column already holds only 0/1.
if not rain_data["rainfall"].isin([0, 1]).all():
    rainfall_encoded = rain_data["rainfall"].map(rainfall_codes)
    unmapped = rainfall_encoded.isna()
    if unmapped.any():
        # Fail loudly instead of passing NaN targets on to the later cells.
        unexpected_labels = rain_data.loc[unmapped, "rainfall"].unique()
        raise ValueError(f"Unexpected rainfall labels: {unexpected_labels!r}")
    rain_data["rainfall"] = rainfall_encoded.astype("int64")

In [71]:
# Preview the frame with `rainfall` encoded as 1/0.
rain_data.head()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1025.9,19.9,18.3,16.8,13.1,72,49,1,9.3,80.0,26.3
1,1022.0,21.7,18.9,17.2,15.6,81,83,1,0.6,50.0,15.3
2,1019.7,20.3,19.3,18.0,18.4,95,91,1,0.0,40.0,14.2
3,1018.9,22.3,20.6,19.1,18.8,90,88,1,1.0,50.0,16.9
4,1015.9,21.3,20.7,20.2,19.9,95,81,1,0.0,40.0,13.7


In [72]:
# Show the current column names.
print(rain_data.columns)

Index(['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity',
       'cloud', 'rainfall', 'sunshine', 'winddirection', 'windspeed'],
      dtype='object')


In [73]:
# Drop the three temperature columns, which the author treats as highly correlated.
# NOTE(review): no correlation analysis is shown in the visible cells - confirm.
temperature_columns = ['maxtemp', 'temparature', 'mintemp']
rain_data = rain_data.drop(columns=temperature_columns)

In [74]:
# Preview the columns that remain after the temperature columns were dropped.
rain_data.head()

Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1025.9,13.1,72,49,1,9.3,80.0,26.3
1,1022.0,15.6,81,83,1,0.6,50.0,15.3
2,1019.7,18.4,95,91,1,0.0,40.0,14.2
3,1018.9,18.8,90,88,1,1.0,50.0,16.9
4,1015.9,19.9,95,81,1,0.0,40.0,13.7


In [75]:
# Class balance of the target: number of rows per label.
print(rain_data["rainfall"].value_counts())

rainfall
1    249
0    117
Name: count, dtype: int64


In [76]:
# Split the rows by target label: 1 (majority class) and 0 (minority class).
is_rain = rain_data["rainfall"].eq(1)
is_dry = rain_data["rainfall"].eq(0)
df_majority = rain_data.loc[is_rain]
df_minority = rain_data.loc[is_dry]

In [77]:
# Print the shape of each class subset, majority first.
for class_subset in (df_majority, df_minority):
    print(class_subset.shape)

(249, 8)
(117, 8)


In [78]:
# Randomly pick (without replacement) as many majority rows as there are
# minority rows; the fixed seed keeps the selection reproducible.
minority_count = len(df_minority)
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=minority_count,
    random_state=42,
)

In [79]:
# Shape of the downsampled majority subset.
df_majority_downsampled.shape

(117, 8)

In [80]:
# Stack the downsampled majority rows on top of the minority rows.
balanced_parts = [df_majority_downsampled, df_minority]
df_downsampled = pd.concat(balanced_parts, axis=0)

In [81]:
# Shape of the combined frame.
df_downsampled.shape

(234, 8)

In [82]:
# Preview the combined frame; rows still carry their original index labels here.
df_downsampled.head()

Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
188,1005.9,25.6,77,53,1,10.5,270.0,11.3
9,1017.5,15.5,85,91,1,0.0,70.0,37.7
137,1012.3,20.1,80,86,1,0.3,80.0,39.5
89,1018.3,16.3,79,89,1,2.4,40.0,14.8
157,1008.8,24.7,91,80,1,2.2,20.0,11.2


In [83]:
# Shuffle every row with a fixed seed, then renumber the index from 0.
shuffled_rows = df_downsampled.sample(frac=1, random_state=42)
df_downsampled = shuffled_rows.reset_index(drop=True)

In [84]:
# Preview the frame after shuffling and index reset.
df_downsampled.head()

Unnamed: 0,pressure,dewpoint,humidity,cloud,rainfall,sunshine,winddirection,windspeed
0,1022.2,14.1,78,90,1,0.0,30.0,28.5
1,1013.4,19.5,69,17,0,10.5,70.0,12.4
2,1006.1,24.4,74,27,0,10.8,220.0,8.7
3,1007.6,24.8,85,84,1,1.8,70.0,34.8
4,1021.2,8.4,66,18,0,10.1,20.0,24.4


In [85]:
# Rows per class after downsampling.
df_downsampled["rainfall"].value_counts()

rainfall
1    117
0    117
Name: count, dtype: int64

In [86]:
# Separate the target column (Y) from the feature columns (X).
target_column = "rainfall"
Y = df_downsampled[target_column]
X = df_downsampled.drop(columns=[target_column])

In [87]:
# Print the feature matrix.
print(X)

     pressure  dewpoint  humidity  cloud  sunshine  winddirection  windspeed
0      1022.2      14.1        78     90       0.0           30.0       28.5
1      1013.4      19.5        69     17      10.5           70.0       12.4
2      1006.1      24.4        74     27      10.8          220.0        8.7
3      1007.6      24.8        85     84       1.8           70.0       34.8
4      1021.2       8.4        66     18      10.1           20.0       24.4
..        ...       ...       ...    ...       ...            ...        ...
229    1008.1      25.4        86     75       5.7           20.0        9.5
230    1010.1      19.9        91     89       0.0           70.0       31.8
231    1020.6      14.7        91     88       0.3           50.0       24.4
232    1008.3      24.1        74     29       5.7           10.0        4.4
233    1005.0      26.1        87     82       2.2          160.0       12.6

[234 rows x 7 columns]


In [88]:
# Print the target vector.
print(Y)

0      1
1      0
2      0
3      1
4      0
      ..
229    1
230    1
231    1
232    0
233    1
Name: rainfall, Length: 234, dtype: int64


In [89]:
# Split the data into training and test sets: 20% held out for testing,
# class proportions preserved via stratification, fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [90]:
# Keep only the training rows with no NaN in any feature, and select the
# matching targets by index label.
# NOTE(review): the fitting cell in this notebook uses X_train / Y_train,
# not these cleaned copies - confirm whether they are still needed.
X_train_clean = X_train.dropna(axis=0, how="any")
Y_train_clean = Y_train.loc[X_train_clean.index]


In [91]:
# Shapes of the full feature matrix and of its train / test partitions.
print(X.shape, X_train.shape, X_test.shape)

(234, 7) (187, 7) (47, 7)


In [92]:
# Model training - Logistic Regression with scikit-learn's default settings
# NOTE(review): no feature scaling is applied in the visible cells - confirm the solver converges.
model= LogisticRegression()


In [93]:
# training the logistic regression model with the training data
model.fit(X_train, Y_train)

In [94]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is symmetric, but keeping the documented order stops
# the mistake carrying over to asymmetric metrics such as precision or recall.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [95]:
# Report the accuracy measured on the training split.
print('Accuracy on Training data:', training_data_accuracy)

Accuracy on Training data: 0.8021390374331551


In [96]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground-truth labels first, predictions
# second. Accuracy itself is symmetric, but keeping the documented order stops
# the mistake carrying over to asymmetric metrics such as precision or recall.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [97]:
# Report the accuracy measured on the held-out test split.
print('Accuracy on Test data:', test_data_accuracy)

Accuracy on Test data: 0.8085106382978723


Making a prediction and saving the trained model

In [98]:
# One example observation; values follow the training feature order:
# pressure, dewpoint, humidity, cloud, sunshine, winddirection, windspeed
input_data = (1015.9, 19.9, 95, 81, 0.0, 40.0, 11.5)
input_data_as_numpy_array = np.asarray(input_data)
# The model was fitted on a DataFrame, so hand it a one-row DataFrame carrying
# the fitted column names. A bare ndarray makes scikit-learn warn that
# "X does not have valid feature names" and removes its column-name check.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1),
    columns=model.feature_names_in_,
)

prediction = model.predict(input_data_reshaped)
print(prediction)

# Under the notebook's encoding, class 0 is "no" rainfall and class 1 is "yes".
if prediction[0] == 0:
    print('No Rainfall')
else:
    print('Rainfall')

[1]
Rainfall




In [99]:
import pickle

In [100]:
# Serialise the fitted model to disk with pickle.
filename = 'trained_model.sav'
# The context manager flushes and closes the file handle even if pickling
# fails; the bare open(...) used before was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

Loading the saved model

In [101]:
# Restore the model that was pickled to 'trained_model.sav'.
# WARNING: unpickling can execute arbitrary code - only load files you trust.
# The context manager closes the file handle once the model is loaded.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [102]:
# One example observation; values follow the training feature order:
# pressure, dewpoint, humidity, cloud, sunshine, winddirection, windspeed
input_data = (1015.9, 19.9, 95, 81, 0.0, 40.0, 11.5)
input_data_as_numpy_array = np.asarray(input_data)
# Build a one-row DataFrame with the column names stored on the restored model.
# A bare ndarray makes scikit-learn warn that "X does not have valid feature
# names" and removes its column-name check.
input_data_reshaped = pd.DataFrame(
    input_data_as_numpy_array.reshape(1, -1),
    columns=loaded_model.feature_names_in_,
)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

# Under the notebook's encoding, class 0 is "no" rainfall and class 1 is "yes".
if prediction[0] == 0:
    print('No Rainfall')
else:
    print('Rainfall')

[1]
Rainfall


