In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [2]:
data2 = pd.read_csv("./Rainfall Data in India.csv")

In [3]:
data2

Unnamed: 0,DISTRICT,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
0,SOUTH ANDAMAN,1950,0.8,0.0,0.0,95.5,264.2,369.9,366.7,347.3,494.5,224.6,89.3,169.4
1,SOUTH ANDAMAN,1951,82.7,7.2,0.0,45.4,259.0,619.9,665.3,101.3,360.9,489.0,209.6,434.8
2,SOUTH ANDAMAN,1952,0.0,0.8,69.7,39.4,452.9,657.7,385.5,541.3,240.3,315.6,287.5,89.2
3,SOUTH ANDAMAN,1953,27.0,78.4,0.0,133.4,261.7,481.0,561.3,308.2,399.6,299.5,262.0,65.7
4,SOUTH ANDAMAN,1954,37.4,0.5,35.3,56.9,451.8,611.8,599.7,817.0,1123.0,175.1,35.1,84.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32569,WEST MIDNAPORE,2006,0.0,0.0,12.8,56.9,113.9,152.4,417.6,446.8,333.7,40.9,1.0,0.0
32570,WEST MIDNAPORE,2007,2.1,61.2,102.8,49.7,47.1,287.6,766.6,370.5,458.9,12.4,11.5,0.0
32571,WEST MIDNAPORE,2008,72.8,6.3,20.6,102.4,110.6,793.1,314.2,263.6,338.9,48.5,0.0,0.0
32572,WEST MIDNAPORE,2009,0.0,0.0,29.6,5.7,381.6,54.1,223.6,279.8,276.4,58.5,4.3,0.0


In [7]:
data2['DISTRICT'].unique()

array(['SOUTH ANDAMAN', 'NICOBAR', 'NORTH ANDAMAN', 'ANANTAPUR',
       'CHITTOR', 'CUDDAPAH', 'EAST GODAVARI', 'GUNTUR', 'KRISHNA',
       'KURNOOL', 'NELLORE', 'PRAKASAM', 'SRIKAKULAM', 'VISAKHAPATNAM',
       'VIZIANAGARAM', 'WEST GODAVARI', 'EAST SIANG', 'LOHIT',
       'WEST KAMENG', 'BAKSA', 'BARPETA', 'CACHAR', 'DHUBRI', 'DIBRUGARH',
       'GOALPARA', 'GOLAGHAT', 'JORHAT', 'KAMRUP METROPOLI', 'KAMRUP',
       'KARBI ANGLONG', 'KOKRAJHAR', 'LAKHIMPUR', 'N.C. HILLS', 'NALBARI',
       'NOWGONG', 'SIBSAGAR', 'SONITPUR', 'TINSUKIA', 'UDALGURI',
       'ARARIA', 'ARWAL', 'BANKA', 'BEGUSARAI', 'BHABUA', 'BHAGALPUR',
       'BHOJPUR', 'BUXAR', 'CHAMPARAN.EAST', 'CHAMPARAN.WEST',
       'DARBHANGA', 'GAYA', 'GOPALGANJ', 'JAHANABAD', 'JAMUI', 'KATIHAR',
       'KHAGARIA', 'KISHANGANJ', 'MADHEPURA', 'MADHUBANI', 'MUNGER',
       'MUZAFFARPUR', 'NALANDA', 'NAWADAH', 'PATNA', 'PURNEA', 'ROHTAS',
       'SAHARSA', 'SAMASTIPUR', 'SARAN', 'SHEIKHPURA', 'SITAMARHI',
       'SIWAN', 'SUPAUL', '

In [4]:
data2.isnull().sum()

DISTRICT       0
YEAR           0
JAN         1951
FEB         2062
MAR         1928
APR         1830
MAY         1890
JUN         1926
JUL         1891
AUG         1947
SEP         1992
OCT         2043
NOV         2172
DEC         2080
dtype: int64

In [7]:
data2.fillna(data2.mean(), inplace=True)

  data2.fillna(data2.mean(), inplace=True)


In [8]:
label_encoder = LabelEncoder()
data2['DISTRICT'] = label_encoder.fit_transform(data2['DISTRICT'])

In [9]:
data_long = pd.melt(data2, id_vars=['DISTRICT', 'YEAR'], var_name='MONTH', value_name='RAINFALL')


In [10]:
months_map = {'JAN': 1, 'FEB': 2, 'MAR': 3, 'APR': 4, 'MAY': 5, 'JUN': 6,
              'JUL': 7, 'AUG': 8, 'SEP': 9, 'OCT': 10, 'NOV': 11, 'DEC': 12}
data_long['MONTH'] = data_long['MONTH'].map(months_map)


In [11]:
X = data_long[['DISTRICT', 'YEAR', 'MONTH']]
y = data_long['RAINFALL']


In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [None]:
y_pred = model.predict(X_test)


In [None]:
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


In [None]:
def predict_rainfall(district, year, month):
    district_encoded = label_encoder.transform([district])[0]
    features = [[district_encoded, year, month]]
    prediction = model.predict(features)
    return prediction[0]

# Example prediction
district = 'SOUTH ANDAMAN'
year = 2011
month = 5  # May
predicted_rainfall = predict_rainfall(district, year, month)
print(f"Predicted rainfall for {district} in {year} for month {month}: {predicted_rainfall} mm")
