# Scraping and Processing Rainfall Data for Climate Insights

## Import Packages URL

In [303]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import datetime
from datetime import date

In [304]:
url = 'https://beta-tnsmart.rimes.int/index.php/Rainfall/daily_data'

In [305]:
response = requests.get(url)

In [306]:
# get html code gather the all data 
if response.status_code == 200:
    print("The HTML file was imported successfully.")
    soup = BeautifulSoup(response.text, 'html')
else:
    print("An error occurred while importing the file.")
    soup = []

The HTML file was imported successfully.


In [307]:
soup

<!DOCTYPE html>
<html class="notranslate" translate="no">
<head>
<meta charset="utf-8"/>
<meta content="initial-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<link href="https://beta-tnsmart.rimes.int/images/Homepage2.png" rel="icon" type="image/x-icon"/>
<title>TN-SMART</title>
<link href="https://beta-tnsmart.rimes.int/assets/css/main.css" rel="stylesheet"/>
<!-- Tell the browser to be responsive to screen width -->
<meta content="width=device-width, initial-scale=1, maximum-scale=1, user-scalable=no" name="viewport"/>
<!-- Bootstrap 3.3.6 -->
<link href="https://beta-tnsmart.rimes.int/assets/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="https://beta-tnsmart.rimes.int/assets/css/loading-btn.css" rel="stylesheet"/>
<link href="https://beta-tnsmart.rimes.int/assets/css/loading.css" rel="stylesheet"/>
<link href="https://beta-tnsmart.rimes.int/assets/bootstrap/css/bootstrap-multiselect.css" rel="styl

## Find the date value in the HTML and check if it matches today's date to proceed further.

In [308]:
Da = soup.find(class_ = "panel-heading")
Da

<div class="panel-heading" style="background-color: #f5f5f5;color:black">District wise observed Rainfall
                    data on 30-Dec-2024</div>

In [309]:
Date = Da.text.strip().replace("District wise observed Rainfall\n                    data on ","")

In [310]:
Date

'30-Dec-2024'

In [311]:
Date[3:6]

'Dec'

In [312]:
# Assign month str to value
month = { "Jan" : "01", "Feb" : "02", "Mar" : "03", "Apr" : "04", "May" : "05", "Jun" : "06", "Jul" : "07",
          "Aug" : "08", "Sep" : "09", "Oct" : "10", "Nov" : "11", "Dec" : "12"}
month

{'Jan': '01',
 'Feb': '02',
 'Mar': '03',
 'Apr': '04',
 'May': '05',
 'Jun': '06',
 'Jul': '07',
 'Aug': '08',
 'Sep': '09',
 'Oct': '10',
 'Nov': '11',
 'Dec': '12'}

In [313]:
month[Date[3:6]]

'12'

In [314]:
# Replace the str to numeric value
Da = Date.replace(Date[3:6],month[Date[3:6]])
Da

'30-12-2024'

In [315]:
# Date Values
y = int(Da[6:11])
m = int(Da[3:5])
d = int(Da[0:2])

In [316]:
Today_Date =datetime.date(y,m,d)
Today_Date

datetime.date(2024, 12, 30)

In [317]:
date.today()

datetime.date(2024, 12, 30)

In [318]:
Today_Date == date.today()  

True

## Gather the table data from the HTML for further processing.

In [319]:
# Find the necessary data
if ((Today_Date == date.today())):
    print("Date match with current Date")
    table = soup.find('table', id = "data_table")
else:
    print("Date not matched")
    table = []

Date match with current Date


### Cheking the table title based on the provided data or context.

In [320]:
table_th = table.find_all('th')[:5]
table_th

[<th>வரிசை எண்</th>,
 <th>உரிமையாளர்</th>,
 <th>மாவட்டம்</th>,
 <th>மழைமானி நிலையம்</th>,
 <th>பதிவான மழை அளவு (மி.மீ.)</th>]

In [321]:
table_title = [table_th.text.strip('\n') for table_th in table_th]
table_title

['வரிசை எண்',
 'உரிமையாளர்',
 'மாவட்டம்',
 'மழைமானி நிலையம்',
 'பதிவான மழை அளவு (மி.மீ.)']

In [322]:
len(table_title)

5

In [323]:
df = pd.DataFrame(columns = table_title)

In [324]:
df

Unnamed: 0,வரிசை எண்,உரிமையாளர்,மாவட்டம்,மழைமானி நிலையம்,பதிவான மழை அளவு (மி.மீ.)


## Gather the table data from the HTML document for processing.

In [325]:
table

<table id="data_table">
<thead>
<tr>
<th>வரிசை எண்</th>
<th>உரிமையாளர்</th>
<th>மாவட்டம்</th>
<th>மழைமானி நிலையம்</th>
<th>பதிவான மழை அளவு (மி.மீ.)</th>
</tr>
</thead>
<tbody>
<tr>
<th>1</th>
<th>Revenue</th>
<th class="district_color">Ariyalur</th>
<th>Suthamalli dam</th>
<td class="color_code">
                            5</td>
</tr>
</tbody>
<tbody>
<tr>
<th>2</th>
<th>Revenue</th>
<th class="district_color">Ariyalur</th>
<th>Kuruvadi</th>
<td class="color_code">
                            4</td>
</tr>
</tbody>
<tbody>
<tr>
<th>3</th>
<th>Revenue</th>
<th class="district_color">Ariyalur</th>
<th>Jayankondam taluk office</th>
<td class="color_code">
                            4</td>
</tr>
</tbody>
<tbody>
<tr>
<th>4</th>
<th>Revenue</th>
<th class="district_color">Ariyalur</th>
<th>PWD Office, Sendurai</th>
<td class="color_code">
                            3.6</td>
</tr>
</tbody>
<tbody>
<tr>
<th>5</th>
<th>Revenue</th>
<th class="district_color">Ariyalur</th>
<th>Taluk Office, 

In [326]:
table_tr = table.find_all('tr')
table_tr

[<tr>
 <th>வரிசை எண்</th>
 <th>உரிமையாளர்</th>
 <th>மாவட்டம்</th>
 <th>மழைமானி நிலையம்</th>
 <th>பதிவான மழை அளவு (மி.மீ.)</th>
 </tr>,
 <tr>
 <th>1</th>
 <th>Revenue</th>
 <th class="district_color">Ariyalur</th>
 <th>Suthamalli dam</th>
 <td class="color_code">
                             5</td>
 </tr>,
 <tr>
 <th>2</th>
 <th>Revenue</th>
 <th class="district_color">Ariyalur</th>
 <th>Kuruvadi</th>
 <td class="color_code">
                             4</td>
 </tr>,
 <tr>
 <th>3</th>
 <th>Revenue</th>
 <th class="district_color">Ariyalur</th>
 <th>Jayankondam taluk office</th>
 <td class="color_code">
                             4</td>
 </tr>,
 <tr>
 <th>4</th>
 <th>Revenue</th>
 <th class="district_color">Ariyalur</th>
 <th>PWD Office, Sendurai</th>
 <td class="color_code">
                             3.6</td>
 </tr>,
 <tr>
 <th>5</th>
 <th>Revenue</th>
 <th class="district_color">Ariyalur</th>
 <th>Taluk Office, Andimadam</th>
 <td class="color_code">
                            

In [327]:
# Strip the value datas
list = [table_tr.text.strip().replace("\n\n", ",") for table_tr in table_tr][1:]
list

['1\nRevenue\nAriyalur\nSuthamalli dam,                            5',
 '2\nRevenue\nAriyalur\nKuruvadi,                            4',
 '3\nRevenue\nAriyalur\nJayankondam taluk office,                            4',
 '4\nRevenue\nAriyalur\nPWD Office, Sendurai,                            3.6',
 '5\nRevenue\nAriyalur\nTaluk Office, Andimadam,                            3',
 '6\nRevenue\nAriyalur\nAriyalur Taluk Office,                            2.5',
 '7\nRevenue\nAriyalur\nTHIRUMANUR  PWD OFFICE (R.C.DIVISION),                            2',
 '8\nRevenue\nAriyalur\nT.Palur Panchayat Union 0ffice,                            1',
 'Total rainfall\n25.1',
 'Average rainfall\n3.14',
 '1\nRevenue\nChengalpattu\nTaluk Office, Thirukalukundram,                            45',
 '2\nRevenue\nChengalpattu\nMammallapuram PWD bungalow,                            38',
 '3\nRevenue\nChengalpattu\nKeelampakkam,                            14.2',
 '4\nRevenue\nChengalpattu\nTaluk Office, Thiruporur,  

In [328]:
datalist = []
datalist

[]

In [329]:
list[0].strip().replace("\n",",")

'1,Revenue,Ariyalur,Suthamalli dam,                            5'

In [330]:
range(len(list))

range(0, 203)

In [331]:
for i in range(len(list)):
    datalist.append(list[i].strip().replace("\n",","))

In [332]:
# List of all data
datalist

['1,Revenue,Ariyalur,Suthamalli dam,                            5',
 '2,Revenue,Ariyalur,Kuruvadi,                            4',
 '3,Revenue,Ariyalur,Jayankondam taluk office,                            4',
 '4,Revenue,Ariyalur,PWD Office, Sendurai,                            3.6',
 '5,Revenue,Ariyalur,Taluk Office, Andimadam,                            3',
 '6,Revenue,Ariyalur,Ariyalur Taluk Office,                            2.5',
 '7,Revenue,Ariyalur,THIRUMANUR  PWD OFFICE (R.C.DIVISION),                            2',
 '8,Revenue,Ariyalur,T.Palur Panchayat Union 0ffice,                            1',
 'Total rainfall,25.1',
 'Average rainfall,3.14',
 '1,Revenue,Chengalpattu,Taluk Office, Thirukalukundram,                            45',
 '2,Revenue,Chengalpattu,Mammallapuram PWD bungalow,                            38',
 '3,Revenue,Chengalpattu,Keelampakkam,                            14.2',
 '4,Revenue,Chengalpattu,Taluk Office, Thiruporur,                            10',
 '5,Rev

In [333]:
#Make DataFrame for further data wrangling
df = pd.DataFrame(datalist, columns = ['A'])

In [334]:
df1 = pd.DataFrame(df)

In [335]:
df1

Unnamed: 0,A
0,"1,Revenue,Ariyalur,Suthamalli dam, ..."
1,"2,Revenue,Ariyalur,Kuruvadi, ..."
2,"3,Revenue,Ariyalur,Jayankondam taluk office, ..."
3,"4,Revenue,Ariyalur,PWD Office, Sendurai, ..."
4,"5,Revenue,Ariyalur,Taluk Office, Andimadam, ..."
...,...
198,"15,Revenue,Villupuram,RSCL-2 Kanjanur, ..."
199,"16,Revenue,Villupuram,RSCL-3 Valathy, ..."
200,"Total rainfall,63.4"
201,"Average rainfall,3.02"


In [336]:
# Split the Data into columns
df1 = df1.A.str.split(",",expand = True)
df1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,Revenue,Ariyalur,Suthamalli dam,5,,,,,
1,2,Revenue,Ariyalur,Kuruvadi,4,,,,,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,,,,,
3,4,Revenue,Ariyalur,PWD Office,Sendurai,3.6,,,,
4,5,Revenue,Ariyalur,Taluk Office,Andimadam,3,,,,
...,...,...,...,...,...,...,...,...,...,...
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,,,,,
199,16,Revenue,Villupuram,RSCL-3 Valathy,1,,,,,
200,Total rainfall,63.4,,,,,,,,
201,Average rainfall,3.02,,,,,,,,


In [337]:
df1.isna().sum()

0      0
1      0
2     42
3     42
4     42
5    192
6    202
7    202
8    202
9    202
dtype: int64

In [338]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       203 non-null    object
 1   1       203 non-null    object
 2   2       161 non-null    object
 3   3       161 non-null    object
 4   4       161 non-null    object
 5   5       11 non-null     object
 6   6       1 non-null      object
 7   7       1 non-null      object
 8   8       1 non-null      object
 9   9       1 non-null      object
dtypes: object(10)
memory usage: 16.0+ KB


In [339]:
df1 = df1.drop(columns = [6,7,8,9])
df1

Unnamed: 0,0,1,2,3,4,5
0,1,Revenue,Ariyalur,Suthamalli dam,5,
1,2,Revenue,Ariyalur,Kuruvadi,4,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,
3,4,Revenue,Ariyalur,PWD Office,Sendurai,3.6
4,5,Revenue,Ariyalur,Taluk Office,Andimadam,3
...,...,...,...,...,...,...
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,
199,16,Revenue,Villupuram,RSCL-3 Valathy,1,
200,Total rainfall,63.4,,,,
201,Average rainfall,3.02,,,,


In [340]:
# Unwanted rows
df1[df1[4].isna()]

Unnamed: 0,0,1,2,3,4,5
8,Total rainfall,25.1,,,,
9,Average rainfall,3.14,,,,
18,Total rainfall,120.2,,,,
19,Average rainfall,13.36,,,,
36,Total rainfall,51.7,,,,
37,Average rainfall,0.94,,,,
61,Total rainfall,140.2,,,,
62,Average rainfall,5.61,,,,
67,Total rainfall,3.0,,,,
68,Average rainfall,0.3,,,,


In [341]:
# Unwanted row list
delete_rows = df1[df1[4].isna()].index
delete_rows

Index([  8,   9,  18,  19,  36,  37,  61,  62,  67,  68,  71,  72,  74,  75,
        79,  80,  88,  89,  98,  99, 112, 113, 118, 119, 121, 122, 124, 125,
       145, 146, 152, 153, 162, 163, 174, 175, 178, 179, 182, 183, 200, 201],
      dtype='int64')

In [342]:
# The unwanted unwanted row add into the delete_rows list
delete_rows = delete_rows.append(df1[-1:].index)

In [343]:
delete_rows

Index([  8,   9,  18,  19,  36,  37,  61,  62,  67,  68,  71,  72,  74,  75,
        79,  80,  88,  89,  98,  99, 112, 113, 118, 119, 121, 122, 124, 125,
       145, 146, 152, 153, 162, 163, 174, 175, 178, 179, 182, 183, 200, 201,
       202],
      dtype='int64')

In [344]:
# delete the rows 
df1 = df1.drop(delete_rows,axis=0)

In [345]:
df1

Unnamed: 0,0,1,2,3,4,5
0,1,Revenue,Ariyalur,Suthamalli dam,5,
1,2,Revenue,Ariyalur,Kuruvadi,4,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,
3,4,Revenue,Ariyalur,PWD Office,Sendurai,3.6
4,5,Revenue,Ariyalur,Taluk Office,Andimadam,3
...,...,...,...,...,...,...
195,12,Revenue,Villupuram,RSCL-3 Avalurpettai,2,
196,13,Revenue,Villupuram,SCS MILL Arasoor,2,
197,14,Revenue,Villupuram,SCS MILL Thiruvennainallur,2,
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,


In [346]:
# Retrive text from colum no 4
df1[4.1] = df1[4].str.extract('([a-zA-Z]+)')

In [347]:
df1

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,4.1
0,1,Revenue,Ariyalur,Suthamalli dam,5,,
1,2,Revenue,Ariyalur,Kuruvadi,4,,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,,
3,4,Revenue,Ariyalur,PWD Office,Sendurai,3.6,Sendurai
4,5,Revenue,Ariyalur,Taluk Office,Andimadam,3,Andimadam
...,...,...,...,...,...,...,...
195,12,Revenue,Villupuram,RSCL-3 Avalurpettai,2,,
196,13,Revenue,Villupuram,SCS MILL Arasoor,2,,
197,14,Revenue,Villupuram,SCS MILL Thiruvennainallur,2,,
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,,


#### Combine columns 4.0 and 4.1 to create a valid location. using loc method

In [348]:
df1[4.1].dropna()

3               Sendurai
4              Andimadam
10      Thirukalukundram
13            Thiruporur
16              Tambaram
139           Kumbakonam
156            Tiruvarur
158             Nannilam
160    Thiruthuraipoondi
161          Muthupettai
Name: 4.1, dtype: object

In [349]:
df1[4.1].dropna().index

Index([3, 4, 10, 13, 16, 139, 156, 158, 160, 161], dtype='int64')

In [350]:
df1[4.1].dropna().values

array(['Sendurai', 'Andimadam', 'Thirukalukundram', 'Thiruporur',
       'Tambaram', 'Kumbakonam', 'Tiruvarur', 'Nannilam',
       'Thiruthuraipoondi', 'Muthupettai'], dtype=object)

In [351]:
df1[3].loc[df1[4.1].dropna().index] = df1[3].loc[df1[4.1].dropna().index] + "," + df1[4.1].dropna().values

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df1[3].loc[df1[4.1].dropna().index] = df1[3].loc[df1[4.1].dropna().index] + "," + df1[4.1].dropna().values


In [352]:
df1

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,4.1
0,1,Revenue,Ariyalur,Suthamalli dam,5,,
1,2,Revenue,Ariyalur,Kuruvadi,4,,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,,
3,4,Revenue,Ariyalur,"PWD Office,Sendurai",Sendurai,3.6,Sendurai
4,5,Revenue,Ariyalur,"Taluk Office,Andimadam",Andimadam,3,Andimadam
...,...,...,...,...,...,...,...
195,12,Revenue,Villupuram,RSCL-3 Avalurpettai,2,,
196,13,Revenue,Villupuram,SCS MILL Arasoor,2,,
197,14,Revenue,Villupuram,SCS MILL Thiruvennainallur,2,,
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,,


#### Combine the floating point values from columns 4.0 and 5.0 into a single column, ensuring that non-numeric data is properly handled.

In [353]:
df1[4].loc[df1[4.1].dropna().index]

3                Sendurai
4               Andimadam
10       Thirukalukundram
13             Thiruporur
16               Tambaram
139            Kumbakonam
156             Tiruvarur
158              Nannilam
160     Thiruthuraipoondi
161           Muthupettai
Name: 4, dtype: object

In [354]:
df1[5].loc[df1[4.1].dropna().index]

3                                   3.6
4                                     3
10                                   45
13                                   10
16                                    2
139                                 1.2
156                                  31
158                                14.2
160                                10.4
161                                 3.2
Name: 5.0, dtype: object

In [355]:
df1[4].loc[df1[4.1].dropna().index] = df1[5].loc[df1[4.1].dropna().index]

In [356]:
df1

Unnamed: 0,0.0,1.0,2.0,3.0,4.0,5.0,4.1
0,1,Revenue,Ariyalur,Suthamalli dam,5,,
1,2,Revenue,Ariyalur,Kuruvadi,4,,
2,3,Revenue,Ariyalur,Jayankondam taluk office,4,,
3,4,Revenue,Ariyalur,"PWD Office,Sendurai",3.6,3.6,Sendurai
4,5,Revenue,Ariyalur,"Taluk Office,Andimadam",3,3,Andimadam
...,...,...,...,...,...,...,...
195,12,Revenue,Villupuram,RSCL-3 Avalurpettai,2,,
196,13,Revenue,Villupuram,SCS MILL Arasoor,2,,
197,14,Revenue,Villupuram,SCS MILL Thiruvennainallur,2,,
198,15,Revenue,Villupuram,RSCL-2 Kanjanur,1,,


In [357]:
# Drop the unwanted columns to clean up the dataset.
df1 = df1.drop(columns = [0.0,5.0,4.1])

In [358]:
df1

Unnamed: 0,1.0,2.0,3.0,4.0
0,Revenue,Ariyalur,Suthamalli dam,5
1,Revenue,Ariyalur,Kuruvadi,4
2,Revenue,Ariyalur,Jayankondam taluk office,4
3,Revenue,Ariyalur,"PWD Office,Sendurai",3.6
4,Revenue,Ariyalur,"Taluk Office,Andimadam",3
...,...,...,...,...
195,Revenue,Villupuram,RSCL-3 Avalurpettai,2
196,Revenue,Villupuram,SCS MILL Arasoor,2
197,Revenue,Villupuram,SCS MILL Thiruvennainallur,2
198,Revenue,Villupuram,RSCL-2 Kanjanur,1


In [359]:
df1.columns

Index([1.0, 2.0, 3.0, 4.0], dtype='float64')

In [360]:
table_title

['வரிசை எண்',
 'உரிமையாளர்',
 'மாவட்டம்',
 'மழைமானி நிலையம்',
 'பதிவான மழை அளவு (மி.மீ.)']

In [361]:
# Assign appropriate column titles to the dataset for better clarity and organization.
column_Name = ['dept', 'dist', 'station', 'value']

In [362]:
df1.columns = column_Name

In [363]:
df1

Unnamed: 0,dept,dist,station,value
0,Revenue,Ariyalur,Suthamalli dam,5
1,Revenue,Ariyalur,Kuruvadi,4
2,Revenue,Ariyalur,Jayankondam taluk office,4
3,Revenue,Ariyalur,"PWD Office,Sendurai",3.6
4,Revenue,Ariyalur,"Taluk Office,Andimadam",3
...,...,...,...,...
195,Revenue,Villupuram,RSCL-3 Avalurpettai,2
196,Revenue,Villupuram,SCS MILL Arasoor,2
197,Revenue,Villupuram,SCS MILL Thiruvennainallur,2
198,Revenue,Villupuram,RSCL-2 Kanjanur,1


In [364]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160 entries, 0 to 199
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   dept     160 non-null    object
 1   dist     160 non-null    object
 2   station  160 non-null    object
 3   value    160 non-null    object
dtypes: object(4)
memory usage: 10.3+ KB


In [365]:
df1.value = df1.value.astype(float)

In [366]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160 entries, 0 to 199
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   dept     160 non-null    object 
 1   dist     160 non-null    object 
 2   station  160 non-null    object 
 3   value    160 non-null    float64
dtypes: float64(1), object(3)
memory usage: 10.3+ KB


In [367]:
# Reset the index and drop the old index to ensure a clean and sequential dataset.
df1 = df1.reset_index()

In [368]:
df1 = df1.drop(columns = 'index')

### Form a DataFrame using the current date as the data for processing.

In [369]:
len(df1)

160

In [370]:
# create DF to feed into the rainfall data 
DL = [Da] 
for i in range (len(df1)):
    DL.append(DL[0])

In [371]:
DL

['30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-2024',
 '30-12-20

In [372]:
C_Date = pd.DataFrame(DL,columns= ['date'])

In [373]:
C_Date.date

0      30-12-2024
1      30-12-2024
2      30-12-2024
3      30-12-2024
4      30-12-2024
          ...    
156    30-12-2024
157    30-12-2024
158    30-12-2024
159    30-12-2024
160    30-12-2024
Name: date, Length: 161, dtype: object

In [374]:
# Change the data type to datetime format for proper date handling 
C_Date.date = C_Date.date.astype('datetime64[ns]')

In [375]:
C_Date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 161 entries, 0 to 160
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    161 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 1.4 KB


In [376]:
# Join both DataFrames to merge their data for comprehensive analysis
Today_Rain_Fall = df1.join(C_Date)

In [377]:
Today_Rain_Fall.columns

Index(['dept', 'dist', 'station', 'value', 'date'], dtype='object')

In [378]:
# The final cleaned and processed dataset is ready for analysis or further use.
Today_Rain_Fall

Unnamed: 0,dept,dist,station,value,date
0,Revenue,Ariyalur,Suthamalli dam,5.0,2024-12-30
1,Revenue,Ariyalur,Kuruvadi,4.0,2024-12-30
2,Revenue,Ariyalur,Jayankondam taluk office,4.0,2024-12-30
3,Revenue,Ariyalur,"PWD Office,Sendurai",3.6,2024-12-30
4,Revenue,Ariyalur,"Taluk Office,Andimadam",3.0,2024-12-30
...,...,...,...,...,...
155,Revenue,Villupuram,RSCL-3 Avalurpettai,2.0,2024-12-30
156,Revenue,Villupuram,SCS MILL Arasoor,2.0,2024-12-30
157,Revenue,Villupuram,SCS MILL Thiruvennainallur,2.0,2024-12-30
158,Revenue,Villupuram,RSCL-2 Kanjanur,1.0,2024-12-30


In [379]:
Today_Rain_Fall.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   dept     160 non-null    object        
 1   dist     160 non-null    object        
 2   station  160 non-null    object        
 3   value    160 non-null    float64       
 4   date     160 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 6.4+ KB


# Fusing Historical and Updated Rainfall Data for Advanced Analysis

In [380]:
# Read File
TN_Rain_Fall_History = pd.read_csv(r"C:\Users\Arunprakash Babu\OneDrive\ドキュメント\GitHub\Portfolios\Projects\Pb5_Rainfall data analysis of TN\Total_Rain_Fall_Data.csv", index_col = 0)

In [381]:
TN_Rain_Fall_History

Unnamed: 0_level_0,dept,dist,station,value,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Revenue,The Nilgiris,Devala,5.0,1990-02-13
1,Revenue,Coimbatore,"Taluk Office, Pollachi",8.0,1990-10-17
2,Revenue,Coimbatore,"Taluk Office, Pollachi",34.0,1990-10-18
3,Revenue,Mayiladuthurai,Anaikaranchatram (Kollidam),13.0,1990-10-18
4,Revenue,Mayiladuthurai,Sirkali,12.4,1990-10-18
...,...,...,...,...,...
651039,Revenue,Villupuram,RSCL-3 Avalurpettai,2.0,2024-12-30
651040,Revenue,Villupuram,SCS MILL Arasoor,2.0,2024-12-30
651041,Revenue,Villupuram,SCS MILL Thiruvennainallur,2.0,2024-12-30
651042,Revenue,Villupuram,RSCL-2 Kanjanur,1.0,2024-12-30


In [382]:
TN_Rain_Fall_History.info()

<class 'pandas.core.frame.DataFrame'>
Index: 651044 entries, 0 to 651043
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   dept     651044 non-null  object 
 1   dist     651044 non-null  object 
 2   station  651044 non-null  object 
 3   value    651044 non-null  float64
 4   date     651044 non-null  object 
dtypes: float64(1), object(4)
memory usage: 29.8+ MB


## Change datatype of date colum

In [383]:
TN_Rain_Fall_History.date = pd.to_datetime(TN_Rain_Fall_History.date, dayfirst = True)

  TN_Rain_Fall_History.date = pd.to_datetime(TN_Rain_Fall_History.date, dayfirst = True)


In [384]:
TN_Rain_Fall_History.info()

<class 'pandas.core.frame.DataFrame'>
Index: 651044 entries, 0 to 651043
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   dept     651044 non-null  object        
 1   dist     651044 non-null  object        
 2   station  651044 non-null  object        
 3   value    651044 non-null  float64       
 4   date     651044 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 29.8+ MB


In [385]:
Today_Rain_Fall.loc[1]

dept                   Revenue
dist                  Ariyalur
station               Kuruvadi
value                      4.0
date       2024-12-30 00:00:00
Name: 1, dtype: object

In [386]:
TN_Rain_Fall_History.loc[len(TN_Rain_Fall_History)-1][4]

  TN_Rain_Fall_History.loc[len(TN_Rain_Fall_History)-1][4]


Timestamp('2024-12-30 00:00:00')

In [387]:
Today_Rain_Fall.loc[0][4]

  Today_Rain_Fall.loc[0][4]


Timestamp('2024-12-30 00:00:00')

## Fusing Two DataFrames for Comprehensive Data Processing

In [388]:
if TN_Rain_Fall_History.loc[len(TN_Rain_Fall_History)-1][4] != Today_Rain_Fall.loc[0][4] :
    Total_Rain_Fall_Data = pd.concat([TN_Rain_Fall_History,Today_Rain_Fall])
    print("Data Added")
else :
    Total_Rain_Fall_Data = TN_Rain_Fall_History
    print("No Change")

No Change


  if TN_Rain_Fall_History.loc[len(TN_Rain_Fall_History)-1][4] != Today_Rain_Fall.loc[0][4] :


In [389]:
Total_Rain_Fall_Data

Unnamed: 0_level_0,dept,dist,station,value,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Revenue,The Nilgiris,Devala,5.0,1990-02-13
1,Revenue,Coimbatore,"Taluk Office, Pollachi",8.0,1990-10-17
2,Revenue,Coimbatore,"Taluk Office, Pollachi",34.0,1990-10-18
3,Revenue,Mayiladuthurai,Anaikaranchatram (Kollidam),13.0,1990-10-18
4,Revenue,Mayiladuthurai,Sirkali,12.4,1990-10-18
...,...,...,...,...,...
651039,Revenue,Villupuram,RSCL-3 Avalurpettai,2.0,2024-12-30
651040,Revenue,Villupuram,SCS MILL Arasoor,2.0,2024-12-30
651041,Revenue,Villupuram,SCS MILL Thiruvennainallur,2.0,2024-12-30
651042,Revenue,Villupuram,RSCL-2 Kanjanur,1.0,2024-12-30


In [390]:
# Checking row counts modified
print(len(TN_Rain_Fall_History))
print(len(Today_Rain_Fall))
print(len(TN_Rain_Fall_History)+len(Today_Rain_Fall))
print(len(Total_Rain_Fall_Data.index))
(len(TN_Rain_Fall_History)+len(Today_Rain_Fall)) == len(Total_Rain_Fall_Data.index)

651044
160
651204
651044


False

In [391]:
# Reset Index value
Total_Rain_Fall_Data = Total_Rain_Fall_Data.reset_index(drop = True)

In [392]:
Total_Rain_Fall_Data.index.name = 'id'

In [393]:
Total_Rain_Fall_Data

Unnamed: 0_level_0,dept,dist,station,value,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Revenue,The Nilgiris,Devala,5.0,1990-02-13
1,Revenue,Coimbatore,"Taluk Office, Pollachi",8.0,1990-10-17
2,Revenue,Coimbatore,"Taluk Office, Pollachi",34.0,1990-10-18
3,Revenue,Mayiladuthurai,Anaikaranchatram (Kollidam),13.0,1990-10-18
4,Revenue,Mayiladuthurai,Sirkali,12.4,1990-10-18
...,...,...,...,...,...
651039,Revenue,Villupuram,RSCL-3 Avalurpettai,2.0,2024-12-30
651040,Revenue,Villupuram,SCS MILL Arasoor,2.0,2024-12-30
651041,Revenue,Villupuram,SCS MILL Thiruvennainallur,2.0,2024-12-30
651042,Revenue,Villupuram,RSCL-2 Kanjanur,1.0,2024-12-30


In [394]:
# Export all data
Total_Rain_Fall_Data.to_csv(r"C:\Users\Arunprakash Babu\OneDrive\ドキュメント\GitHub\Portfolios\Projects\Pb5_Rainfall data analysis of TN\Total_Rain_Fall_Data.csv")