## Cleaning

In [1]:
import pandas as pd
# Load the raw CSV, skipping the first 9 lines of the file
# (presumably a metadata preamble ahead of the header row — confirm against the raw file)
df = pd.read_csv("bangalore.csv", skiprows=9)

In [2]:
# Preview the first rows of the freshly loaded frame to check the parsed columns
df.head()

Unnamed: 0,YEAR,DOY,T2M_MAX
0,2002,121,39.88
1,2002,122,38.06
2,2002,123,37.77
3,2002,124,35.57
4,2002,125,38.87


In [3]:
# Drop incomplete rows, then derive a DD-MM-YYYY date string from YEAR + day-of-year.
# Re-binding `df` (instead of inplace=True) keeps the cell free of in-place mutation.
df = df.dropna()

# Sanity check replacing the former bare `df[df.isnull().any(axis=1)]` expression,
# which was neither displayed nor assigned and therefore had no effect.
assert not df.isnull().values.any(), "missing values remain after dropna()"

# Cast to int before stringifying: if the columns were read as floats (which happens
# when the raw file contained NaNs), astype(str) would give "2002.0" and break parsing.
year_str = df['YEAR'].astype(int).astype(str)
# %j is the zero-padded day of year, so pad DOY to three digits (1 -> "001").
doy_str = df['DOY'].astype(int).astype(str).str.zfill(3)

# assign() returns a new frame with DATE appended as the last column.
df = df.assign(
    DATE=pd.to_datetime(year_str + doy_str, format='%Y%j').dt.strftime('%d-%m-%Y')
)

In [4]:
# Keep only the columns needed downstream. The former `df.drop(df.columns[[1]], axis=1)`
# call discarded its result, so it never changed `df`; selecting the wanted columns
# already leaves every other column (DOY included) behind.
# .copy() makes the result an independent frame, so columns added later do not
# trigger SettingWithCopyWarning.
df = df[['DATE', 'T2M_MAX', 'YEAR']].copy()

In [5]:
# Flag days whose maximum temperature exceeds the heatwave threshold and tag the city.
# assign() returns a new frame, so nothing is written into a frame that may be a
# slice of an earlier one (the usual source of SettingWithCopyWarning).
HEATWAVE_THRESHOLD = 40  # same units as T2M_MAX (presumably °C — confirm against the data source)
df = df.assign(
    HEATWAVE=df['T2M_MAX'] > HEATWAVE_THRESHOLD,
    CITY='Bengaluru',
)

In [6]:
# Inspect the first rows after adding the DATE, HEATWAVE and CITY columns
df.head()

Unnamed: 0,DATE,T2M_MAX,YEAR,HEATWAVE,CITY
0,01-05-2002,39.88,2002,False,Bengaluru
1,02-05-2002,38.06,2002,False,Bengaluru
2,03-05-2002,37.77,2002,False,Bengaluru
3,04-05-2002,35.57,2002,False,Bengaluru
4,05-05-2002,38.87,2002,False,Bengaluru


In [7]:
# Persist the cleaned frame; index=False keeps the row index out of the CSV
df.to_csv("cleaned_bangalore.csv", index=False)

## Population

In [20]:
import numpy as np
import pandas as pd
# Known population figures for 2001 and 2011 (the two anchor points)
years = [2001, 2011]
population = [6537124, 9621551]

# Linearly interpolate one value per year over the whole 2001–2011 span;
# both endpoints are included, so the anchor figures themselves are reproduced
interp_years = np.arange(years[0], years[-1] + 1)
interp_population = np.interp(interp_years, years, population)

# Assemble the yearly table; astype(int) truncates the fractional part
df_pop = pd.DataFrame({'YEAR': interp_years}).assign(
    POPULATION=interp_population.astype(int)
)
print(df_pop)

    YEAR  POPULATION
0   2001     6537124
1   2002     6845566
2   2003     7154009
3   2004     7462452
4   2005     7770894
5   2006     8079337
6   2007     8387780
7   2008     8696222
8   2009     9004665
9   2010     9313108
10  2011     9621551


In [18]:
# Compound annual growth rate implied by the 2001 -> 2011 population change:
# the tenth root of the overall ratio, minus one
population_ratio = 9621551 / 6537124
span_years = 10
growth_rate = population_ratio ** (1 / span_years) - 1

In [21]:
# Keep only rows up to and including 2011, as an independent copy
# (presumably so the projection step always starts from the interpolated base — confirm)
df_pop = df_pop.loc[df_pop['YEAR'].lt(2012)].copy()

In [22]:
# Project 2012–2022 by compounding the previous year's population at `growth_rate`.
# Each projected value is truncated to an int before it seeds the next year, which
# matches the original row-by-row computation exactly.
first_projected, last_projected = 2012, 2022

# Start from the rows before the projection window so that re-running this cell
# replaces earlier projections instead of appending duplicate years.
known = df_pop[df_pop['YEAR'] < first_projected]
last_pop = int(known.loc[known['YEAR'] == first_projected - 1, 'POPULATION'].iloc[0])

# Collect the projected rows in a list and build the frame once, rather than
# enlarging the DataFrame one row at a time.
projected = []
for year in range(first_projected, last_projected + 1):
    last_pop = int(last_pop * (1 + growth_rate))
    projected.append({'YEAR': year, 'POPULATION': last_pop})

df_pop = (
    pd.concat([known, pd.DataFrame(projected)], ignore_index=True)
    .sort_values('YEAR')
    .reset_index(drop=True)
)


In [23]:
# Show the last 15 rows to eyeball the end of the interpolated range and the projection
print(df_pop.iloc[-15:])

    YEAR  POPULATION
7   2008     8696222
8   2009     9004665
9   2010     9313108
10  2011     9621551
11  2012    10000712
12  2013    10394814
13  2014    10804447
14  2015    11230222
15  2016    11672776
16  2017    12132770
17  2018    12610891
18  2019    13107854
19  2020    13624401
20  2021    14161304
21  2022    14719365


In [24]:
# Save the yearly population table; index=False omits the row index from the CSV
df_pop.to_csv("population_bangalore.csv", index=False)

## Merging

In [25]:
import pandas as pd
# Reload the two intermediate CSVs written by the earlier sections
df1 = pd.read_csv("cleaned_bangalore.csv")      # temperature rows (one per date)
df2 = pd.read_csv("population_bangalore.csv")   # expected: one population row per YEAR

# Left join keeps every temperature row; years absent from df2 get NaN POPULATION.
# validate='many_to_one' raises MergeError if df2 contains duplicate YEAR values,
# instead of silently multiplying the temperature rows.
merged_df = pd.merge(df1, df2, on='YEAR', how='left', validate='many_to_one')

In [26]:
# Write the merged frame to disk without the row index
merged_df.to_csv("merged_bangalore.csv", index=False)