## Cleaning

In [1]:
import pandas as pd
# Load the raw Delhi CSV. The first 9 lines of the file are skipped before
# the header row is read (presumably a metadata preamble -- confirm against
# the raw file).
raw_csv_path = "delhi.csv"
preamble_line_count = 9
df = pd.read_csv(raw_csv_path, skiprows=preamble_line_count)

In [2]:
df.head()

Unnamed: 0,YEAR,DOY,T2M_MAX
0,2002,121,42.79
1,2002,122,43.63
2,2002,123,44.57
3,2002,124,45.19
4,2002,125,45.76


In [3]:
# Drop every row that has at least one missing value. Rebinding (instead of
# inplace=True) keeps the cell free of in-place mutation.
df = df.dropna()

# Sanity check: the original null-row lookup was evaluated and thrown away;
# make it an explicit assertion so a regression is actually visible.
assert not df.isnull().any(axis=1).any(), "rows with missing values remain after dropna"

# Build a calendar date from YEAR + day-of-year (DOY).
# - Cast to int first: if either column was read as float, astype(str) would
#   yield text like "2002.0" that '%Y%j' cannot parse.
# - Zero-pad DOY to three digits so the concatenated string is always the
#   fixed-width form YYYYDDD.
year_str = df['YEAR'].astype(int).astype(str)
doy_str = df['DOY'].astype(int).astype(str).str.zfill(3)
df = df.assign(
    DATE=pd.to_datetime(year_str + doy_str, format='%Y%j').dt.strftime('%d-%m-%Y')
)


In [4]:
df.head()

Unnamed: 0,YEAR,DOY,T2M_MAX,DATE
0,2002,121,42.79,01-05-2002
1,2002,122,43.63,02-05-2002
2,2002,123,44.57,03-05-2002
3,2002,124,45.19,04-05-2002
4,2002,125,45.76,05-05-2002


In [5]:
# Keep only the columns used downstream. Columns that are not listed (e.g.
# DOY) are dropped simply by not being selected; the earlier df.drop(...) call
# discarded its result and therefore had no effect, so it is removed.
# .copy() makes the selection an independent frame so that later column
# assignments do not write into a slice of the previous frame.
df = df[['DATE', 'T2M_MAX', 'YEAR']].copy()

In [6]:
df.head()

Unnamed: 0,DATE,T2M_MAX,YEAR
0,01-05-2002,42.79,2002
1,02-05-2002,43.63,2002
2,03-05-2002,44.57,2002
3,04-05-2002,45.19,2002
4,05-05-2002,45.76,2002


In [8]:
# Add two derived columns:
#   HEATWAVE - True where T2M_MAX is strictly greater than 40
#              (units are not shown here; presumably degrees Celsius -- confirm)
#   CITY     - constant label for every row
df = df.assign(
    HEATWAVE=df['T2M_MAX'].gt(40),
    CITY='Delhi',
)

In [9]:
df.head()

Unnamed: 0,DATE,T2M_MAX,YEAR,HEATWAVE,CITY
0,01-05-2002,42.79,2002,True,Delhi
1,02-05-2002,43.63,2002,True,Delhi
2,03-05-2002,44.57,2002,True,Delhi
3,04-05-2002,45.19,2002,True,Delhi
4,05-05-2002,45.76,2002,True,Delhi


In [10]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="cleaned_delhi.csv", index=False)

## Population

In [1]:
import numpy as np
import pandas as pd
# Known population figures for the two anchor years, 2001 and 2011.
years, population = [2001, 2011], [13204122, 16787941]

# One entry per year from the first anchor to the last (both included),
# filled by straight-line interpolation between the two anchors.
interp_years = np.arange(years[0], years[-1] + 1)
interp_population = np.interp(interp_years, years, population)

# Assemble the yearly table; astype(int) truncates the fractional part of
# each interpolated value.
df_pop = pd.DataFrame({'YEAR': interp_years})
df_pop['POPULATION'] = interp_population.astype(int)
print(df_pop)

    YEAR  POPULATION
0   2001    13204122
1   2002    13562503
2   2003    13920885
3   2004    14279267
4   2005    14637649
5   2006    14996031
6   2007    15354413
7   2008    15712795
8   2009    16071177
9   2010    16429559
10  2011    16787941


In [7]:
# extrapolate the population beyond 2011

In [13]:
# Compound annual growth rate implied by the two population figures below,
# taken over a 10-year span: (end / start) ** (1 / years) - 1.
start_population, end_population = 13204122, 16787941
growth_rate = (end_population / start_population) ** (1 / 10) - 1


In [14]:
# Keep only rows with YEAR before 2012 and take an independent copy, so any
# rows for 2012 onward from a previous run are discarded.
df_pop = df_pop.loc[df_pop['YEAR'].lt(2012)].copy()

In [15]:
# Project the population for 2012-2022 by compounding the previous year's
# value at growth_rate, truncating to an integer each year (same arithmetic
# as before, so the resulting numbers are unchanged).
#
# Rows are collected in a list and concatenated once instead of appending to
# the DataFrame with .loc[len(df_pop)] inside the loop: that pattern relies on
# a clean 0..n-1 index and adds duplicate years if the cell is run twice.
# Restricting the base to YEAR < 2012 here makes the cell safe to re-run.
base_rows = df_pop[df_pop['YEAR'] < 2012]
last_pop = int(base_rows.loc[base_rows['YEAR'] == 2011, 'POPULATION'].iloc[0])

projected_rows = []
for year in range(2012, 2023):
    last_pop = int(last_pop * (1 + growth_rate))
    projected_rows.append({'YEAR': year, 'POPULATION': last_pop})

df_pop = (
    pd.concat([base_rows, pd.DataFrame(projected_rows)], ignore_index=True)
    .sort_values('YEAR')
    .reset_index(drop=True)
)

In [16]:
df_pop.to_csv("population_delhi.csv", index=False)

## Merging

In [17]:
import pandas as pd
# Read back the two CSV files written by the earlier sections.
df1 = pd.read_csv("cleaned_delhi.csv")
df2 = pd.read_csv("population_delhi.csv")

# Attach the yearly population to every temperature row.
# - how='left' keeps all rows of df1; a YEAR with no match in df2 gets a
#   missing POPULATION value.
# - validate='many_to_one' raises pandas.errors.MergeError if df2 contains the
#   same YEAR more than once, instead of silently duplicating rows of df1.
merged_df = pd.merge(df1, df2, on='YEAR', how='left', validate='many_to_one')

In [18]:
# Persist the merged frame; index=False keeps the row index out of the file.
merged_df.to_csv(path_or_buf="merged_delhi.csv", index=False)