# 1. Introduction

**The initial dataset I used in this notebook is from <a href='https://opendatanepal.com/dataset/preliminary-data-of-national-population-and-housing-census-2021'>'opendatanepal.com'</a>. It contains Preliminary Data of National Population and Housing Census 2021. Scroll below to see more about the initial dataset.**

**Using web scraping, I added some columns to the initial dataset for analysis purposes.**

# 2. Initial dataset

In [None]:
import sqlite3

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import requests

import urllib.request

import re
from bs4 import BeautifulSoup

from requests.exceptions import Timeout
from IPython.display import clear_output

In [None]:
# Download the preliminary 2021 census CSV from Open Data Nepal into the current
# working directory. `urllib.request` is already imported in the imports cell,
# so it is not re-imported here.

link_to_initial_dataset = 'https://opendatanepal.com/dataset/6368a9aa-4649-46e6-925a-ebbff0c49fc1/resource/b11d363a-91c3-4332-ba14-ee50b3a12ec4/download/preliminary-data-of-national-population-and-housing-census-2021-english.csv'

filename = "preliminary-data-of-national-population-and-housing-census-2021-english.csv"

urllib.request.urlretrieve(link_to_initial_dataset, filename)

In [None]:
# Read the CSV through the relative `filename` it was downloaded to, instead of a
# hard-coded '/kaggle/working/...' path, so the notebook also runs outside Kaggle.
# Rows are then ordered by district and local-level name.
census_2021_df = pd.read_csv(filename)
census_2021_df = census_2021_df.sort_values(by=['District', 'Local Level Name'], ascending=True)

In [None]:
# Sorting kept the original row labels; replace them with a fresh 0..n-1 index
# (drop=True discards the old labels instead of adding them as a column).
census_2021_df = census_2021_df.reset_index(drop=True)

In [None]:
census_2021_df.head()

**I'll add a new column in the 3rd column position to indicate the type of each local government body: "Nagarpalika" (Municipality) or "Gaupalika" (Rural Municipality).**

In [None]:
# Classify every local level by its name: names containing "rural" (any casing)
# are Gaupalika (Rural Municipality); everything else is Nagarpalika (Municipality).
# The two labels were previously swapped.
local_government_type = [
    'Gaupalika (Rural Municipality)' if 'rural' in local_level_name.lower()
    else 'Nagarpalika (Municipality)'
    for local_level_name in census_2021_df['Local Level Name']
]

# Insert the new column into the 3rd column position.
# NOTE: DataFrame.insert raises ValueError if this cell is re-run, because the
# column then already exists.
census_2021_df.insert(2, 'local_government_type', local_government_type)

In [None]:
# Switch the census headers to snake_case names.
column_renames = {
    'District': 'district',
    'Local Level Name': 'local_government_name',
    'Total family number': 'total_family_number',
    'Total household number': 'total_household_number',
    'Total population': 'total_population_number',
    'Total Male': 'total_male_number',
    'Total Female': 'total_female_number',
}
census_2021_df = census_2021_df.rename(columns=column_renames)

In [None]:
census_2021_df

In [None]:
census_2021_df.info()

**The data is good enough to analyze as it is, but it would benefit from additional information such as each district's latitude and longitude.**

# 3. Scraping Districts' Provinces & Coordinates

**Although Wikipedia is not always considered a reliable source, I'm fairly confident that its coordinate, zone and province data are accurate. So I'll use Wikipedia to scrape each district's coordinates.**

In [None]:
# Build a one-column frame holding the distinct district names in alphabetical
# order; latitude and longitude columns get attached to it later.

district_names = sorted(census_2021_df['district'].unique())

district_coordinates = pd.DataFrame({'district': district_names})

In [None]:
REQUEST_TIMEOUT_SECONDS = 30  # per-request cap so one stalled page cannot hang the whole run


def _dms_to_decimal(dms_text, hemisphere_letters):
    """Convert a degrees°minutes′seconds″ string (e.g. '27°42′N') to decimal degrees.

    dms_text: text of the coordinate span.
    hemisphere_letters: characters (e.g. 'Nn' or 'Ee') removed before parsing.
    An empty seconds part counts as 0.
    Raises ValueError when the degree/minute marks are missing or a part is not numeric.
    """
    cleaned = dms_text.replace('″', '')
    for letter in hemisphere_letters:
        cleaned = cleaned.replace(letter, '')
    degrees, remainder = cleaned.split('°')
    minutes, seconds = remainder.split('′')
    if seconds == '':
        seconds = 0
    return float(degrees) + (float(minutes) / 60) + (float(seconds) / 3600)


# One entry per district, in the same order as district_coordinates['district'].
latitudes = []
longitudes = []
provinces = []

scraped_districts_counter = 0
cannot_scrape_districts_counter = 0

province_successful = 0
province_unsuccessful = 0

for district in district_coordinates['district']:
    if district == 'Bhojpur':
        # Bhojpur uses an explicit ",_Nepal" page title (presumably the plain
        # title is ambiguous on Wikipedia).
        link = 'https://en.wikipedia.org/wiki/Bhojpur_District,_Nepal'
    else:
        link = f"https://en.wikipedia.org/wiki/{district.title()}_District" #This is the website this script uses to scrape needed information

    try:
        html = requests.get(link, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.exceptions.RequestException:
        # Abort the whole scrape and throw away partial results for all three
        # columns so they cannot be attached to the frame half-filled.
        print(f'Cannot establish connection to "{link}"')
        latitudes = []
        longitudes = []
        provinces = []
        break

    # Turn the response into a BeautifulSoup object to crawl through the page
    soup = BeautifulSoup(html.content, 'html.parser')

    try:
        coordinates = soup.body.find("span", {"id": "coordinates"}).span.span.find("span", {"class": "geo-dms"})
        latitude = _dms_to_decimal(coordinates.find("span", {"class": "latitude"}).text, 'Nn')
        longitude = _dms_to_decimal(coordinates.find("span", {"class": "longitude"}).text, 'Ee')
        scraped_districts_counter += 1
    except (AttributeError, ValueError, TypeError):
        # Page has no coordinates span or the text is not in the expected
        # format: record both values as missing.
        latitude = np.nan
        longitude = np.nan
        cannot_scrape_districts_counter += 1

    latitudes.append(latitude)
    longitudes.append(longitude)

    try:
        # First link in the first "mergedrow" of the infobox.
        inner_province = soup.body.find("table", {"class": ['infobox', 'ib-settlement', 'vcard']}).tbody.find("tr", {"class": 'mergedrow'}).find("td", {"class": 'infobox-data'}).a.text.strip().title()
        province_successful += 1
    except (AttributeError, TypeError):
        inner_province = np.nan
        province_unsuccessful += 1
    provinces.append(inner_province)

    # Live progress report, redrawn in place for every district
    clear_output(wait=True)
    print(f"Current link: {link}")
    print(f"Number of scrapped district's data:")
    print(f"\tCoordinates Successful = {scraped_districts_counter}\n")
    print(f"\tCoordinates Failed = {cannot_scrape_districts_counter}")
    print(f"\tProvinces Successful (Including Repitations) = {province_successful}")
    print(f"\tProvinces Failed = {province_unsuccessful}\n")

**The Wikipedia pages of 14 districts did not include coordinates. Let's add the scraped latitudes, longitudes and provinces to the single-column dataframe created above.**

In [None]:
# Attach the scraped lists as three new columns (row order matches the scrape order).
district_coordinates = district_coordinates.assign(
    latitude=latitudes,
    longitude=longitudes,
    province=provinces,
)

In [None]:
district_coordinates

# 3.1. Data wrangling - Provinces

In [None]:
district_coordinates['province'].value_counts()

**There shouldn't be a 'Coordination Committee' province. So, let's find out which district it belongs to and give it its proper province name.**

In [None]:
district_coordinates[district_coordinates['province'] == 'Coordination Committee']

**Jajarkot is in 'Karnali Province'. So, fixing it.**

In [None]:
# Select Jajarkot by name rather than by the positional label 26, so the fix
# still hits the right row if the district list or its ordering ever changes.
jajarkot_rows = district_coordinates['district'] == 'Jajarkot'
district_coordinates.loc[jajarkot_rows, 'province'] = 'Karnali Province'
district_coordinates.loc[jajarkot_rows, 'province']

In [None]:
district_coordinates['province'].value_counts()

**All "Province No. 1" and "Koshi Pradesh" are actually "Koshi Province". So, fixing it.**

In [None]:
# Row labels whose province is one of the older names for Koshi Province
koshi_filtered = district_coordinates.index[
    district_coordinates['province'].isin(['Province No. 1', 'Koshi Pradesh'])
]

district_coordinates.loc[koshi_filtered, 'province'] = 'Koshi Province'

In [None]:
district_coordinates['province'].value_counts()

**All "Province No. 2" are actually "Madhesh Province". So, fixing it.**

In [None]:
# Row labels still carrying the old "Province No. 2" name
madesh_filtered = district_coordinates.index[
    district_coordinates['province'].eq('Province No. 2')
]

district_coordinates.loc[madesh_filtered, 'province'] = 'Madhesh Province'

In [None]:
district_coordinates['province'].value_counts()

In [None]:
# value_counts() ignores NaN, so its sum is the number of districts that have a
# province; isnull().sum() counts the ones still missing.
print(f"{district_coordinates['province'].value_counts().sum()} districts have provinces declared and {district_coordinates['province'].isnull().sum()} don't!")

In [None]:
# Rows whose province could not be scraped
province_missing = district_coordinates['province'].isna()
to_fill_province = district_coordinates.loc[province_missing]
to_fill_province

In [None]:
# Manually looked-up provinces for the rows listed in `to_fill_province`,
# given in the same (alphabetical) row order as that frame.
to_fill_province_index = to_fill_province.index
corresponding_provinces_in_order = ['Koshi Province', 
                                    'Bagmati Province', 
                                    'Lumbini Province', 
                                    'Koshi Province', 
                                    'Koshi Province', 
                                    'Sudurpashchim Province', 
                                    'Koshi Province', 
                                    'Bagmati Province', 
                                    'Koshi Province', 
                                    'Gandaki Province', 
                                    'Gandaki Province', 
                                    'Lumbini Province', 
                                    'Koshi Province',
                                    'Lumbini Province', 
                                    'Karnali Province', 
                                    'Karnali Province', 
                                    'Koshi Province', 
                                    'Gandaki Province', 
                                    'Koshi Province']

# zip() stops at the shorter input without complaint. If a fresh scrape leaves a
# different set of rows empty, the positional pairing below is no longer valid,
# so say so loudly instead of filling wrong provinces unnoticed.
if len(to_fill_province_index) != len(corresponding_provinces_in_order):
    print(
        f"WARNING: {len(to_fill_province_index)} districts lack a province but "
        f"{len(corresponding_provinces_in_order)} manual values are listed; "
        "the positional fill below may be misaligned or incomplete."
    )

for province_index, actual_val in zip(to_fill_province_index, corresponding_provinces_in_order):
    district_coordinates.loc[province_index, 'province'] = actual_val

In [None]:
district_coordinates['province'].value_counts()

# 3.2. Data wrangling - Coordinates

In [None]:
# Rows missing a latitude, a longitude, or both
coordinate_missing = district_coordinates[['latitude', 'longitude']].isna().any(axis=1)
to_fill_coordinates = district_coordinates.loc[coordinate_missing]
to_fill_coordinates

**So, we need to fill in the coordinates of these 14 districts manually. I'll take the latitudes and longitudes of the corresponding districts from Google Maps.**

In [None]:
# Manually looked-up coordinates for the rows listed in `to_fill_coordinates`,
# given in the same row order as that frame.
to_fill_coordinates_index = to_fill_coordinates.index

to_fill_corresponding_latitudes_in_order = [27.108381,27.678777, 27.586242, 27.971370, 26.839804, 27.662786, 27.819579, 27.525530, 28.682423, 28.745739, 28.433012, 26.580362, 26.865413]
to_fill_corresponding_longitudes_in_order = [85.069835, 85.431211, 84.478300, 82.422960, 86.027870, 85.321435, 85.627741, 83.706729,82.795816, 82.429728, 82.153392, 86.719284, 86.674491]

# NOTE(review): both lists hold 13 values. zip() stops at the shortest input, so
# any extra row missing coordinates stays NaN, and a different set of missing rows
# would be filled with the wrong coordinates. Warn instead of truncating silently.
manual_pair_count = min(len(to_fill_corresponding_latitudes_in_order),
                        len(to_fill_corresponding_longitudes_in_order))
if (len(to_fill_corresponding_latitudes_in_order) != len(to_fill_corresponding_longitudes_in_order)
        or len(to_fill_coordinates_index) != manual_pair_count):
    print(
        f"WARNING: {len(to_fill_coordinates_index)} districts lack coordinates but "
        f"{manual_pair_count} manual latitude/longitude pairs are listed; "
        "the positional fill below may be misaligned or incomplete."
    )

for coordinate_index, lat, lon in zip(to_fill_coordinates_index, to_fill_corresponding_latitudes_in_order, to_fill_corresponding_longitudes_in_order):
    district_coordinates.loc[coordinate_index, 'latitude'] = lat
    district_coordinates.loc[coordinate_index, 'longitude'] = lon

In [None]:
district_coordinates.isnull().sum()

In [None]:
district_coordinates

# Scraping Districts' Headquarters & Area

**I'll be scraping each district's headquarters and area from <a href='https://kpadhne.com/77-districts-of-nepal/'>"kpadhne.com"</a>.**

In [None]:
link = 'https://kpadhne.com/77-districts-of-nepal/'

# District spellings produced from the kpadhne.com tables -> spellings used in the census data
KPADHNE_TO_CENSUS_NAMES = {
    'Nawalpur': 'Nawalparasi East',
    'Parasi': 'Nawalparasi West',
    'Eastern Rukum': 'Rukum East',
    'Western Rukum': 'Rukum West',
    'Chitwan': 'Chitawan',
    'Kapilvastu': 'Kapilbastu',
    'Tanahun': 'Tanahu',
}

try:
    html = requests.get(link, timeout=30)
except requests.exceptions.RequestException:
    print(f'Cannot establish connection to "{link}"')
    # Stop here: carrying on would parse whatever `html` an earlier cell left behind.
    raise

# Turn the response into a BeautifulSoup object to crawl through the page
soup1 = BeautifulSoup(html.content, 'html.parser')

# Every table in the article body; table 10 is read for headquarters and table 11 for areas.
entry_tables = soup1.body.find("div", {"class": "entry-content"}).find_all('table')

# --- district -> headquarters -------------------------------------------------
district_with_headquarters = entry_tables[10].find_all('tr')
district_and_headquarters = {}
for row in district_with_headquarters[1:]:  # first row is skipped (header)
    cells = row.find_all('td')
    d_name = cells[0].text.strip().title()
    d_name = KPADHNE_TO_CENSUS_NAMES.get(d_name, d_name)
    district_and_headquarters[d_name] = cells[1].text.strip().title()

# Keep the mapping ordered alphabetically by district name
district_and_headquarters = {k: district_and_headquarters[k] for k in sorted(district_and_headquarters)}

# --- district -> area ---------------------------------------------------------
district_with_area = entry_tables[11].find_all('tr')
district_and_area = {}
for row in district_with_area[1:]:  # first row is skipped (header)
    cells = row.find_all('td')
    words = cells[0].text.strip().title().split(' ')
    if len(words) in (2, 3):
        # Two- and three-word names lose their last word (presumably a trailing
        # "District" suffix - verify against the page).
        d_name = ' '.join(words[:-1])
    else:
        # Any other word count is kept whole; previously the list itself was used
        # as the dict key, which raised TypeError (unhashable list).
        d_name = ' '.join(words)
    d_name = KPADHNE_TO_CENSUS_NAMES.get(d_name, d_name)
    district_and_area[d_name] = cells[1].text.strip().title()

district_and_area = {k: district_and_area[k] for k in sorted(district_and_area)}


In [None]:
# Temporary frame of the scraped district names, headquarters and areas.
# The area is looked up by district name rather than taken positionally from a
# second, independently sorted dict, so a name present in only one of the two
# tables can neither shift the values nor break the frame construction
# (a missing area becomes None/NaN).
temp_df = pd.DataFrame({
                        'district': list(district_and_headquarters.keys()),
                        'district_headquarters': list(district_and_headquarters.values()),
                        'area_km_squared': [district_and_area.get(name) for name in district_and_headquarters]
                       })
# Left join: keep every row of district_coordinates and attach headquarters/area where the name matches
district_df = pd.merge(district_coordinates, temp_df, on='district', how='left')

In [None]:
district_df

# Scraping Region Information

**I'll be scraping each district's region and zone from <a href='http://www.statoids.com/ynp.html'>"statoids.com"</a>.**

In [None]:
link = 'http://www.statoids.com/ynp.html'

try:
    html = requests.get(link, timeout=30)
except requests.exceptions.RequestException:
    print(f'Cannot establish connection to "{link}"')
    # Stop here: carrying on would parse whatever `html` an earlier cell left behind.
    raise

# Turn the response into a BeautifulSoup object to crawl through the page
soup2 = BeautifulSoup(html.content, 'html.parser')

district_statoids = soup2.body.find("table", {"class": "st"}).find_all('tr')

# Row 0 is skipped and reading stops before row 76, i.e. rows 1..75 are used
# (presumably header + 75 district rows - verify against the page).
dta = [[cell.text for cell in row.find_all('td')] for row in district_statoids[1:76]]

# Create a dataframe from the rows and name its columns
statoids_df = pd.DataFrame(dta)
statoids_df.columns = ['district', 'HASC', 'Reg', 'population_2011', 'population_2001', 'population_1991', 'population_1981', 'area_km_squared', 'capital', 'region', 'zone']

# .copy() gives an independent frame, so the assignments below do not trigger
# SettingWithCopyWarning by writing into a slice of statoids_df.
statoids_df_filtered = statoids_df[['district', 'area_km_squared', 'region', 'zone']].copy()
statoids_df_filtered['district'] = statoids_df_filtered['district'].str.strip()

# Expand the one-letter region codes; any other code becomes NaN
region_names = {'H': 'Hill', 'M': 'Mountain', 'T': 'Terai'}
statoids_df_filtered['region'] = statoids_df_filtered['region'].map(region_names)

# Drop thousands separators from the area figures
statoids_df_filtered['area_km_squared'] = statoids_df_filtered['area_km_squared'].str.replace(',', '', regex=False)

# Sort the dataframe in alphabetical order
statoids_df_filtered = statoids_df_filtered.sort_values(by='district')

# Remove 'Nawalparasi' and 'Rukum' as these districts are divided into 2 districts each (east and west)
to_remove_index = statoids_df_filtered[statoids_df_filtered['district'].isin(['Nawalparasi','Rukum'])].index
statoids_df_filtered = statoids_df_filtered.drop(to_remove_index)


In [None]:
# Align statoids spellings with the census spellings through an explicit mapping.
# Renaming by position in the filtered index would mislabel rows whenever one of
# the three names is absent from the scraped table.
statoids_to_census_names = {
    'Chitwan': 'Chitawan',
    'Dang Deokhuri': 'Dang',
    'Kapilvastu': 'Kapilbastu',
}
statoids_df_filtered['district'] = statoids_df_filtered['district'].replace(statoids_to_census_names)

statoids_df_filtered

**Now I need to add rows for following districts:**

**1. 'Nawalparasi East'**

**2. 'Nawalparasi West'**

**3. 'Rukum East'**

**4. 'Rukum West'**

In [None]:
# Hand-entered rows for the four districts created by splitting Nawalparasi and
# Rukum: [district, area_km_squared, region, zone]
rows = [
    ['Nawalparasi East', '1126', 'Terai', 'Lumbini'],
    ['Nawalparasi West', '1162', 'Terai', 'Lumbini'],
    ['Rukum East', '1168', 'Hill', 'Rapti'],
    ['Rukum West', '1152', 'Hill', 'Rapti']
]
# create a new DataFrame with the new rows
new_df = pd.DataFrame(rows, columns=['district', 'area_km_squared', 'region', 'zone'])

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
statoids_df_filtered = pd.concat([statoids_df_filtered, new_df], ignore_index=True)

statoids_df_final = statoids_df_filtered.sort_values(by=['district'], ascending=True)

statoids_df_final

In [None]:
statoids_df_final.isnull().sum()

**Now I'll left join "district_df" and "statoids_df_final"!**

In [None]:
# Left join on 'district': keep every row of district_df and bring in only region and zone
region_zone_columns = statoids_df_final[['district', 'region', 'zone']]
district_data_df = district_df.merge(region_zone_columns, how='left', on='district')

In [None]:
# Renumber the rows 0..n-1 (drop=True discards the previous index instead of keeping it as a column)
district_data_df = district_data_df.reset_index(drop=True)

In [None]:
# Convert the 'area_km_squared' column to float
district_data_df['area_km_squared'] = district_data_df['area_km_squared'].astype(float)

In [None]:
district_data_df

In [None]:
census_2021_df

# Database Schema & Design

**The database schema design was created using <a href='https://drawsql.app'>drawsql.app</a>. A SQLite database was created following schema design given below:**

In [None]:
# Render the schema diagram inline.
# NOTE(review): absolute Kaggle input path - the image only resolves where that dataset is attached.
from IPython import display
display.Image("/kaggle/input/nepal-database-schema/Nepal_DB_Schema.PNG")

**Currently there are 2 dataframes (tables):**

**1. 2021 Census Population Data**

**2. District Data**

In [None]:
# Lookup table: a 1-based id for every district, in district_data_df row order
districtsData = {
    'district_id': list(range(1, len(district_data_df['district']) + 1)),
    'district': district_data_df['district'],
}
districts = pd.DataFrame(districtsData)
districts

In [None]:
# Replace the textual district column with its numeric key
district_data_df = district_data_df.drop('district', axis='columns')

# district_id becomes the first column
district_data_df.insert(loc=0, column='district_id', value=districts['district_id'])

district_data_df

In [None]:
# district name -> 1-based district_id, mirroring the ids in the `districts` table
temp_dict = {name: position for position, name in enumerate(districts['district'], start=1)}

# Swap each district name for its id. A name missing from the mapping raises
# KeyError, which is also what happens if this cell is run a second time.
census_2021_df['district'] = census_2021_df['district'].apply(lambda name: temp_dict[name])

# Rename the column to reflect its new content (plain assignment instead of inplace=True)
census_2021_df = census_2021_df.rename(columns={'district': 'district_id'})

census_2021_df

**I have transformed the dataframes according to schema. Now I'll create a sqlite database and store the three tables in it.**

In [None]:
# Persist the three DataFrames as tables of the SQLite file 'nepal.db'.
# sqlite3 is already imported in the imports cell.
conn = sqlite3.connect('nepal.db')

# try/finally makes sure the connection is closed even when one of the writes fails
try:
    # 1) districts lookup table
    districts.to_sql(
        'districts',
        conn,
        if_exists='replace',
        index=False,
        dtype={
            'district_id': 'tinyint NOT NULL',
            'district': 'varchar(20)'
        }
    )

    # 2) per-district attributes
    district_data_df.to_sql(
        'district_info',
        conn,
        if_exists='replace',
        index=False,
        dtype={
            'district_id': 'tinyint NOT NULL',
            'latitude': 'decimal',
            'longitude': 'decimal',
            'province': 'varchar(25) NOT NULL',
            'district_headquarters': 'varchar(20)',
            'area_km_squared': 'decimal',
            'region': 'varchar(10) NOT NULL',
            'zone': 'varchar(15) NOT NULL',
        }
    )

    # 3) 2021 census figures per local government
    census_2021_df.to_sql(
        'census_population_2021',
        conn,
        if_exists='replace',
        index=False,
        dtype={
            'district_id': 'tinyint NOT NULL',
            'local_government_name': 'varchar(50)',
            'local_government_type': 'varchar(40)',
            'total_family_number': 'int',
            'total_household_number': 'int',
            'total_population_number': 'int',
            'total_male_number': 'int',
            'total_female_number': 'int',
        }
    )
finally:
    # Close the connection
    conn.close()

**I have successfully created the database according to the schema! I think it would be best if I also saved the dataframes as CSV files.**

In [None]:
# Write each of the three frames to its own CSV file, without the index column

csv_exports = {
    'districts.csv': districts,
    'district_info.csv': district_data_df,
    'census_population_2021.csv': census_2021_df,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)