In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

import requests

import os
import sys

In [2]:
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


# Create a Spark session
spark = SparkSession.builder.appName("PivotGDPData") \
                    .config("spark.network.timeout", "600s") \
                    .config("spark.executor.heartbeatInterval", "120s") \
                    .config("spark.jars", "C:\Spark\jars\postgresql-42.6.0.jar") \
                    .getOrCreate()

In [3]:
# Base URL for the World Bank API
base_url = "http://api.worldbank.org/v2/country"
indicator = "NY.GDP.MKTP.CD"
params = {"format": "json"}

# Get the list of all country codes
response = requests.get(base_url, params=params)
data = response.json()
country_list = data[1]


In [4]:
# country_list

[{'id': 'ABW',
  'iso2Code': 'AW',
  'name': 'Aruba',
  'region': {'id': 'LCN',
   'iso2code': 'ZJ',
   'value': 'Latin America & Caribbean '},
  'adminregion': {'id': '', 'iso2code': '', 'value': ''},
  'incomeLevel': {'id': 'HIC', 'iso2code': 'XD', 'value': 'High income'},
  'lendingType': {'id': 'LNX', 'iso2code': 'XX', 'value': 'Not classified'},
  'capitalCity': 'Oranjestad',
  'longitude': '-70.0167',
  'latitude': '12.5167'},
 {'id': 'AFE',
  'iso2Code': 'ZH',
  'name': 'Africa Eastern and Southern',
  'region': {'id': 'NA', 'iso2code': 'NA', 'value': 'Aggregates'},
  'adminregion': {'id': '', 'iso2code': '', 'value': ''},
  'incomeLevel': {'id': 'NA', 'iso2code': 'NA', 'value': 'Aggregates'},
  'lendingType': {'id': '', 'iso2code': '', 'value': 'Aggregates'},
  'capitalCity': '',
  'longitude': '',
  'latitude': ''},
 {'id': 'AFG',
  'iso2Code': 'AF',
  'name': 'Afghanistan',
  'region': {'id': 'SAS', 'iso2code': '8S', 'value': 'South Asia'},
  'adminregion': {'id': 'SAS', 'iso

In [5]:
# Create a list to hold the data
gdp_data = []

# Iterate through each country and replace "BRA" with the country code
for country in country_list:
    country_code = country["id"]
    
    # Construct the API URL with the specific country code
    api_url = f"{base_url}/{country_code}/indicator/{indicator}"
   
    
    # Make the API request
    response = requests.get(api_url, params=params)
    data = response.json()
    
    # Extract GDP data if available
    if data[1]:
        entries = data[1]
        country_name = entries[0]["country"]["value"]
        for entry in entries: 
            gdp_value = entry["value"]
            if gdp_value is not None:
                gdp_value = round(entry["value"] / 10**8, 3)
            else: 
                gdp_value = 0.0

            Year = entry["date"]
            
            if(int(Year) > 2011): 
                gdp_data.append((Year, country_name, country_code, float(gdp_value)))
            else: 
                continue

# # Create a DataFrame from the collected data
# columns = ["Year", "CountryName", "CountryCode", "GDPValue in 10^8"]
# df = spark.createDataFrame(gdp_data, columns)

# # Pivot the DataFrame to have separate columns for GDP values for each year
# pivoted_df = df.groupBy("CountryName", "CountryCode") \
#                .pivot("Year") \
#                .agg({"GDPValue in 10^8": "first"})

# # Show the pivoted DataFrame
# pivoted_df.show()

In [6]:
# print("hello")

hello


In [7]:
# Create a DataFrame from the collected data
columns = ["Year", "CountryName", "CountryCode", "GDPValue in 10^8"]
df = spark.createDataFrame(gdp_data, columns)


In [8]:
df.show()

+----+--------------------+-----------+----------------+
|Year|         CountryName|CountryCode|GDPValue in 10^8|
+----+--------------------+-----------+----------------+
|2022|               Aruba|        ABW|             0.0|
|2021|               Aruba|        ABW|           31.26|
|2020|               Aruba|        ABW|            26.1|
|2019|               Aruba|        ABW|          33.958|
|2018|               Aruba|        ABW|          32.762|
|2017|               Aruba|        ABW|          30.922|
|2016|               Aruba|        ABW|          29.838|
|2015|               Aruba|        ABW|          29.631|
|2014|               Aruba|        ABW|          27.911|
|2013|               Aruba|        ABW|          27.279|
|2012|               Aruba|        ABW|          26.151|
|2022|Africa Eastern an...|        AFE|       11694.837|
|2021|Africa Eastern an...|        AFE|       10819.981|
|2020|Africa Eastern an...|        AFE|        9275.933|
|2019|Africa Eastern an...|    

In [9]:
# Pivot the DataFrame to have separate columns for GDP values for each year
pivoted_df = df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GDPValue in 10^8": "first"})

# Show the pivoted DataFrame
pivoted_df.show()

# JDBC URL for Postgres
jdbc_url = "jdbc:postgresql://localhost:59064/worldBank"

# Connection properties
properties = {
    "user": "",
    "password": "",
    "driver": ""
}

# Save DataFrame to Postgres
pivoted_df.write.jdbc(url=jdbc_url, table="gdp_data", mode="overwrite", properties=properties)

+--------------------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|         CountryName|CountryCode|     2012|     2013|     2014|     2015|     2016|     2017|     2018|     2019|     2020|     2021|     2022|
+--------------------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+
|              Bhutan|        BTN|   17.813|   17.562|   19.071|   20.036|    21.59|   24.504|   24.469|   25.357|   23.252|   25.396|      0.0|
|               Aruba|        ABW|   26.151|   27.279|   27.911|   29.631|   29.838|   30.922|   32.762|   33.958|     26.1|    31.26|      0.0|
|     Channel Islands|        CHI|      0.0|      0.0|      0.0|      0.0|   95.112|   95.302|  104.229|  103.817|   98.115|  117.357|      0.0|
|        Bahamas, The|        BHS|  107.205|  105.628|  111.761|  116.729|  117.508|  122.535|  126.536|  130.587|   97.546|  115.

In [10]:
import requests

# Define API endpoint and parameters
base_url = "http://api.worldbank.org/v2/"
indicator_id = "NY.GNP.PCAP.CD"  # GNI per capita indicator
year = "2020"
format_type = "json"

# Construct API URL
api_url = f"{base_url}countries/all/indicators/{indicator_id}?date={year}&format={format_type}"

# Make API request
response = requests.get(api_url)
data = response.json()

# Extract and print GNI data for each country
for entry in data[1]:
    country = entry["country"]["value"]
    gni_value = entry["value"]
    print(f"Country: {country}, GNI per capita ({year}): {gni_value}")


Country: Africa Eastern and Southern, GNI per capita (2020): 1388.53122528792
Country: Africa Western and Central, GNI per capita (2020): 1664.74271787706
Country: Arab World, GNI per capita (2020): 5949.45690542345
Country: Caribbean small states, GNI per capita (2020): 8915.77791531801
Country: Central Europe and the Baltics, GNI per capita (2020): 15842.7748027733
Country: Early-demographic dividend, GNI per capita (2020): 3289.17195843465
Country: East Asia & Pacific, GNI per capita (2020): 11642.0846705479
Country: East Asia & Pacific (excluding high income), GNI per capita (2020): 8333.60131429821
Country: East Asia & Pacific (IDA & IBRD countries), GNI per capita (2020): 8426.17888753136
Country: Euro area, GNI per capita (2020): 38210.5360680062
Country: Europe & Central Asia, GNI per capita (2020): 24027.0732995063
Country: Europe & Central Asia (excluding high income), GNI per capita (2020): 7723.39836908187
Country: Europe & Central Asia (IDA & IBRD countries), GNI per capit

In [11]:
data[1]

[{'indicator': {'id': 'NY.GNP.PCAP.CD',
   'value': 'GNI per capita, Atlas method (current US$)'},
  'country': {'id': 'ZH', 'value': 'Africa Eastern and Southern'},
  'countryiso3code': 'AFE',
  'date': '2020',
  'value': 1388.53122528792,
  'unit': '',
  'obs_status': '',
  'decimal': 0},
 {'indicator': {'id': 'NY.GNP.PCAP.CD',
   'value': 'GNI per capita, Atlas method (current US$)'},
  'country': {'id': 'ZI', 'value': 'Africa Western and Central'},
  'countryiso3code': 'AFW',
  'date': '2020',
  'value': 1664.74271787706,
  'unit': '',
  'obs_status': '',
  'decimal': 0},
 {'indicator': {'id': 'NY.GNP.PCAP.CD',
   'value': 'GNI per capita, Atlas method (current US$)'},
  'country': {'id': '1A', 'value': 'Arab World'},
  'countryiso3code': 'ARB',
  'date': '2020',
  'value': 5949.45690542345,
  'unit': '',
  'obs_status': '',
  'decimal': 0},
 {'indicator': {'id': 'NY.GNP.PCAP.CD',
   'value': 'GNI per capita, Atlas method (current US$)'},
  'country': {'id': 'S3', 'value': 'Caribbe

In [12]:
data

[{'page': 1,
  'pages': 6,
  'per_page': 50,
  'total': 266,
  'sourceid': '2',
  'lastupdated': '2023-07-25'},
 [{'indicator': {'id': 'NY.GNP.PCAP.CD',
    'value': 'GNI per capita, Atlas method (current US$)'},
   'country': {'id': 'ZH', 'value': 'Africa Eastern and Southern'},
   'countryiso3code': 'AFE',
   'date': '2020',
   'value': 1388.53122528792,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {'indicator': {'id': 'NY.GNP.PCAP.CD',
    'value': 'GNI per capita, Atlas method (current US$)'},
   'country': {'id': 'ZI', 'value': 'Africa Western and Central'},
   'countryiso3code': 'AFW',
   'date': '2020',
   'value': 1664.74271787706,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {'indicator': {'id': 'NY.GNP.PCAP.CD',
    'value': 'GNI per capita, Atlas method (current US$)'},
   'country': {'id': '1A', 'value': 'Arab World'},
   'countryiso3code': 'ARB',
   'date': '2020',
   'value': 5949.45690542345,
   'unit': '',
   'obs_status': '',
   'decimal': 0},
  {

In [38]:
base_url = "http://api.worldbank.org/v2/country"
indicator_id = "NY.GNP.PCAP.CD"
gni_data = []

for country in country_list:
    country_code = country["id"]
    
    # Construct the API URL with the specific country code
    api_url = f"{base_url}/{country_code}/indicator/{indicator_id}"
   
    
    # Make the API request
    response = requests.get(api_url, params=params)
    data = response.json()

    if data[1]:
        entries = data[1]
        country_name = entries[0]["country"]["value"]
        for entry in entries: 
            gni_value = entry["value"]
            if gni_value is not None :
                gni_value = entry["value"] 
            else: 
                gni_value = 0.0

            Year = entry["date"]
            
            if(int(Year) > 2011): 
                gni_data.append((Year, country_name, country_code, float(gni_value)))
            else: 
                continue


In [28]:
columns = ["Year", "CountryName", "CountryCode", "GNI"]
df = spark.createDataFrame(gni_data, columns)


In [30]:
df.show()

+----+--------------------+-----------+----------------+
|Year|         CountryName|CountryCode|             GNI|
+----+--------------------+-----------+----------------+
|2022|               Aruba|        ABW|             0.0|
|2021|               Aruba|        ABW|         29460.0|
|2020|               Aruba|        ABW|         24840.0|
|2019|               Aruba|        ABW|         30330.0|
|2018|               Aruba|        ABW|         29310.0|
|2017|               Aruba|        ABW|         27720.0|
|2016|               Aruba|        ABW|         26650.0|
|2015|               Aruba|        ABW|         26180.0|
|2014|               Aruba|        ABW|         25790.0|
|2013|               Aruba|        ABW|         25500.0|
|2012|               Aruba|        ABW|         24440.0|
|2022|Africa Eastern an...|        AFE|1542.26098460035|
|2021|Africa Eastern an...|        AFE|1461.38801895754|
|2020|Africa Eastern an...|        AFE|1388.53122528792|
|2019|Africa Eastern an...|    

In [31]:
pivoted_df = df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GNI": "first"})

# Show the pivoted DataFrame
pivoted_df.show()

+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+
|         CountryName|CountryCode|            2012|            2013|            2014|            2015|            2016|            2017|            2018|           2019|            2020|            2021|            2022|
+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+----------------+
|              Bhutan|        BTN|          2330.0|          2350.0|          2400.0|          2470.0|          2600.0|          2760.0|          2940.0|         3130.0|          2840.0|          3040.0|             0.0|
|               Aruba|        ABW|         24440.0|         25500.0|         25790.0|         26180.0|         26650

In [33]:
dbc_url = "jdbc:postgresql://localhost:59064/worldBank"

# Connection properties
properties = {
    "user": "",
    "password": "",
    "driver": ""
}

# Save DataFrame to Postgres
pivoted_df.write.jdbc(url=jdbc_url, table="gni", mode="overwrite", properties=properties)

In [39]:
# Define the income group thresholds
low_income_threshold = 1045
lower_middle_income_threshold = 4095
upper_middle_income_threshold = 12695

# Create DataFrames for different income groups
low_income_df = df.filter(df["GNI"] <= low_income_threshold)
lower_middle_income_df = df.filter((df["GNI"] > low_income_threshold) & (df["GNI"] <= lower_middle_income_threshold))
upper_middle_income_df = df.filter((df["GNI"] > lower_middle_income_threshold) & (df["GNI"] <= upper_middle_income_threshold))
high_income_df = df.filter(df["GNI"] > upper_middle_income_threshold)

# Show the content of each DataFrame (for illustration purposes)
print("Low Income DataFrame:")
low_income_df.show()

print("Lower Middle Income DataFrame:")
lower_middle_income_df.show()

print("Upper Middle Income DataFrame:")
upper_middle_income_df.show()

print("High Income DataFrame:")
high_income_df.show()

Low Income DataFrame:
+----+-----------+-----------+-----+
|Year|CountryName|CountryCode|  GNI|
+----+-----------+-----------+-----+
|2022|      Aruba|        ABW|  0.0|
|2022|Afghanistan|        AFG|  0.0|
|2021|Afghanistan|        AFG|390.0|
|2020|Afghanistan|        AFG|500.0|
|2019|Afghanistan|        AFG|530.0|
|2018|Afghanistan|        AFG|520.0|
|2017|Afghanistan|        AFG|540.0|
|2016|Afghanistan|        AFG|570.0|
|2015|Afghanistan|        AFG|610.0|
|2014|Afghanistan|        AFG|650.0|
|2013|Afghanistan|        AFG|680.0|
|2012|Afghanistan|        AFG|650.0|
|2022|    Andorra|        AND|  0.0|
|2021|    Andorra|        AND|  0.0|
|2020|    Andorra|        AND|  0.0|
|2018|    Andorra|        AND|  0.0|
|2017|    Andorra|        AND|  0.0|
|2016|    Andorra|        AND|  0.0|
|2015|    Andorra|        AND|  0.0|
|2014|    Andorra|        AND|  0.0|
+----+-----------+-----------+-----+
only showing top 20 rows

Lower Middle Income DataFrame:
+----+--------------------+------

In [57]:
low_income_df = low_income_df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GNI": "first"})

# Show the pivoted DataFrame
low_income_df.show()
# row_condition =col("CountryName") == "Bhutan"
# f = low_income_df.filter(row_condition)
# f.show()

+--------------------+-----------+-----+------+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|         CountryName|CountryCode| 2012|  2013| 2014| 2015| 2016| 2017| 2018| 2019| 2020| 2021| 2022|
+--------------------+-----------+-----+------+-----+-----+-----+-----+-----+-----+-----+-----+-----+
|              Bhutan|        BTN| null|  null| null| null| null| null| null| null| null| null|  0.0|
|               Aruba|        ABW| null|  null| null| null| null| null| null| null| null| null|  0.0|
|     Channel Islands|        CHI|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0|  0.0|  0.0|  0.0|  0.0|
|             Andorra|        AND|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0| null|  0.0|  0.0|  0.0|
|             Burundi|        BDI|240.0| 240.0|250.0|250.0|250.0|250.0|240.0|230.0|220.0|220.0|240.0|
|         Afghanistan|        AFG|650.0| 680.0|650.0|610.0|570.0|540.0|520.0|530.0|500.0|390.0|  0.0|
|      American Samoa|        ASM|  0.0|   0.0|  0.0|  0.0|  0.0|  0.0|  0.0|  0.0

In [58]:
lower_middle_income_df = lower_middle_income_df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GNI": "first"})

# Show the pivoted DataFrame
lower_middle_income_df.show()

+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|         CountryName|CountryCode|            2012|            2013|            2014|            2015|            2016|            2017|            2018|            2019|            2020|            2021|            2022|
+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|              Bhutan|        BTN|          2330.0|          2350.0|          2400.0|          2470.0|          2600.0|          2760.0|          2940.0|          3130.0|          2840.0|          3040.0|            null|
|          Azerbaijan|        AZE|            null|            null|            null|            null|          

In [56]:
selected_column = "CountryName"
selected_column_df = lower_middle_income_df.select(selected_column).distinct()
selected_column_df.show()

+--------------------+
|         CountryName|
+--------------------+
|              Angola|
|Africa Eastern an...|
|Africa Western an...|
|             Armenia|
|               Benin|
|          Azerbaijan|
|          Bangladesh|
|             Bolivia|
|              Bhutan|
+--------------------+



In [59]:
upper_middle_income_df = upper_middle_income_df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GNI": "first"})

# Show the pivoted DataFrame
upper_middle_income_df.show()

+--------------------+-----------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+---------------+----------------+----------------+----------------+
|         CountryName|CountryCode|            2012|            2013|            2014|            2015|           2016|            2017|            2018|           2019|            2020|            2021|            2022|
+--------------------+-----------+----------------+----------------+----------------+----------------+---------------+----------------+----------------+---------------+----------------+----------------+----------------+
|          Azerbaijan|        AZE|          6480.0|          7450.0|          7740.0|          6610.0|         4790.0|          4110.0|            null|         4510.0|          4480.0|          4910.0|          5630.0|
|Bosnia and Herzeg...|        BIH|          4900.0|          5160.0|          5170.0|          5130.0|         5060.0|  

In [60]:
high_income_df = high_income_df.groupBy("CountryName", "CountryCode") \
               .pivot("Year") \
               .agg({"GNI": "first"})

# Show the pivoted DataFrame
high_income_df.show()

+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|         CountryName|CountryCode|            2012|            2013|            2014|            2015|            2016|            2017|            2018|            2019|            2020|            2021|            2022|
+--------------------+-----------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+----------------+
|               Aruba|        ABW|         24440.0|         25500.0|         25790.0|         26180.0|         26650.0|         27720.0|         29310.0|         30330.0|         24840.0|         29460.0|            null|
|        Bahamas, The|        BHS|         27200.0|         26280.0|         26840.0|         27270.0|         2

In [61]:
!pip install django

Collecting django
  Obtaining dependency information for django from https://files.pythonhosted.org/packages/7f/9e/fc6bab255ae10bc57fa2f65646eace3d5405fbb7f5678b90140052d1db0f/Django-4.2.4-py3-none-any.whl.metadata
  Downloading Django-4.2.4-py3-none-any.whl.metadata (4.1 kB)
Collecting asgiref<4,>=3.6.0 (from django)
  Obtaining dependency information for asgiref<4,>=3.6.0 from https://files.pythonhosted.org/packages/9b/80/b9051a4a07ad231558fcd8ffc89232711b4e618c15cb7a392a17384bbeef/asgiref-3.7.2-py3-none-any.whl.metadata
  Downloading asgiref-3.7.2-py3-none-any.whl.metadata (9.2 kB)
Collecting sqlparse>=0.3.1 (from django)
  Downloading sqlparse-0.4.4-py3-none-any.whl (41 kB)
     ---------------------------------------- 0.0/41.2 kB ? eta -:--:--
     ----------------------------- ---------- 30.7/41.2 kB ? eta -:--:--
     -------------------------------------- 41.2/41.2 kB 672.7 kB/s eta 0:00:00
Collecting tzdata (from django)
  Using cached tzdata-2023.3-py2.py3-none-any.whl (341 k

In [2]:
!django-admin startproject economic_dashboard


CommandError: 'c:\Users\sumit sharan\OneDrive\Desktop\ds project\ETL\economic_dashboard' already exists


In [3]:
!cd economic_dashboard
!python manage.py runserver

python: can't open file 'c:\Users\sumit sharan\OneDrive\Desktop\ds project\ETL\manage.py': [Errno 2] No such file or directory


In [72]:
!cd economic_dashboard
!python .\economic_dashboard\manage.py startapp dashboard


CommandError: 'c:\Users\sumit sharan\OneDrive\Desktop\ds project\ETL\dashboard' already exists


In [71]:
!pip install psycopg2


Collecting psycopg2
  Obtaining dependency information for psycopg2 from https://files.pythonhosted.org/packages/e4/0d/b807180308d543de909fe87e5095e86c5bb58a04cdfcde6267f97da60aff/psycopg2-2.9.7-cp39-cp39-win_amd64.whl.metadata
  Downloading psycopg2-2.9.7-cp39-cp39-win_amd64.whl.metadata (4.6 kB)
Downloading psycopg2-2.9.7-cp39-cp39-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ------ --------------------------------- 0.2/1.2 MB 12.3 MB/s eta 0:00:01
   ------------ --------------------------- 0.4/1.2 MB 4.5 MB/s eta 0:00:01
   ----------------- ---------------------- 0.5/1.2 MB 3.9 MB/s eta 0:00:01
   ------------------ --------------------- 0.6/1.2 MB 3.2 MB/s eta 0:00:01
   ------------------------ --------------- 0.7/1.2 MB 3.2 MB/s eta 0:00:01
   ------------------------------ --------- 0.9/1.2 MB 3.3 MB/s eta 0:00:01
   -------------------------------------- - 1.1/1.2 MB 3.5 MB/s eta 0:00:01
   ------------------------------------

In [6]:
import django


ModuleNotFoundError: No module named 'django'