In [5]:
# RESHAPE DATA
# Convert the GDP table from wide format (one column per year) to long format.
import pandas as pd

print("Step 1: Loading the data...")

# Load the wide-format CSV: identifier columns followed by one column per year.
data = pd.read_csv('data/gdp_with_continent_filled.csv')

print(f"Loaded {len(data)} countries")

# Report the year span that is actually present in the file instead of a
# hard-coded range, so the message cannot drift out of sync with the data.
# Year columns are the ones whose header consists only of digits (e.g. '1960').
loaded_years = sorted(int(col) for col in data.columns if col.isdigit())
if loaded_years:
    print(f"Data from year {loaded_years[0]} to {loaded_years[-1]}")
else:
    print("Data contains no year columns")

print("\nFirst 3 rows (showing first 10 columns only):")
print(data.iloc[:3, :10])

Step 1: Loading the data...
Loaded 266 countries
Data from year 1960 to 2024

First 3 rows (showing first 10 columns only):
                  Country Name Country Code     Indicator Name  \
0                        Aruba          ABW  GDP (current US$)   
1  Africa Eastern and Southern          AFE  GDP (current US$)   
2                  Afghanistan          AFG  GDP (current US$)   

   Indicator Code          1960          1961          1962          1963  \
0  NY.GDP.MKTP.CD           NaN           NaN           NaN           NaN   
1  NY.GDP.MKTP.CD  2.420569e+10  2.495889e+10  2.707323e+10  3.176914e+10   
2  NY.GDP.MKTP.CD           NaN           NaN           NaN           NaN   

           1964          1965  
0           NaN           NaN  
1  3.027955e+10  3.380618e+10  
2           NaN           NaN  


In [6]:
# --------------------------------------------
# STEP 2: WIDE -> LONG
# Turn the one-column-per-year table into one row per (country, year) pair.
# --------------------------------------------

print("\nStep 2: Reshaping the data...")

# Identifier columns: repeated unchanged on every output row.
id_columns = ['Country Name', 'Continent']

# Year columns: every header made up purely of digits ('1960', '2020', ...).
year_columns = [name for name in data.columns if name.isdigit()]

print(f"Found {len(year_columns)} year columns")
print(f"From {year_columns[0]} to {year_columns[-1]}")

# Unpivot the year columns.
#   before: Country | 1960 | 1961 | 2020
#   after:  Country | Year | Value
data_long = data.melt(
    id_vars=id_columns,        # carried through as-is
    value_vars=year_columns,   # each of these becomes a block of rows
    var_name='Year',           # holds the former column header
    value_name='Value',        # holds the GDP figure
)

print(" Data reshaped!")
print(f"New shape: {data_long.shape[0]} rows")

# Preview the long layout.
print("\nNew data format (first 10 rows):")
print(data_long.head(10))


Step 2: Reshaping the data...
Found 65 year columns
From 1960 to 2024
 Data reshaped!
New shape: 17290 rows

New data format (first 10 rows):
                  Country Name      Continent  Year         Value
0                        Aruba  North America  1960           NaN
1  Africa Eastern and Southern         Africa  1960  2.420569e+10
2                  Afghanistan           Asia  1960           NaN
3   Africa Western and Central         Africa  1960  1.190481e+10
4                       Angola         Africa  1960           NaN
5                      Albania         Europe  1960           NaN
6                      Andorra         Europe  1960           NaN
7                   Arab World           Asia  1960           NaN
8         United Arab Emirates           Asia  1960           NaN
9                    Argentina  South America  1960  1.586547e+10


In [7]:
# ============================================
# STEP 3: CLEAN THE DATA
# Remove empty values and fix data types
# ============================================

print("Step 3: Cleaning the data...")

# Row count before cleaning, used for the summary below.
rows_before = len(data_long)

# Build the cleaned frame in one chain so that no column is assigned in place
# on the frame returned by dropna() (that pattern can raise
# SettingWithCopyWarning on pandas versions without Copy-on-Write) and so the
# cell yields the same result every time it is re-run.
#   1. drop rows whose Value is missing (NaN)
#   2. Year: text such as '2020' -> integer; Value -> float
#   3. Continent -> Region, the column name the rest of the project uses
# Rows whose Value is 0 are intentionally kept; append
# `.query("Value > 0")` to the chain to drop non-positive values.
data_clean = (
    data_long
    .dropna(subset=['Value'])
    .astype({'Year': int, 'Value': float})
    .rename(columns={'Continent': 'Region'})
)

rows_after = len(data_clean)
removed = rows_before - rows_after

print("Cleaning complete!")
print(f"Rows before: {rows_before}")
print(f"Rows after: {rows_after}")
print(f"Removed: {removed} empty rows")

print("\nCleaned data (first 10 rows):")
print(data_clean.head(10))

print("\nData types:")
print(data_clean.dtypes)

Step 3: Cleaning the data...
Cleaning complete!
Rows before: 17290
Rows after: 14561
Removed: 2729 empty rows

Cleaned data (first 10 rows):
                   Country Name         Region  Year         Value
1   Africa Eastern and Southern         Africa  1960  2.420569e+10
3    Africa Western and Central         Africa  1960  1.190481e+10
9                     Argentina  South America  1960  1.586547e+10
13                    Australia        Oceania  1960  1.863568e+10
14                      Austria         Europe  1960  6.624086e+09
16                      Burundi         Africa  1960  1.960000e+08
17                      Belgium         Europe  1960  1.181062e+10
18                        Benin         Africa  1960  2.261956e+08
19                 Burkina Faso         Africa  1960  3.304428e+08
20                   Bangladesh           Asia  1960  4.274894e+09

Data types:
Country Name        str
Region              str
Year              int64
Value           float64
dtype: object

In [8]:
# ============================================
# STEP 4: SAVE THE CLEANED DATA
# This makes it easier to use later
# ============================================

print("Step 4: Saving cleaned data...")

# Write the long-format table; the row index is not written to the file.
data_clean.to_csv('data/gdp_data_clean.csv', index=False)

print("Saved as 'gdp_data_clean.csv'")
print("This file is now ready to use!")

# Summarize the regions present in the cleaned data.
print("\nAvailable Regions:")
regions = data_clean['Region'].unique()

# Count distinct country names per region in a single pass instead of
# re-filtering the whole frame once per region.
#   sort=False   -> regions are listed in order of first appearance,
#                   matching the order of `regions` above
#   dropna=False -> rows with a missing Region form their own group, so they
#                   are counted rather than reported as 0 (an equality
#                   comparison against NaN never matches any row)
countries_per_region = (
    data_clean
    .groupby('Region', sort=False, dropna=False)['Country Name']
    .nunique()
)
for region, countries_in_region in countries_per_region.items():
    print(f"  - {region}: {countries_in_region} countries")

# Summarize the span of years present in the cleaned data.
print("\nAvailable Years:")
years = sorted(data_clean['Year'].unique())
print(f"From {years[0]} to {years[-1]}")

Step 4: Saving cleaned data...
Saved as 'gdp_data_clean.csv'
This file is now ready to use!

Available Regions:
  - Africa: 59 countries
  - South America: 15 countries
  - Oceania: 21 countries
  - Europe: 55 countries
  - Asia: 56 countries
  - North America: 35 countries
  - Global: 21 countries

Available Years:
From 1960 to 2024


In [10]:
# Load the GDP data and the configuration through the project-local
# `data_loader` helpers.
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'data_loader'", i.e. the module was not
# importable from the kernel when the cell last ran — confirm where
# data_loader.py lives relative to the notebook.
from data_loader import load_and_prepare_gdp_data, load_config

# NOTE(review): this rebinds `data`, which an earlier cell assigned to the raw
# wide-format frame returned by pd.read_csv; re-running the reshape cell after
# this one would operate on this object instead — confirm that is intended.
data = load_and_prepare_gdp_data()
config = load_config()

# Inspect what the loaders returned.
# NOTE(review): .head() / .dtypes presume `data` is a pandas DataFrame — confirm
# against data_loader.
print(data.head())
print(data.dtypes)
print(config)


ModuleNotFoundError: No module named 'data_loader'