In [3]:
import pandas as pd

# Read the raw Eurostat GDP extract.
# The file is parsed as comma-separated; if it was saved as .tsv instead,
# point raw_path at that file and switch the separator to '\t'.
raw_path = "../data/eurostat_gdp_raw.csv"
df_raw = pd.read_csv(
    raw_path,
    sep=',',
)

# Report the table size and the header names exactly as they were parsed
print("Shape:", df_raw.shape)
print("Columns:", list(df_raw.columns))

# First rows, rendered as the cell output
df_raw.head()


Shape: (3, 54)
Columns: ['freq', 'unit', 'na_item', 'geo\\TIME_PERIOD', '1975 ', '1976 ', '1977 ', '1978 ', '1979 ', '1980 ', '1981 ', '1982 ', '1983 ', '1984 ', '1985 ', '1986 ', '1987 ', '1988 ', '1989 ', '1990 ', '1991 ', '1992 ', '1993 ', '1994 ', '1995 ', '1996 ', '1997 ', '1998 ', '1999 ', '2000 ', '2001 ', '2002 ', '2003 ', '2004 ', '2005 ', '2006 ', '2007 ', '2008 ', '2009 ', '2010 ', '2011 ', '2012 ', '2013 ', '2014 ', '2015 ', '2016 ', '2017 ', '2018 ', '2019 ', '2020 ', '2021 ', '2022 ', '2023 ', '2024 ']


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,CP_MEUR,B1GQ,DE,:,:,:,:,:,:,...,3085650.0,3196110.0,3331110.0,3431130.0,3534880.0,3449620.0,3676460.0 p,3953850.0 p,4185550.0 p,4305260.0 p
1,A,CP_MEUR,B1GQ,EA19,:,:,:,:,:,:,...,10615103.7,10906041.2,11316197.6,11690853.7,12082038.6,11564471.2,12555194.4,13655302.9,14526703.1,15071879.9
2,A,CP_MEUR,B1GQ,FR,288049.5,331012.5,355596.4,394484.3,443872.1,500064.0,...,2201401.6,2231819.2,2291680.5,2355362.8,2432206.8,2318276.2,2508102.3,2653997.2,2826541.5 p,2919899.9 p


In [4]:
import pandas as pd

# Normalise the column headers: remove leading/trailing whitespace
# (the year headers were parsed with a trailing space, e.g. '1975 ').
df_raw.columns = [header.strip() for header in df_raw.columns]
df_raw.head()


Unnamed: 0,freq,unit,na_item,geo\TIME_PERIOD,1975,1976,1977,1978,1979,1980,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,CP_MEUR,B1GQ,DE,:,:,:,:,:,:,...,3085650.0,3196110.0,3331110.0,3431130.0,3534880.0,3449620.0,3676460.0 p,3953850.0 p,4185550.0 p,4305260.0 p
1,A,CP_MEUR,B1GQ,EA19,:,:,:,:,:,:,...,10615103.7,10906041.2,11316197.6,11690853.7,12082038.6,11564471.2,12555194.4,13655302.9,14526703.1,15071879.9
2,A,CP_MEUR,B1GQ,FR,288049.5,331012.5,355596.4,394484.3,443872.1,500064.0,...,2201401.6,2231819.2,2291680.5,2355362.8,2432206.8,2318276.2,2508102.3,2653997.2,2826541.5 p,2919899.9 p


In [5]:
# Reshape from wide (one column per year) to long (one row per geo/year pair).
# Year columns are the headers made up only of digits, e.g. '1975', '1976', ...
year_cols = [name for name in df_raw.columns if name.isnumeric()]

# Only the geo code is kept as an identifier; every other non-year column is
# dropped by the melt. The awkward 'geo\TIME_PERIOD' header is shortened to
# 'geo' in the same chain.
df_melted = (
    df_raw
    .melt(
        id_vars=['geo\\TIME_PERIOD'],
        value_vars=year_cols,
        var_name='year',
        value_name='value',
    )
    .rename(columns={'geo\\TIME_PERIOD': 'geo'})
)
df_melted.head()


Unnamed: 0,geo,year,value
0,DE,1975,:
1,EA19,1975,:
2,FR,1975,288049.5
3,DE,1976,:
4,EA19,1976,:


In [7]:
# Convert the year labels (e.g. '1975') to datetimes (1 January of that year)
df_melted['year'] = pd.to_datetime(df_melted['year'], format='%Y')

# Clean value column. The raw cells come in three shapes:
#   ':'            -> not available
#   '288049.5'     -> plain number
#   '3676460.0 p'  -> number followed by an observation flag
# pd.to_numeric(..., errors='coerce') cannot parse the flagged form, so feeding
# it the raw text would turn real observations into NaN and the dropna below
# would silently discard them. Extract only the leading number instead; ':'
# (and any cell with no number) has no match and becomes NaN.
# astype(str) also keeps the cell re-runnable once the column is already float.
df_melted['value'] = pd.to_numeric(
    df_melted['value']
    .astype(str)
    .str.extract(r'^\s*([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)', expand=False),
    errors='coerce',
)

# Drop rows that genuinely have no value
df_clean = df_melted.dropna(subset=['value'])

df_clean.head()


Unnamed: 0,geo,year,value
2,FR,1975-01-01,288049.5
5,FR,1976-01-01,331012.5
8,FR,1977-01-01,355596.4
11,FR,1978-01-01,394484.3
14,FR,1979-01-01,443872.1


In [9]:
# Persist the cleaned long-format table; the row index is left out of the file.
df_clean.to_csv(
    path_or_buf="../data/cleaned_eurostat_gdp.csv",
    index=False,
)
print("✅ Saved cleaned GDP data to data/cleaned_eurostat_gdp.csv")


✅ Saved cleaned GDP data to data/cleaned_eurostat_gdp.csv
