# HICP MIDX - Extract Italy and Euro Area Data

This notebook extracts HICP (Harmonised Index of Consumer Prices) data for Italy and Euro Area from the compressed Eurostat file.


In [1]:
import pandas as pd
import os

# Check that the file exists
input_file = 'data/eurostat/prc_hicp_midx_linear.csv.gz'
output_file = 'data/eurostat/hicp_it_eu.csv'

if not os.path.exists(input_file):
    raise FileNotFoundError(f"File not found: {input_file}")

print(f"Input file: {input_file}")
print(f"Output file: {output_file}")


Input file: data/eurostat/prc_hicp_midx_linear.csv.gz
Output file: data/eurostat/hicp_it_eu.csv


In [None]:
# Define the countries to extract
target_countries = [
    'Italy',
    'Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)'
]

# Define the coicop filter
target_coicop = 'All-items HICP'

print(f"Countries to extract: {target_countries}")
print(f"COICOP filter: {target_coicop}")


Countries to extract: ['Italy', 'Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)']


In [None]:
# Read the file efficiently using chunks
# This avoids loading the entire file into memory
chunks = []
chunk_size = 100000  # Read 100k rows at a time

print("Reading file in chunks...")
for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
    # Filter by countries and coicop
    filtered_chunk = chunk[
        (chunk['geo'].isin(target_countries)) & 
        (chunk['coicop'] == target_coicop)
    ]
    if not filtered_chunk.empty:
        chunks.append(filtered_chunk)
    
    if (i + 1) % 10 == 0:
        print(f"Processed {i + 1} chunks...")

print(f"Total chunks processed: {i + 1}")


Reading file in chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 10 chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 20 chunks...
Processed 30 chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):
  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 40 chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 50 chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 60 chunks...


  for i, chunk in enumerate(pd.read_csv(input_file, compression='gzip', chunksize=chunk_size)):


Processed 70 chunks...
Total chunks processed: 77


In [9]:
# Combine all filtered chunks
if chunks:
    df_filtered = pd.concat(chunks, ignore_index=True)
    print(f"Extracted rows: {len(df_filtered)}")
    print(f"\nFirst extracted data:")
    print(df_filtered.head())
    print(f"\nDataset info:")
    print(df_filtered.info())
else:
    print("No data found for the specified countries!")


Extracted rows: 402909

First extracted data:
                   DATAFLOW        LAST UPDATE     freq             unit  \
0  ESTAT:PRC_HICP_MIDX(1.0)  19/11/25 11:00:00  Monthly  Index, 2005=100   
1  ESTAT:PRC_HICP_MIDX(1.0)  19/11/25 11:00:00  Monthly  Index, 2005=100   
2  ESTAT:PRC_HICP_MIDX(1.0)  19/11/25 11:00:00  Monthly  Index, 2005=100   
3  ESTAT:PRC_HICP_MIDX(1.0)  19/11/25 11:00:00  Monthly  Index, 2005=100   
4  ESTAT:PRC_HICP_MIDX(1.0)  19/11/25 11:00:00  Monthly  Index, 2005=100   

           coicop                                                geo  \
0  All-items HICP  Euro area (EA11-1999, EA12-2001, EA13-2007, EA...   
1  All-items HICP  Euro area (EA11-1999, EA12-2001, EA13-2007, EA...   
2  All-items HICP  Euro area (EA11-1999, EA12-2001, EA13-2007, EA...   
3  All-items HICP  Euro area (EA11-1999, EA12-2001, EA13-2007, EA...   
4  All-items HICP  Euro area (EA11-1999, EA12-2001, EA13-2007, EA...   

  TIME_PERIOD  OBS_VALUE OBS_FLAG CONF_STATUS  
0     1996-01   

In [10]:
# Verify the countries present in the extracted data
if chunks:
    print("Countries present in extracted data:")
    print(df_filtered['geo'].value_counts())


Countries present in extracted data:
geo
Italy                                                                                                            206694
Euro area (EA11-1999, EA12-2001, EA13-2007, EA15-2008, EA16-2009, EA17-2011, EA18-2014, EA19-2015, EA20-2023)    196215
Name: count, dtype: int64


In [11]:
# Save the filtered file
if chunks:
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    
    # Save the CSV
    df_filtered.to_csv(output_file, index=False)
    print(f"\nFile saved successfully: {output_file}")
    print(f"Dimensions: {len(df_filtered)} rows, {len(df_filtered.columns)} columns")
else:
    print("No data to save!")



File saved successfully: data/eurostat/hicp_it_eu.csv
Dimensions: 402909 rows, 10 columns
