# Inflation dataset

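Overview of inflation: this notebook loads a global inflation dataset, keeps a fixed list of countries for the years 2007–2023, and reshapes the result into a long Year / Country / Avg.Inflation table.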

## Install packages

In [4]:
pip install pandas numpy matplotlib seaborn jupyterlab openpyxl

You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/usr/local/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Data collection

In [1]:
import pandas as pd
from pathlib import Path

# File locations used throughout the notebook (relative to the working directory):
#   csvPath    - raw input dataset
#   outPath    - cleaned wide-format table (one column per year)
#   preOutPath - preprocessed long table: Year / Country / Avg.Inflation
csvPath, outPath, preOutPath = (
    Path('global_inflation_data.csv'),
    Path('inflation_clean.csv'),
    Path('inflation_pre.csv'),
)
print(f'csvPath={csvPath}, outPath={outPath}, preOutPath={preOutPath}')

csvPath=global_inflation_data.csv, outPath=inflation_clean.csv, preOutPath=inflation_pre.csv


## Data cleaning

In [2]:
# Data Cleaning: keep only requested countries and year columns 2007-2023
# Load the raw dataset from the configured input path.
df = pd.read_csv(csvPath)
# Countries retained by the cleaning step.
countriesToKeep = [
    'Afghanistan',
    'Niger',
    'Iraq',
    'Cameroon',
    'Burkina Faso',
    'Burundi',
    'Central African Republic',
    'Haiti',
    'Lebanon',
    'Mozambique',
    'Myanmar',
    'Guinea-Bissau',
    'Nigeria',
    'Chad',
    'Mali',
    'Liberia',
    'Sudan',
]
def find_col(df, keys):
    """Return the first column of ``df`` whose lower-cased label contains a key.

    Columns are scanned in frame order. A column matches when any entry of
    ``keys`` is a substring of ``str(column).lower()``; only the column label
    is lower-cased, so keys are compared exactly as given.

    Returns the original column label, or ``None`` when no column matches.
    """
    return next(
        (col for col in df.columns if any(key in str(col).lower() for key in keys)),
        None,
    )
# Locate the country-name column by header keyword ('country' or 'location').
countryCol = find_col(df, ['country','location'])
if countryCol is None:
    raise SystemExit('Country column not found')
# Trim surrounding whitespace so names compare exactly against countriesToKeep.
df[countryCol] = df[countryCol].astype(str).str.strip()
df = df[df[countryCol].isin(countriesToKeep)]
# Report requested countries that are absent from the data instead of
# dropping them silently (e.g. when the dataset spells a name differently).
missingCountries = sorted(set(countriesToKeep) - set(df[countryCol]))
if missingCountries:
    print(f'Warning: requested countries not found in the data: {missingCountries}')
# Wide layout: one column per year, headers such as '2007' (whitespace tolerated).
yearCols = [c for c in df.columns if str(c).strip().isdigit() and 2007 <= int(str(c).strip()) <= 2023]
if not yearCols:
    # try pivot from long format if Year & a numeric value column exist
    yearCol = find_col(df, ['year'])
    # The year column is normally numeric itself, so it has to be excluded here;
    # otherwise it would be chosen as the value column and the pivot would
    # contain years instead of inflation figures.
    numeric = [c for c in df.select_dtypes(include=['number']).columns if c != yearCol]
    valCol = numeric[0] if numeric else None
    if yearCol is not None and valCol is not None:
        # Keep only rows whose year value contains a 20xx year within 2007-2023.
        yearText = df[yearCol].astype(str).str.extract(r'(20\d{2})')[0]
        inRange = yearText.isin([str(y) for y in range(2007,2024)])
        df_pivot = df[inRange][[countryCol, yearCol, valCol]]
        df_wide = df_pivot.pivot_table(index=countryCol, columns=yearCol, values=valCol, aggfunc='first').reset_index()
        # ensure string column names for years
        df_wide.columns = [str(c) for c in df_wide.columns]
        df_wide = df_wide.rename(columns={countryCol: 'Country'})
        df_wide.to_csv(outPath, index=False)
        print(f'Saved cleaned (pivoted) to {outPath} (shape={df_wide.shape})')
    else:
        raise SystemExit('No year columns found and cannot pivot to wide format')
else:
    df_clean = df[[countryCol] + yearCols].copy()
    df_clean = df_clean.rename(columns={countryCol: 'Country'})
    # keep years as strings '2007'..'2023'; headers are stripped so that a year
    # column selected above despite stray whitespace is also part of keepYears
    df_clean.columns = [str(c).strip() for c in df_clean.columns]
    keepYears = [y for y in df_clean.columns if y.isdigit() and 2007 <= int(y) <= 2023]
    # drop rows where all years are NaN
    df_clean = df_clean.dropna(subset=keepYears, how='all')
    df_clean.to_csv(outPath, index=False)
    print(f'Saved cleaned wide to {outPath} (shape={df_clean.shape})')

Saved cleaned wide to inflation_clean.csv (shape=(17, 18))


## Data preprocessing

In [3]:
# Preprocessing: reshape the cleaned table into Year/Country/Avg.Inflation rows
# (2007-2023), ordered by Country A->Z and then by Year.
dfw = pd.read_csv(outPath)
yearCols = [
    col for col in dfw.columns
    if str(col).strip().isdigit() and int(str(col).strip()) in range(2007, 2024)
]
if not yearCols:
    # No year-named columns: work on a copy and only normalise the column
    # names that the steps below rely on.
    df_long = dfw.copy()
    if 'Year' not in df_long.columns:
        yearLike = next((col for col in df_long.columns if col.lower() == 'year'), None)
        if yearLike is not None:
            df_long = df_long.rename(columns={yearLike: 'Year'})
    if 'Avg.Inflation' not in df_long.columns:
        # Use the first numeric column that is not the year as the value column.
        valueCandidates = [
            col for col in df_long.select_dtypes(include=['number']).columns
            if str(col).lower() != 'year'
        ]
        if valueCandidates:
            df_long = df_long.rename(columns={valueCandidates[0]: 'Avg.Inflation'})
else:
    # Wide input: unpivot the year columns into (Country, Year, Avg.Inflation) rows
    # and turn the year header text into an integer.
    df_long = pd.melt(
        dfw,
        id_vars=['Country'],
        value_vars=yearCols,
        var_name='Year',
        value_name='Avg.Inflation',
    )
    df_long['Year'] = df_long['Year'].astype(str).str.extract(r'(20\d{2})', expand=False).astype(int)
# Discard rows without a country or year, then restrict to the 2007-2023 window.
df_long = df_long.dropna(subset=['Country', 'Year'])
df_long = df_long.loc[df_long['Year'].between(2007, 2023)]
# Final column order, numeric values (unparseable entries become NaN), sorted output.
df_final = (
    df_long[['Year', 'Country', 'Avg.Inflation']]
    .assign(**{'Avg.Inflation': lambda frame: pd.to_numeric(frame['Avg.Inflation'], errors='coerce')})
    .sort_values(['Country', 'Year'])
    .reset_index(drop=True)
)
df_final.to_csv(preOutPath, index=False)
print(f'Saved preprocessed to {preOutPath} (shape={df_final.shape})')
df_final.head(60)

Saved preprocessed to inflation_pre.csv (shape=(289, 3))


Unnamed: 0,Year,Country,Avg.Inflation
0,2007,Afghanistan,8.68
1,2008,Afghanistan,26.42
2,2009,Afghanistan,-6.81
3,2010,Afghanistan,2.18
4,2011,Afghanistan,11.8
5,2012,Afghanistan,6.44
6,2013,Afghanistan,7.39
7,2014,Afghanistan,4.67
8,2015,Afghanistan,-0.66
9,2016,Afghanistan,4.38
