# Inflation - Consumer Price Index

In [5]:
import pandas as pd

In [6]:
# Load the raw CPI extract (path is relative to the notebook) and preview it.
cpi_csv_path = '../data/uncleaned/inflation_cpi.csv'
df_cpi = pd.read_csv(cpi_csv_path)
df_cpi.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,AUS,CPI,ENRG,AGRWTH,A,1972,4.91007,
1,AUS,CPI,ENRG,AGRWTH,A,1973,3.762801,
2,AUS,CPI,ENRG,AGRWTH,A,1974,13.17354,
3,AUS,CPI,ENRG,AGRWTH,A,1975,19.42247,
4,AUS,CPI,ENRG,AGRWTH,A,1976,8.833195,


In [7]:
# Show (rows, columns) as the cell's result instead of wrapping it in print(),
# matching how the shape is displayed later in this notebook.
df_cpi.shape

(294281, 8)


## Null Value Analysis

In [8]:
# Print a label, then summarise every column's dtype and non-null count;
# columns whose non-null count is below the row count contain missing values.
print("Analysis of Null values:")
df_cpi.info()

Analysis of Null values:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294281 entries, 0 to 294280
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   LOCATION    294281 non-null  object 
 1   INDICATOR   294281 non-null  object 
 2   SUBJECT     294281 non-null  object 
 3   MEASURE     294281 non-null  object 
 4   FREQUENCY   294281 non-null  object 
 5   TIME        294281 non-null  object 
 6   Value       294281 non-null  float64
 7   Flag Codes  72 non-null      object 
dtypes: float64(1), object(7)
memory usage: 18.0+ MB


### Flag Codes has only 72 non-null values out of 294,281 rows, so we drop it.

In [9]:
# Flag Codes is almost entirely null (see the info() summary), so remove it.
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
df_cpi = df_cpi.drop(columns=['Flag Codes'], errors='ignore')
df_cpi.head()

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value
0,AUS,CPI,ENRG,AGRWTH,A,1972,4.91007
1,AUS,CPI,ENRG,AGRWTH,A,1973,3.762801
2,AUS,CPI,ENRG,AGRWTH,A,1974,13.17354
3,AUS,CPI,ENRG,AGRWTH,A,1975,19.42247
4,AUS,CPI,ENRG,AGRWTH,A,1976,8.833195


## Unique Value Analysis

In [10]:
# Count the distinct values in each column; a count of 1 means the column is
# constant and carries no information.
df_cpi.nunique()

LOCATION         52
INDICATOR         1
SUBJECT           4
MEASURE           2
FREQUENCY         3
TIME           1855
Value        247327
dtype: int64

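### Here we see that INDICATOR consists of only a single value, hence we can drop that column. Note that SUBJECT (4 values) and MEASURE (2 values) are not constant, so they should only be dropped after the data has been filtered to a single subject and measure.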

In [7]:
# Drop INDICATOR and MEASURE. errors="ignore" keeps the cell re-runnable; a
# second execution would otherwise raise KeyError on the already-removed columns.
# NOTE(review): nunique() reports 2 distinct MEASURE values, so dropping MEASURE
# without first filtering to one measure mixes them — confirm which is intended.
# NOTE(review): the saved outputs after this cell show SUBJECT removed and fewer
# rows, which this code does not do — re-run from a fresh kernel to confirm.
df_cpi = df_cpi.drop(columns=["INDICATOR","MEASURE"], errors="ignore")
df_cpi.head()

Unnamed: 0,LOCATION,FREQUENCY,TIME,Value
0,AUS,A,1972,4.91007
1,AUS,A,1973,3.762801
2,AUS,A,1974,13.17354
3,AUS,A,1975,19.42247
4,AUS,A,1976,8.833195


In [8]:
# Dimensions of the frame after the column drops.
# NOTE(review): the saved output reports 294005 rows versus 294281 at load time,
# yet no visible cell removes rows — verify on a fresh Restart & Run All.
df_cpi.shape

(294005, 4)

## Datatype Reset

In [9]:
# Cast both text columns to pandas' dedicated string dtype in a single call.
df_cpi = df_cpi.astype({'LOCATION': 'string', 'TIME': 'string'})

In [10]:
# Confirm that LOCATION and TIME now report the string dtype.
df_cpi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 294005 entries, 0 to 294004
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   LOCATION   294005 non-null  string 
 1   FREQUENCY  294005 non-null  object 
 2   TIME       294005 non-null  string 
 3   Value      294005 non-null  float64
dtypes: float64(1), object(1), string(2)
memory usage: 9.0+ MB


# 

## SUBJECT Analysis

In [11]:
# List the distinct SUBJECT categories present in the data.
df_cpi['SUBJECT'].unique()

array(['ENRG', 'FOOD', 'TOT', 'TOT_FOODENRG'], dtype=object)

## FREQUENCY Analysis

In [11]:
# List the distinct reporting frequencies present in the data.
df_cpi['FREQUENCY'].unique()

array(['A', 'Q', 'M'], dtype=object)

We need to analyse the FREQUENCY column to see whether every country has consistent data at the annual ("A") frequency. We first need to set the range of years over which we will check.

In [12]:
# Derive both bounds from annual ("A") rows only, compared as integers.
# Previously start_year used a string min over every frequency, which only
# worked when the smallest TIME label happened to be a bare year.
# NOTE(review): TIME presumably holds non-year labels for the Q/M rows (1855
# distinct values, object dtype) — confirm the label format.
annual_years = df_cpi.loc[df_cpi["FREQUENCY"] == "A", "TIME"].astype(int)
start_year = int(annual_years.min())
end_year = int(annual_years.max())
print("Start Year",start_year)
print("End Year",end_year)


Start Year 1914
End Year 2022


Now we will set the start year to 2001, since we want roughly 15 years' worth of data.

In [13]:
# Report every (location, year) pair in [start_year, end_year] that has no
# annual ("A") row, and collect the affected locations in first-seen order.
# end_year is taken from the year-range cell above.
# NOTE(review): a year counts as present if ANY row exists for it; if several
# SUBJECT categories remain in df_cpi, gaps in one subject are not detected.
start_year = 2001
countries = []
print("Missing Values:\n")
# The annual filter does not depend on the location, so compute it once.
annual_rows = df_cpi.loc[df_cpi["FREQUENCY"] == "A"]
for location in df_cpi["LOCATION"].unique():
    # Set of year labels available for this location (O(1) membership tests).
    years_present = set(annual_rows.loc[annual_rows["LOCATION"] == location, "TIME"])
    for year in range(start_year, end_year + 1):
        if str(year) not in years_present:
            print("Location:", location, "\tYear:", year)
            if location not in countries:
                countries.append(location)
# Report the range actually scanned rather than a hard-coded "2001 and 2022".
print(f"\nCountries with missing values between {start_year} and {end_year}:", len(countries))

Missing Values:

Location: RUS 	Year: 2022
Location: ARG 	Year: 2001
Location: ARG 	Year: 2002
Location: ARG 	Year: 2003
Location: ARG 	Year: 2004
Location: ARG 	Year: 2005
Location: ARG 	Year: 2006
Location: ARG 	Year: 2007
Location: ARG 	Year: 2008
Location: ARG 	Year: 2009
Location: ARG 	Year: 2010
Location: ARG 	Year: 2011
Location: ARG 	Year: 2012
Location: ARG 	Year: 2013
Location: ARG 	Year: 2014
Location: ARG 	Year: 2015
Location: ARG 	Year: 2016
Location: ARG 	Year: 2017
Location: CRI 	Year: 2022

Countries with missing values between 2001 and 2022: 3


## Hence since the country has all the data from 