# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# File names of the raw economic indicator series. The paths are relative,
# so the CSV files are resolved against the current working directory.
cpi = 'CPI.csv'  # Consumer price index
eci = 'ECI.csv'  # Employment cost index
csi = 'CONSUMER-SENTIMENT-INDEX.csv'  # Consumer sentiment index
gpd = 'GPD-PRICE-DEFLATOR.csv'  # Price deflator
ipd = 'IMPORT-PRICE-INDEX.csv'  # Import price index
op = 'OIL-PRICE.csv'  # WPI oil prices
pce = 'PCE.csv'  # Personal consumption expenditures
ppi = 'PPI.csv'  # Producer price index
wg = 'WAGE-GROWTH.csv'  # Wage growth
unrate = 'UNRATE.csv'  # Unemployment rate

# Load each series with its DATE column parsed and used as the index, so the
# frames can later be aligned on date.
df_cpi = pd.read_csv(cpi, index_col='DATE', parse_dates=True)
df_eci = pd.read_csv(eci, index_col='DATE', parse_dates=True)
# The consumer sentiment file is read as-is: no DATE index and no date
# parsing are requested here.
df_csi = pd.read_csv(csi)
# Fixed: the price deflator frame used to be read from the import price index
# file (`ipd`), which left `gpd` unused and mislabelled the data downstream.
df_gpd = pd.read_csv(gpd, index_col='DATE', parse_dates=True)
df_op = pd.read_csv(op, index_col='DATE', parse_dates=True)
df_pce = pd.read_csv(pce, index_col='DATE', parse_dates=True)
df_ppi = pd.read_csv(ppi, index_col='DATE', parse_dates=True)
df_wg = pd.read_csv(wg, index_col='DATE', parse_dates=True)
df_unemply = pd.read_csv(unrate, index_col='DATE', parse_dates=True)

def check_outliers(df, col):
    """Return the rows of *df* whose *col* value is an IQR outlier.

    A value counts as an outlier when it lies more than 1.5 times the
    interquartile range above the third quartile or below the first
    quartile of the column.

    Args:
        df: DataFrame to inspect.
        col: Label of the column to test.

    Returns:
        The subset of *df* (all columns, original index) flagged as outliers.
    """
    values = df[col]
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_high = values > upper_quartile + 1.5 * spread
    too_low = values < lower_quartile - 1.5 * spread
    return df[too_high | too_low]

# Every loaded frame, so the same cleaning and reporting is applied to each.
dfs = [df_eci, df_gpd, df_op, df_pce, df_ppi, df_wg, df_unemply, df_csi, df_cpi]

for frame in dfs:
    # A lone '.' in the raw data is treated as a missing value. The
    # replacement is done on the frame itself: the previous
    # `frame[column].replace(..., inplace=True)` was a chained in-place call
    # on a temporary Series, which leaves the frame untouched under pandas
    # copy-on-write.
    frame.replace('.', np.nan, inplace=True)

    # Report the number of missing values in each column.
    for column in frame.columns:
        print(column.upper())
        print(frame[column].isnull().sum())
        print('*' * 130)
        print()

# Drop rows that contain missing values from df_gpd and df_wg (in place, so
# the objects referenced from `dfs` are the ones modified).
df_gpd.dropna(inplace=True)
df_wg.dropna(inplace=True)
# Remove the column whose header is the table title of the sentiment file.
df_csi.drop('Table 1: The Index of Consumer Sentiment', axis = 1, inplace=True)
# Bare expression kept for interactive inspection; no effect in a script.
df_csi
# Move the index into ordinary columns. The code below expects the resulting
# columns to be named 'level_0' and 'level_1'.
df_csi.reset_index(inplace=True)
# Bare expression kept for interactive inspection; no effect in a script.
df_csi

# Combine the two former index columns into a single datetime column by
# parsing '<level_1>-<level_0>-01'; values that cannot be parsed become NaT
# because of errors='coerce'.
# NOTE(review): presumably level_1 is the year and level_0 the month; verify
# against the CSV.
df_csi['Date'] = pd.to_datetime(df_csi['level_1'] + '-' + df_csi['level_0'] + '-01', errors='coerce')
# Drop rows with any missing value, including rows whose date failed to parse.
df_csi.dropna(inplace=True)
df_csi.drop(['level_0', 'level_1'], axis = 1, inplace=True)
# Positional rename: requires that exactly two columns remain at this point
# (the sentiment value followed by 'Date').
df_csi.columns = ['csi', 'DATE']
df_csi = df_csi[['DATE', 'csi']]
# Passing the Series (rather than the column label) uses the dates as the
# index while also keeping 'DATE' as a regular column; that extra column is
# dropped again after the join below.
df_csi.set_index(df_csi['DATE'], inplace=True)
# Join all series on their index, starting from df_eci. DataFrame.join is a
# left join by default, so the result keeps df_eci's index.
df = df_eci.join(df_gpd).join(df_op).join(df_pce).join(df_ppi).join(df_unemply).join(df_wg).join(df_csi).join(df_cpi)
df.drop('DATE', axis=1, inplace=True)
# Positional rename in join order: requires that the joined frame has exactly
# these nine columns, i.e. one per source frame.
df.columns = ['eci', 'price_deflator', 'oil_price', 'pce', 'ppi', 'unemployment_rate', 'wage_growth', 'csi', 'cpi']

# Quick look at the merged frame and its column types (bare expressions are
# only rendered in an interactive session).
df.head()
df.dtypes

# Cast every column that is still stored with the generic object dtype to
# float64.
object_columns = [label for label in df.columns if df[label].dtype == 'object']
for label in object_columns:
    df[label] = df[label].astype('float64')

# For every column, print the values flagged as IQR outliers and their count.
for column in df.columns:
    print(column.upper())
    # Compute the outlier rows once and reuse them for both printouts instead
    # of calling check_outliers twice per column.
    outliers = check_outliers(df, column)
    print(outliers[column])
    print(len(outliers))
    print('*' * 150)
    print()

# Spot checks of individual columns (bare expressions are only rendered in an
# interactive session).
check_outliers(df, 'unemployment_rate')
check_outliers(df, 'ppi')
check_outliers(df, 'wage_growth')

# Draw one histogram per column of the merged frame, each on its own figure.
for column in df.columns:
    plt.figure(figsize=(16, 8))
    sns.histplot(df[column])
    # `plt.plot()` with no arguments draws nothing and does not render the
    # figure; `plt.show()` displays it when this runs outside a notebook.
    plt.show()