### Imports

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's built-in 'ggplot' style sheet to every figure drawn below.
plt.style.use('ggplot')

### Quick EDA

In [None]:
# Load the raw CPI table from the CSV next to the notebook; the bare `df`
# expression renders the frame as the cell output.
df = pd.read_csv('./consumer_price_index.csv')
df

In [None]:
# List the column labels of the raw table.
df.columns

In [None]:
# Count the missing values in each column.
df.isna().sum()

### Feature Engineering

In [None]:
# Remove inconsistencies: a whitespace-padded and a misspelled month name.
# The replacement is applied in place to every cell of the frame that matches.
month_fixes = {'November ': 'November', 'Marcrh': 'March'}
df.replace(month_fixes, inplace=True)

In [None]:
# Split the table into one frame per value of the 'Sector' column.
ru_df = df.loc[df['Sector'] == 'Rural+Urban']
u_df = df.loc[df['Sector'] == 'Urban']
r_df = df.loc[df['Sector'] == 'Rural']

In [None]:
# Urban rows with at least one missing value (gap attributed to Covid-19).
u_df.loc[u_df.isnull().any(axis='columns')]

In [None]:
# Rural+Urban rows with at least one missing value (gap attributed to Covid-19).
ru_df.loc[ru_df.isnull().any(axis='columns')]

In [None]:
# Rural rows with at least one missing value.
r_df.loc[r_df.isnull().any(axis='columns')]

In [None]:
# Remove the 'Housing' column from the Rural frame (presumably it is not
# populated for that sector -- confirm against the rows listed above).
# r_df is a filtered slice of df, so the result is rebound rather than dropped
# in place: an in-place drop on a slice is the chained-assignment pattern
# pandas warns about.
r_df = r_df.drop(columns='Housing')

In [None]:
# Discard every row that still has a missing value, then store 'Housing' as
# float in the two frames that are cast here (Rural+Urban and Urban).
ru_df = ru_df.dropna().astype({'Housing': float})
u_df = u_df.dropna().astype({'Housing': float})
r_df = r_df.dropna()

### Plotting General CPI from 2013 - 2023

In [None]:
# Look at the years with known gaps. Neither lookup is the cell's last
# expression, so its result is not kept.
# 2019 April data missing
ru_df.query("Year == 2019")
# 2021 April and May data missing
ru_df.query("Year == 2021")

# Calendar-ordered month names, used as the x axis of the yearly plots.
all_months = (
    'January February March April May June July '
    'August September October November December'
).split()

In [None]:
# Draw one panel per year (2013-2023) of the Rural+Urban 'General index',
# month by month, on a 4x3 grid.
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(18, 16))
panels = axes.ravel()
first_year, last_year = 2013, 2023

for year in range(first_year, last_year + 1):
    ax = panels[year - first_year]
    year_rows = ru_df.query("Year == @year and Sector == 'Rural+Urban'")
    # Align the values on month *name*. Any month without a row becomes NaN at
    # its own calendar position, so gaps appear where they belong and the
    # series always has exactly len(all_months) points -- no per-year special
    # cases keyed on row labels are needed.
    general_cpi = (
        year_rows.groupby('Month')['General index']
        .first()
        .reindex(all_months)
    )
    if general_cpi.notna().any():
        ax.plot(all_months, general_cpi.to_numpy(), marker='o')
        ax.set_title(f"General CPI over {year}")
        ax.set_xlabel('Months')
        ax.set_ylabel('General CPI')
        ax.set_xticks(range(len(all_months)))  # Set the x-axis tick positions
        ax.set_xticklabels(all_months, rotation='vertical', ha='center')  # Set the x-axis tick labels
    else:
        # Nothing to draw for this year.
        ax.axis('off')

# The grid has more panels than years; hide the ones left over.
for ax in panels[last_year - first_year + 1:]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### Dataframes used to build the model : Rural+Urban & Urban

In [None]:
# Stack the Urban frame on top of the Rural+Urban frame to form the modelling
# table, then show its (rows, columns) shape.
model_df = pd.concat((u_df, ru_df), axis=0)
model_df.shape

In [None]:
# Render the combined modelling table.
model_df

### Feature Transformation

#### Year and Month

##### Periodicity: 
Check whether the Year–Month combination exhibits periodic patterns. To capture such cyclical behaviour, two categorical features are derived from each row's month — the season it falls in and the calendar quarter it belongs to — and both are then one-hot encoded into binary indicator columns.

In [None]:
# Parse 'Year' (e.g. 2013) and 'Month' (full English month name) into
# datetime columns.
model_df['Year'] = pd.to_datetime(model_df['Year'], format='%Y')
model_df['Month'] = pd.to_datetime(model_df['Month'], format='%B')

# The lookup tables below are keyed by calendar month number (1-12), so the
# number must be extracted from the datetime first. Mapping the datetime
# values directly matches no key, which leaves Season/Quarter entirely NaN and
# makes get_dummies emit no columns at all.
month_number = model_df['Month'].dt.month

seasons = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
}
model_df['Season'] = month_number.map(seasons)

# Q1 = months 1-3, Q2 = 4-6, Q3 = 7-9, Q4 = 10-12.
quarters = {month: f'Q{(month - 1) // 3 + 1}' for month in range(1, 13)}
model_df['Quarter'] = month_number.map(quarters)

# One-hot encode both features. Float dummies keep them numeric, so the
# numeric-column selection used to build the model inputs includes them.
model_df = pd.get_dummies(model_df, columns=['Season', 'Quarter'], dtype=float)

#### Sector

##### Binary Encoding 
The binary encoding approach converts the "Sector" variable into a single binary column, where "Urban" is represented as 1 and "Rural+Urban" is represented as 0.



In [None]:
# Encode the sector as a single indicator column: Urban -> 1, Rural+Urban -> 0.
sector_codes = {'Rural+Urban': 0, 'Urban': 1}
model_df['Sector_Binary'] = model_df['Sector'].map(sector_codes)


In [None]:
# Store both columns as floats.
model_df = model_df.astype({'Housing': float, 'Sector_Binary': float})

### Plotting CPI Distribution for various commodities

In [None]:
# Plot a kernel-density estimate for every CPI column of model_df.
non_cpi_cols = {'Sector', 'Sector_Binary', 'Year', 'Month'}
# get_dummies names one-hot columns '<source>_<level>'. Those engineered
# Season/Quarter indicators are not CPI series, so they are excluded from the
# count and from the plots as well.
non_cpi_prefixes = ('Season_', 'Quarter_')
cpi_cols = [
    col for col in model_df.columns
    if col not in non_cpi_cols and not col.startswith(non_cpi_prefixes)
]
print(len(cpi_cols))
for i in cpi_cols:
    sns.kdeplot(data=model_df[i])
    plt.xlabel('CPI')
    plt.ylabel('Density')
    plt.title(f'CPI Distribution - {i}')
    plt.show()

### Building ML Model

In [None]:
# The textual 'Sector' column is removed now that 'Sector_Binary' carries the
# same information; the trailing expression renders the resulting table.
model_df = model_df.drop(columns='Sector')
model_df

In [None]:
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
import joblib

# Target: the 'General index' column. Predictors: every other numeric column.
numeric_cols = model_df.select_dtypes(include='number').columns
y = model_df['General index']
X = model_df[numeric_cols].drop(columns='General index')

# 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The scaler is fitted on the training rows only and then applied to both
# partitions.
scaler = RobustScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

#### LightGBM 

In [None]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# LightGBM regressor with default parameters: fit on the scaled training set,
# then score on the scaled test set with mean squared error.
lgb_regressor = lgb.LGBMRegressor().fit(X_train_scaled, y_train)
y_pred = lgb_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error (MSE):", mse)

In [None]:
# Persist the fitted LightGBM model to disk.
# NOTE(review): the model was fitted on RobustScaler-transformed features and
# the scaler is not saved anywhere in the visible source -- confirm that
# inference code can reproduce the same scaling.
joblib.dump(lgb_regressor, '(3)lgb_regressor.pkl')

#### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest with default hyper-parameters. The seed is fixed (same value
# as the train/test split) so the bootstrap samples, and therefore the
# reported MSE, are reproducible from run to run.
rf_regressor = RandomForestRegressor(random_state=42)
rf_regressor.fit(X_train_scaled, y_train)

# Score on the held-out, scaled test set.
y_pred = rf_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

In [None]:
# Persist the fitted random-forest model to disk.
# NOTE(review): the model was fitted on RobustScaler-transformed features and
# the scaler is not saved anywhere in the visible source -- confirm that
# inference code can reproduce the same scaling.
joblib.dump(rf_regressor, '(2)rf_regressor.pkl')

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Gradient boosting with default hyper-parameters, trained on the scaled
# training set produced by the split/scale cell. The seed is fixed (same value
# as the train/test split) so repeated runs give the same model and MSE.
gb_regressor = GradientBoostingRegressor(random_state=42)
gb_regressor.fit(X_train_scaled, y_train)

# Score on the held-out, scaled test set.
y_pred = gb_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)



In [None]:
# Persist the fitted gradient-boosting model to disk.
# NOTE(review): the model was fitted on RobustScaler-transformed features and
# the scaler is not saved anywhere in the visible source -- confirm that
# inference code can reproduce the same scaling.
joblib.dump(gb_regressor, '(1)gb_regressor.pkl')

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Single decision tree with default hyper-parameters, trained on the scaled
# training set produced by the split/scale cell. The seed is fixed (same value
# as the train/test split) so ties between equally good splits are broken the
# same way on every run and the MSE is reproducible.
dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train_scaled, y_train)

# Score on the held-out, scaled test set.
y_pred = dt_regressor.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

In [None]:
# Persist the fitted decision-tree model to disk.
# NOTE(review): the model was fitted on RobustScaler-transformed features and
# the scaler is not saved anywhere in the visible source -- confirm that
# inference code can reproduce the same scaling.
joblib.dump(dt_regressor, '(4)dt_regressor.pkl')

### The numeric prefix of each pickle file is the model's rank by accuracy, as measured by the test-set MSE printed above