In [1]:
import pandas as pd
import numpy as np

In [20]:
# Location of the preprocessed food-price CSV, fetched over HTTPS from GitHub.
DATA_URL = 'https://github.com/armaf002/Food-price/raw/main/preprocessed-data.csv'
df = pd.read_csv(DATA_URL)
# Preview the first two rows to confirm the file parsed as expected.
df.head(2)

Unnamed: 0,date,State,Local_Government,Market_Name,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,year,month,season,Inflation
0,2002-01-15,Katsina,Jibia,Jibia (CBM),13.08,7.24,cereals and tubers,Maize,1KG,actual,Wholesale,NGN,175.92,1.5525,2002,January,Dry Season,12.876579
1,2002-01-15,Katsina,Jibia,Jibia (CBM),13.08,7.24,cereals and tubers,Millet,1KG,actual,Wholesale,NGN,150.18,1.3254,2002,January,Dry Season,12.876579


# Dropping unnecessary columns

In [22]:
# Parse the 'date' strings into datetime values and remove the market/location
# name, USD price, currency and price-flag columns in one chained step.
df = (
    df
    .assign(date=pd.to_datetime(df['date']))
    .drop(columns=[
        'Market_Name', 'State', 'usdprice',
        'currency', 'priceflag', 'Local_Government',
    ])
)

In [32]:
# Distinct commodity and unit labels, in order of first appearance in the frame.
commodity_lists = df['commodity'].unique().tolist()
unit_lists = df['unit'].unique().tolist()
unit_lists

['1KG', 'Unit', 'L', '30 pcs', '100 Tubers']

# Dictionaries to map the categorical features

In [34]:
# Integer codes for each categorical feature. Codes follow the position of the
# label in the list below (0-based, except months which are numbered from 1).
season_dict = {
    label: code
    for code, label in enumerate(
        ['Dry Season', 'Early Rainy Season', 'Peak Rainy Season', 'Late Rainy Season']
    )
}
unit_dict = {
    label: code
    for code, label in enumerate(['1KG', 'Unit', 'L', '30 pcs', '100 Tubers'])
}
pricetype_dict = {
    label: code
    for code, label in enumerate(['Wholesale', 'Retail'])
}
month_dict = {
    label: number
    for number, label in enumerate(
        [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
        start=1,
    )
}
category_dict = {
    label: code
    for code, label in enumerate(
        [
            'cereals and tubers',
            'meat, fish and eggs',
            'milk and dairy',
            'miscellaneous food',
            'non-food',
            'oil and fats',
            'pulses and nuts',
            'vegetables and fruits',
        ]
    )
}

In [38]:
# Give every commodity name an integer code equal to its position in commodity_lists.
food_item_mapping = dict(zip(commodity_lists, range(len(commodity_lists))))
# Example of using the mapping: translate a list of commodity names into codes.
# The full commodity list is used as the selection here.
selected_food_items = commodity_lists
# Look up the integer code of each selected commodity.
numeric_food_items = [food_item_mapping[name] for name in selected_food_items]

In [39]:
# Replace each text-valued feature with its integer-coded counterpart:
# add the encoded columns first, then remove the original text columns.
df = (
    df
    .assign(
        season_categorical=df['season'].map(season_dict),
        unit_categorical=df['unit'].map(unit_dict),
        pricetype_binary=df['pricetype'].map(pricetype_dict),
        month_categorical=df['month'].map(month_dict),
        categories_categorical=df['category'].map(category_dict),
        commodities_categorical=df['commodity'].map(food_item_mapping),
    )
    .drop(columns=['pricetype', 'season', 'unit', 'month', 'category', 'commodity'])
)

# Cyclic encoding of the month (taken from the **date** column) for use in the regression model

In [45]:
# Position of each row's calendar month (1 to 12) on a 12-step circle, in radians.
month_angle = 2 * np.pi * df['date'].dt.month / 12
# Sine/cosine pair of that angle; no temporary month column is added to the frame.
df = df.assign(
    month_sin=np.sin(month_angle),
    month_cos=np.cos(month_angle),
)

In [49]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Unnamed: 0,date,latitude,longitude,price,year,Inflation,season_categorical,unit_categorical,pricetype_binary,month_categorical,categories_categorical,commodities_categorical,month_sin,month_cos
0,2002-01-15,13.08,7.24,175.92,2002,12.876579,0,0,0,1,0,0,0.5,0.866025
1,2002-01-15,13.08,7.24,150.18,2002,12.876579,0,0,0,1,0,1,0.5,0.866025
2,2002-01-15,13.08,7.24,358.7,2002,12.876579,0,0,0,1,0,2,0.5,0.866025
3,2002-01-15,13.08,7.24,155.61,2002,12.876579,0,0,0,1,0,3,0.5,0.866025
4,2002-01-15,13.08,7.24,196.87,2002,12.876579,0,0,0,1,6,4,0.5,0.866025


# Checking the relationships between the final features
### The final dataset, prepared specifically for the regression model.

In [53]:
# Pairwise correlation (pandas default method, Pearson) between the numeric
# feature columns.
# numeric_only=True explicitly leaves out the datetime 'date' column. Older
# pandas dropped it silently while emitting a FutureWarning; newer pandas no
# longer drops non-numeric columns by default, so relying on the default is
# not reproducible across versions.
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix

  correlation_matrix = df.corr()


Unnamed: 0,latitude,longitude,price,year,Inflation,season_categorical,unit_categorical,pricetype_binary,month_categorical,categories_categorical,commodities_categorical,month_sin,month_cos
latitude,1.0,0.434396,-0.025368,-0.076376,-0.029596,-0.018203,-0.076177,0.155475,-0.017619,0.101385,0.192698,0.026011,-0.005002
longitude,0.434396,1.0,-0.004234,0.154937,0.061844,-0.017034,-0.080602,0.385682,-0.018748,0.203344,0.373686,0.031979,-0.010974
price,-0.025368,-0.004234,1.0,0.077559,0.082633,0.004734,0.406994,-0.094884,0.003952,-0.062828,0.031719,-0.005809,-0.010598
year,-0.076376,0.154937,0.077559,1.0,0.529896,-0.073122,0.026276,0.354021,-0.119672,0.131237,0.237999,0.109288,-0.026813
Inflation,-0.029596,0.061844,0.082633,0.529896,1.0,-0.079019,0.014888,0.144221,-0.108474,0.044267,0.077785,0.115244,-0.028186
season_categorical,-0.018203,-0.017034,0.004734,-0.073122,-0.079019,1.0,0.011336,-0.022601,0.577148,-0.007455,-0.013344,-0.736881,-0.227529
unit_categorical,-0.076177,-0.080602,0.406994,0.026276,0.014888,0.011336,1.0,0.098972,0.010562,0.01931,0.044205,-0.014669,0.001517
pricetype_binary,0.155475,0.385682,-0.094884,0.354021,0.144221,-0.022601,0.098972,1.0,-0.032457,0.274976,0.447667,0.035724,-0.014248
month_categorical,-0.017619,-0.018748,0.003952,-0.119672,-0.108474,0.577148,0.010562,-0.032457,1.0,-0.006334,-0.010174,-0.76843,0.194692
categories_categorical,0.101385,0.203344,-0.062828,0.131237,0.044267,-0.007455,0.01931,0.274976,-0.006334,1.0,0.666431,0.015312,-0.007684
