In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [None]:
# Load the preprocessed food-price dataset directly from the project's GitHub repository
DATA_URL = 'https://github.com/armaf002/Food-price/raw/main/preprocessed-data.csv'
df = pd.read_csv(DATA_URL)

# Peek at five randomly chosen rows (unseeded, so the rows differ between runs)
df.sample(5)

Unnamed: 0,date,State,Local_Government,Market_Name,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,year,month,season,Inflation
62118,2022-09-15,Borno,Maiduguri,Maiduguri,11.8311,13.15097,non-food,Fuel (diesel),L,aggregate,Retail,NGN,800.0,1.87,2022,September,Peak Rainy Season,18.847188
38274,2020-08-15,Yobe,Geidam,Geidam,12.890278,11.921166,oil and fats,Oil (vegetable),1KG,actual,Retail,NGN,533.32,1.0499,2020,August,Peak Rainy Season,13.246023
880,2009-01-15,Jigawa,Maigatari,Mai Gatari (CBM),12.78,9.44,cereals and tubers,Maize,1KG,actual,Wholesale,NGN,144.05,0.9711,2009,January,Dry Season,12.537828
17418,2017-11-15,Yobe,Geidam,Geidam,12.890278,11.921166,cereals and tubers,Millet,1KG,actual,Retail,NGN,21.428571,0.8427,2017,November,Late Rainy Season,16.502266
7342,2016-02-15,Zamfara,Kaura Namoda,Kaura Namoda,12.59519,6.58635,cereals and tubers,Gari (white),1KG,actual,Wholesale,NGN,58.4,29.6492,2016,February,Dry Season,15.696813


In [None]:
# Parse the 'date' strings into timestamps, then store the weekday name of each observation
observation_dates = pd.to_datetime(df['date'])
df['day_of_week'] = observation_dates.dt.day_name()

# Show two random rows to confirm the new column
df.sample(2)

Unnamed: 0,date,State,Local_Government,Market_Name,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,year,month,season,Inflation,day_of_week
51505,2020-06-15,Kebbi,Gwandu,Gwandu,12.49382,4.64227,cereals and tubers,Rice (imported),1KG,aggregate,Wholesale,NGN,435.5652,56.2746,2020,June,Peak Rainy Season,13.246023,Monday
8396,2016-06-15,Kano,Dawakin Tofa,Dawanau,12.09012,8.42912,cereals and tubers,Millet,1KG,actual,Wholesale,NGN,102.4,51.4832,2016,June,Peak Rainy Season,15.696813,Wednesday


In [None]:
# Distinct season labels present in the data
pd.unique(df['season'])

array(['Dry Season', 'Early Rainy Season', 'Peak Rainy Season',
       'Late Rainy Season'], dtype=object)

In [None]:
# Distinct price types present in the data
pd.unique(df['pricetype'])

array(['Wholesale', 'Retail'], dtype=object)

In [None]:
# Distinct price-flag values present in the data
pd.unique(df['priceflag'])

array(['actual', 'actual,aggregate', 'aggregate'], dtype=object)

1. **'actual':**
This value indicates that the price recorded in the DataFrame is an actual or specific price for the given commodity at the specified time and location. In other words, it represents the real, observed price of the commodity.

2. **'actual,aggregate':**
When 'priceflag' is 'actual,aggregate', it means that the reported price is both an actual price and part of an aggregate or summary value. This might occur when prices are reported at both the individual item level and as part of a larger aggregated group. For example, the price for an individual product may be reported alongside the price for a category of products.

3. **'aggregate':**
The 'aggregate' value suggests that the price is part of an aggregated or summary value. It does not represent an actual, specific price for a single commodity but is instead a price calculated or reported for a group or category of items. This is typically a weighted average or summary price for a category of products.

In [None]:
# Distinct units of measurement present in the data
pd.unique(df['unit'])

array(['1KG', 'Unit', 'L', '30 pcs', '100 Tubers'], dtype=object)

In [None]:
# Which commodities are sold by the '30 pcs' unit?
is_thirty_pcs = df['unit'].eq('30 pcs')
filtered_data = df.loc[is_thirty_pcs]

# Distinct commodity names among those rows
unique_commodities = filtered_data.loc[:, 'commodity'].unique()
unique_commodities

array(['Eggs'], dtype=object)

In [None]:
# Relabel the egg unit '30 pcs' as '1 crate'.
# Only rows that are both 'Eggs' and currently measured in '30 pcs' are touched,
# so an egg price recorded in any other unit keeps its original label.
is_egg_thirty_pcs = (df['commodity'] == 'Eggs') & (df['unit'] == '30 pcs')
df.loc[is_egg_thirty_pcs, 'unit'] = '1 crate'

The '30 pcs' unit, which is used only for the 'Eggs' commodity, is relabelled as '1 crate' to standardize the unit name and keep the dataset consistent. Standardization aligns the data with industry norms and practices, which supports accurate analysis and modeling.

In [None]:
# Re-check the distinct units after the relabelling step
df['unit'].unique()

array(['1KG', 'Unit', 'L', '1 crate', '100 Tubers'], dtype=object)

In [None]:
# Filter the DataFrame to select rows with 'unit' as '100 Tubers'
filtered_data = df[df['unit'] == '100 Tubers']

# Extract and display unique 'pricetype' values for these filtered rows
# (note: despite its name, `unique_commodities` holds price types here, not commodities)
unique_commodities = filtered_data['pricetype'].unique()
unique_commodities

array(['Wholesale'], dtype=object)

Since the prices for yams are wholesale prices and their standard unit is '100 Tubers', it is appropriate to leave this unit as is. Wholesale prices are typically quoted for larger quantities because businesses and buyers at the wholesale level deal with larger volumes of goods.

In [None]:
# Distinct commodity categories present in the data
pd.unique(df['category'])

array(['cereals and tubers', 'pulses and nuts', 'non-food',
       'oil and fats', 'meat, fish and eggs', 'milk and dairy',
       'vegetables and fruits', 'miscellaneous food'], dtype=object)

In [None]:
# Keep food commodities only: discard every row whose category is 'non-food'
is_food = df['category'].ne('non-food')
df = df.loc[is_food]

# Confirm that 'non-food' no longer appears among the categories
pd.unique(df['category'])

array(['cereals and tubers', 'pulses and nuts', 'oil and fats',
       'meat, fish and eggs', 'milk and dairy', 'vegetables and fruits',
       'miscellaneous food'], dtype=object)

The 'non-food' category has been removed from the dataset so that the analysis and modeling focus exclusively on the prices of food commodities.

In [None]:
# Columns left out of the food-price prediction table for Nigeria
columns_to_drop = [
    'latitude', 'longitude', 'Market_Name',
    'usdprice', 'currency', 'date',
]
df = df.drop(columns_to_drop, axis='columns')

In [None]:
# Label-encode the nominal categorical columns.
# A separate encoder is fitted for every column and kept in `label_encoders`,
# so each integer code can later be decoded with
# `label_encoders[col].inverse_transform(...)` or re-applied to new data.
# Refitting one shared encoder would retain only the last column's mapping.
categorical_columns = ['State', 'Local_Government', 'category', 'commodity', 'unit', 'priceflag', 'pricetype', 'season']

label_encoders = {}
for col in categorical_columns:
    # After the loop, `label_encoder` is the encoder fitted on the last column ('season').
    label_encoder = LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    label_encoders[col] = label_encoder

# Define the calendar order of months
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

# Ordinal-encode 'month' as 1..12 using a name -> number lookup
month_to_number = {name: number for number, name in enumerate(month_order, start=1)}
month_numbers = df['month'].map(month_to_number)

# Fail loudly on any value that is not a known month name instead of
# silently writing NaN into the column.
unknown_months = df.loc[month_numbers.isna(), 'month'].unique()
if len(unknown_months) > 0:
    raise ValueError(f"Unrecognised month value(s): {list(unknown_months)}")
df['month'] = month_numbers.astype('int64')

# NOTE(review): 'day_of_week' is not in `categorical_columns`, so it is still a
# string column after this cell — confirm it is encoded or dropped before modelling.

# Display the result
df.sample(3)

Unnamed: 0,State,Local_Government,category,commodity,unit,priceflag,pricetype,price,year,month,season,Inflation,day_of_week
28532,13,18,0,26,2,0,1,262.2,2019,3,1,11.396422,Friday
51311,2,23,0,5,2,2,1,209.3333,2020,6,3,13.246023,Monday
7577,12,3,0,15,2,0,1,80.0,2016,3,1,15.696813,Tuesday


**For predicting food prices in Nigeria, where traditional measurement methods without standard scales are common, it is generally better to categorize the units while preserving the original units of measurement. This approach respects local practices and maintains interpretability, considering the inherent differences in how various food items are traditionally bought and sold. It provides a more accurate reflection of the real-world market and ensures that predictions align with local customs and user expectations.**

In [None]:
# List the columns that remain after the drop and encoding steps
df.columns

Index(['State', 'Local_Government', 'category', 'commodity', 'unit',
       'priceflag', 'pricetype', 'price', 'year', 'month', 'season',
       'Inflation', 'day_of_week'],
      dtype='object')

In [None]:
# Summarise dtypes and non-null counts to check which columns are numeric and whether any values are missing
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61466 entries, 0 to 64220
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   State             61466 non-null  int64  
 1   Local_Government  61466 non-null  int64  
 2   category          61466 non-null  int64  
 3   commodity         61466 non-null  int64  
 4   unit              61466 non-null  int64  
 5   priceflag         61466 non-null  int64  
 6   pricetype         61466 non-null  int64  
 7   price             61466 non-null  float64
 8   year              61466 non-null  int64  
 9   month             61466 non-null  int64  
 10  season            61466 non-null  int64  
 11  Inflation         61466 non-null  float64
 12  day_of_week       61466 non-null  object 
dtypes: float64(2), int64(10), object(1)
memory usage: 6.6+ MB



In the dataset, the majority of the features are categorical, and there are two continuous variables, 'price' and 'Inflation'. Due to the risk of data leakage, 'price' cannot be used in feature engineering. Therefore, feature engineering has focused on appropriately encoding the categorical variables and on the necessary preprocessing steps. No new features were created other than those produced by the earlier preprocessing.