# Food Price Data Source

[WFP Food Prices Kenya Dataset](https://data.humdata.org/dataset/e0d3fba6-f9a2-45d7-b949-140c455197ff/resource/517ee1bf-2437-4f8c-aa1b-cb9925b9d437/download/wfp_food_prices_ken.csv)

# -------------------------------------------------------------------------------------------






# DATA CLEANING

In [2]:
!pip install pyspark



In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('FoodPricePrediction').master('local[*]').getOrCreate()

spark.sparkContext.appName

'FoodPricePrediction'

In [4]:
data = spark.read.csv("drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/wfp_food_prices_ken.csv",inferSchema=True,header=True)
data.printSchema()

root
 |-- date: string (nullable = true)
 |-- admin1: string (nullable = true)
 |-- admin2: string (nullable = true)
 |-- market: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- category: string (nullable = true)
 |-- commodity: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- priceflag: string (nullable = true)
 |-- pricetype: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: string (nullable = true)
 |-- usdprice: string (nullable = true)



In [5]:
data.show(5)

+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|      date|    admin1|    admin2|          market|latitude|longitude|          category|    commodity|      unit|       priceflag|       pricetype| currency| price|  usdprice|
+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|     #date|#adm1+name|#adm2+name|#loc+market+name|#geo+lat| #geo+lon|        #item+type|   #item+name|#item+unit|#item+price+flag|#item+price+type|#currency|#value|#value+usd|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|        Maize|        KG|          actual|       Wholesale|      KES| 16.13|    0.2235|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|Maize (white)|     90 KG| 

In [6]:
# Rename columns for clarity
data1 = data.withColumnRenamed("admin1", "region").withColumnRenamed("admin2", "county")

In [7]:
data1.show(2)

+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+----------------+---------+------+----------+
|      date|    region|    county|          market|latitude|longitude|          category| commodity|      unit|       priceflag|       pricetype| currency| price|  usdprice|
+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+----------------+---------+------+----------+
|     #date|#adm1+name|#adm2+name|#loc+market+name|#geo+lat| #geo+lon|        #item+type|#item+name|#item+unit|#item+price+flag|#item+price+type|#currency|#value|#value+usd|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|     Maize|        KG|          actual|       Wholesale|      KES| 16.13|    0.2235|
+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+--

In [8]:
from pyspark.sql.functions import col, to_date
from pyspark.sql.types import FloatType, DoubleType

# Convert data types — make sure to always use 'data2' in the chain
data2 = data1.withColumn("date", to_date(col("date"), "yyyy-MM-dd")) \
             .withColumn("latitude", col("latitude").cast(DoubleType())) \
             .withColumn("longitude", col("longitude").cast(DoubleType())) \
             .withColumn("price", col("price").cast(FloatType())) \
             .withColumn("usdprice", col("usdprice").cast(FloatType()))

In [9]:
data2.printSchema()

root
 |-- date: date (nullable = true)
 |-- region: string (nullable = true)
 |-- county: string (nullable = true)
 |-- market: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- category: string (nullable = true)
 |-- commodity: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- priceflag: string (nullable = true)
 |-- pricetype: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: float (nullable = true)
 |-- usdprice: float (nullable = true)



In [10]:
# Check initial data size
data2.count(), len(data2.columns)

(12865, 14)

In [11]:
from pyspark.sql.functions import col, sum

# Count nulls in each column
null_counts = data2.select([sum(col(c).isNull().cast("int")).alias(c) for c in data2.columns])
null_counts.show()

+----+------+------+------+--------+---------+--------+---------+----+---------+---------+--------+-----+--------+
|date|region|county|market|latitude|longitude|category|commodity|unit|priceflag|pricetype|currency|price|usdprice|
+----+------+------+------+--------+---------+--------+---------+----+---------+---------+--------+-----+--------+
|   1|    40|    40|     0|      41|       41|       0|        0|   0|        0|        0|       0|    1|       1|
+----+------+------+------+--------+---------+--------+---------+----+---------+---------+--------+-----+--------+



In [12]:
# Drop rows with null values
data_clean = data2.dropna()

In [13]:
# Check data size after droping nulls
data_clean.count(), len(data_clean.columns)

(12824, 14)

In [14]:
# Save the cleaned and preprocessed data to CSV
output_path = "drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/cleaned_data.csv"
data_clean.write.csv(output_path, header=True)

# MODELLING

In [15]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

In [16]:
path = "drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/cleaned_data.csv"
food_data = spark.read.csv(path, inferSchema=True, header=True)

In [17]:
feature_cols = ['region', 'county', 'market', 'category', 'commodity', 'unit']

In [18]:
for col in feature_cols:
    unique_values = food_data.select(col).distinct().rdd.map(lambda row: row[0]).collect()
    print(f"Unique values in '{col}':")
    print(unique_values)
    print('-' * 50)

Unique values in 'region':
['Rift Valley', 'Eastern', 'North Eastern', 'Nyanza', 'Coast', 'Central', 'Nairobi']
--------------------------------------------------
Unique values in 'county':
['Uasin Gishu', 'Nakuru', 'Mandera', 'Kisumu', 'Marsabit', 'Wajir', 'Kajiado', 'Turkana', 'Mombasa', 'Kwale', 'Makueni', 'Meru South', 'Garissa', 'Nairobi', 'Isiolo', 'Kitui', 'Kilifi', 'Baringo', 'West Pokot', 'Nyeri', 'Machakos']
--------------------------------------------------
Unique values in 'market':
['Kaanwa (Tharaka Nithi)', 'Illbissil Food Market (Kajiado)', 'IFO (Daadab)', 'Kibra (Nairobi)', 'Dadaab town', 'Kalahari (Mombasa)', 'Mathare (Nairobi)', 'Kitengela (Kajiado)', 'Kakuma 3', 'Mukuru (Nairobi)', 'Shonda (Mombasa)', 'Nakuru', 'Lodwar town', 'Moroto (Mombasa)', 'Kalobeyei (Village 3)', 'Wote town (Makueni)', 'Dandora (Nairobi)', 'Kakuma 2', 'Kakuma 4', 'Wakulima (Nakuru)', 'Vanga (Kwale)', 'Takaba (Mandera)', 'Wajir town', 'Dagahaley (Daadab)', 'Mandera', 'Kisumu', 'Lomut (West Poko

In [19]:
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid='keep') for col in feature_cols]

In [20]:
indexer_pipeline = Pipeline(stages=indexers)

In [21]:
assembler = VectorAssembler(inputCols=[f"{col}_index" for col in feature_cols] + ['latitude', 'longitude'],
                            outputCol='features', handleInvalid='skip')

In [22]:
models = {
    'RandomForest': RandomForestRegressor(featuresCol='features', labelCol='usdprice', maxBins=100, numTrees=50),
    'GradientBoostedTree': GBTRegressor(featuresCol='features', labelCol='usdprice', maxBins=100),
    'LinearRegression': LinearRegression(featuresCol='features', labelCol='usdprice')
}

In [23]:
train_data, test_data = food_data.randomSplit([0.8, 0.2], seed=42)

In [24]:
best_model = None
best_rmse = float('inf')
best_name = ""

# Evaluate models
for name, model in models.items():
    pipeline = Pipeline(stages=indexers + [assembler, model])

    paramGrid = ParamGridBuilder().build()
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=RegressionEvaluator(labelCol='usdprice', predictionCol='prediction', metricName='rmse'),
                              numFolds=5)

    try:
        cv_model = crossval.fit(train_data)
        predictions = cv_model.transform(test_data)
        rmse = RegressionEvaluator(labelCol='usdprice', predictionCol='prediction', metricName='rmse').evaluate(predictions)

        print(f"{name} RMSE: {rmse}")

        if rmse < best_rmse:
            best_rmse = rmse
            best_model = cv_model.bestModel
            best_name = name

    except Exception as e:
        print(f"Failed to train {name} model: {e}")

print(f"Best model: {best_name} with RMSE: {best_rmse}")

RandomForest RMSE: 8.066859909499252
GradientBoostedTree RMSE: 7.56114182474984
LinearRegression RMSE: 23.49689807137832
Best model: GradientBoostedTree with RMSE: 7.56114182474984


In [25]:
best_gbt_model = cv_model.bestModel


model_path = "drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/models/gbt_price_prediction_model"
best_gbt_model.save(model_path)

print(f"Model saved to {model_path}")

Model saved to drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/models/gbt_price_prediction_model


In [26]:
from pyspark.sql import Row

In [27]:
sample_data = [
    Row(region="Coast", county="Mombasa", market="Mombasa", category="cereals and tubers", commodity="Maize", unit="KG", latitude=-4.05, longitude=39.666667)
]

In [28]:
sample_df = spark.createDataFrame(sample_data)

In [29]:
indexers = [StringIndexer(inputCol=col, outputCol=f"{col}_index", handleInvalid='keep') for col in ['region', 'county', 'market', 'category', 'commodity', 'unit']]

In [30]:
assembler = VectorAssembler(inputCols=[f"{col}_index" for col in ['region', 'county', 'market', 'category', 'commodity', 'unit']] + ['latitude', 'longitude'],
                            outputCol='features', handleInvalid='skip')

In [31]:
pipeline = Pipeline(stages=indexers + [assembler])

In [32]:
pipeline_model = pipeline.fit(sample_df)
transformed_sample_df = pipeline_model.transform(sample_df)

In [33]:
predictions.select('prediction').show(1)

+------------------+
|        prediction|
+------------------+
|23.398327852942614|
+------------------+
only showing top 1 row



# GIt Version Control Setup

# brc0d3s (dev Branch)

In [34]:
%cd /content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets

/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets


In [None]:
!git pull origin main

In [None]:
!git add .

In [None]:
!git config --global user.email "brc0d3s@gmail.com"
!git config --global user.name "brc0d3s"

In [None]:
!git commit -m "model"

In [None]:
!git push origin dev

# barth123 (barth Branch)

In [None]:
%cd /content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets

In [None]:
!git pull

In [None]:
!git add .

In [None]:
!git commit -m "Data Cleaning"

In [None]:
!git push origin dev