In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName("foodModel").getOrCreate()

# Load Data

In [6]:
prices = '/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/clean_data/food_prices.csv'
climate = '/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/clean_data/climate_data.csv'
production = '/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/clean_data/food_production.csv'


food_prices = spark.read.csv(prices, header=True, inferSchema=True)
climate_data = spark.read.csv(climate, header=True, inferSchema=True)
production_data = spark.read.csv(production, header=True, inferSchema=True)

In [7]:
food_prices.show(3)

+----------+------+-------+-----------+-----+---------+------------------+-------------+----------------+-----------+---------------+
|      date|region| county|market_name|  lat|      lon|     food_category|    food_item|measurement_unit|local_price|normalized_unit|
+----------+------+-------+-----------+-----+---------+------------------+-------------+----------------+-----------+---------------+
|2006-01-15| Coast|Mombasa|    Mombasa|-4.05|39.666667|cereals and tubers|        Maize|              KG|      16.13|            1.0|
|2006-01-15| Coast|Mombasa|    Mombasa|-4.05|39.666667|cereals and tubers|Maize (white)|           90 KG|     1480.0|           90.0|
|2006-01-15| Coast|Mombasa|    Mombasa|-4.05|39.666667|   pulses and nuts|        Beans|              KG|      33.63|            1.0|
+----------+------+-------+-----------+-----+---------+------------------+-------------+----------------+-----------+---------------+
only showing top 3 rows



In [8]:
climate_data.show(3)

+----+-----+-----------+-------------+
|year|month|rainfall_mm|temperature_c|
+----+-----+-----------+-------------+
|1991|  Jan|    38.2847|      25.1631|
|1991|  Feb|    12.7492|      26.0839|
|1991|  Mar|    73.3656|      26.2236|
+----+-----+-----------+-------------+
only showing top 3 rows



In [9]:
production_data.show(3)

+------+--------------------+----------------+----------------+----+
|county|                crop|production_value|measurement_unit|year|
+------+--------------------+----------------+----------------+----+
|  Embu|Pearl Millet (Pro...|          6195.5|          Tonnes|2023|
| Kitui|Pearl Millet (Pro...|        5110.755|          Tonnes|2017|
|  Meru|Pearl Millet (Pro...|         7672.12|          Tonnes|2023|
+------+--------------------+----------------+----------------+----+
only showing top 3 rows



# Data Preprocessing

In [21]:
food_prices.count()

12824

In [22]:
climate_data.count()

312

In [23]:
production_data.count()

500

In [24]:
# Check distinct years and months for food_prices
food_prices.select("year", "month").distinct().show()

# Check distinct years and months for climate_data
climate_data.select("year", "month").distinct().show()

# Check distinct counties and years for production_data
production_data.select("county", "year").distinct().show()


+----+-----+
|year|month|
+----+-----+
|2022|   10|
|2007|    6|
|2012|   10|
|2010|    7|
|2010|   12|
|2024|    7|
|2015|    2|
|2019|   10|
|2017|    3|
|2008|    8|
|2017|    8|
|2009|   11|
|2014|    4|
|2020|    6|
|2019|    5|
|2017|   10|
|2018|   10|
|2021|    8|
|2015|   12|
|2016|    7|
+----+-----+
only showing top 20 rows

+----+-----+
|year|month|
+----+-----+
|2011|  Feb|
|2013|  Nov|
|2014|  Jul|
|1991|  Jun|
|2003|  Nov|
|2008|  Jun|
|2010|  Jun|
|2011|  Jun|
|2007|  May|
|2008|  Sep|
|2014|  Dec|
|1997|  Jun|
|2012|  Jun|
|2012|  Aug|
|1992|  Dec|
|1993|  Jun|
|1994|  Oct|
|1995|  Mar|
|1996|  Mar|
|2003|  Dec|
+----+-----+
only showing top 20 rows

+-------+----+
| county|year|
+-------+----+
|Samburu|2023|
|Garrisa|2018|
|  Busia|2017|
|Kajiado|2012|
|Kajiado|2013|
|  Busia|2014|
|   Embu|2016|
|Homabay|2018|
|Kajiado|2017|
|Kajiado|2021|
| Vihiga|2023|
|Garrisa|2019|
| Isiolo|2022|
|Homabay|2022|
|   Meru|2023|
| Isiolo|2013|
|Garrisa|2013|
|   Embu|2017|
|Garrisa|

In [10]:
from pyspark.sql.functions import col, year, month
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [11]:
# Extract year and month from date
food_prices = food_prices.withColumn("year", year(col("date"))).withColumn("month", month(col("date")))

In [12]:
# Join datasets on common fields (county, year)
combined_data = food_prices.join(climate_data, ["year", "month"], "inner")
combined_data = combined_data.join(production_data, ["county", "year"], "inner")

In [20]:
combined_data.show(4)

+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+
|county|year|month|date|region|market_name|lat|lon|food_category|food_item|measurement_unit|local_price|normalized_unit|rainfall_mm|temperature_c|crop|production_value|measurement_unit|region_index|food_category_index|food_item_index|
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+



In [13]:
# Handle categorical data
indexers = [
    StringIndexer(inputCol=col, outputCol=col + "_index")
    for col in ["region", "food_category", "food_item"]
]

In [14]:
for indexer in indexers:
    combined_data = indexer.fit(combined_data).transform(combined_data)

In [15]:
# Assemble feature vector
feature_cols = ["lat", "lon", "rainfall_mm", "temperature_c", "production_value", "region_index", "food_category_index", "food_item_index"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(combined_data)

# Spit data

In [16]:
(train_data, test_data) = data.randomSplit([0.8, 0.2], seed=42)

In [19]:
train_data.show(3)

+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+--------+
|county|year|month|date|region|market_name|lat|lon|food_category|food_item|measurement_unit|local_price|normalized_unit|rainfall_mm|temperature_c|crop|production_value|measurement_unit|region_index|food_category_index|food_item_index|features|
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+--------+
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+--------+



# Build Model

In [18]:
print(f"Row count: {combined_data.count()}")
combined_data.show(5)

Row count: 0
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+
|county|year|month|date|region|market_name|lat|lon|food_category|food_item|measurement_unit|local_price|normalized_unit|rainfall_mm|temperature_c|crop|production_value|measurement_unit|region_index|food_category_index|food_item_index|
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+
+------+----+-----+----+------+-----------+---+---+-------------+---------+----------------+-----------+---------------+-----------+-------------+----+----------------+----------------+------------+-------------------+---------------+

