# Food Price Data Source

[WFP Food Prices Kenya Dataset](https://data.humdata.org/dataset/e0d3fba6-f9a2-45d7-b949-140c455197ff/resource/517ee1bf-2437-4f8c-aa1b-cb9925b9d437/download/wfp_food_prices_ken.csv)

# -------------------------------------------------------------------------------------------






In [18]:
!pip install pyspark



In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions  import col,to_date,regexp_replace

spark = SparkSession.builder.appName('FoodPricePrediction').master('local[*]').getOrCreate()

spark.sparkContext.appName

'FoodPricePrediction'

In [21]:
data = spark.read.csv("drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/wfp_food_prices_ken.csv",inferSchema=True,header=True)
data.printSchema()

root
 |-- date: string (nullable = true)
 |-- admin1: string (nullable = true)
 |-- admin2: string (nullable = true)
 |-- market: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- category: string (nullable = true)
 |-- commodity: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- priceflag: string (nullable = true)
 |-- pricetype: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: string (nullable = true)
 |-- usdprice: string (nullable = true)



In [22]:
data.show(5)

+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|      date|    admin1|    admin2|          market|latitude|longitude|          category|    commodity|      unit|       priceflag|       pricetype| currency| price|  usdprice|
+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|     #date|#adm1+name|#adm2+name|#loc+market+name|#geo+lat| #geo+lon|        #item+type|   #item+name|#item+unit|#item+price+flag|#item+price+type|#currency|#value|#value+usd|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|        Maize|        KG|          actual|       Wholesale|      KES| 16.13|    0.2235|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|Maize (white)|     90 KG| 

In [23]:
# Check initial data size
data.count(), len(data.columns)

(12865, 14)

In [24]:
# Drop unnecessary columns
data = data.drop("currency", "usdprice", "pricetype")

In [25]:
# Rename columns for clarity
data = data.withColumnRenamed("admin1", "province").withColumnRenamed("admin2", "district")

In [26]:
from pyspark.sql.functions import year, month

# Extract year and month from date column
data = data.withColumn("year", year(col("date"))).withColumn("month", month(col("date")))

In [27]:
# Drop original date column
data = data.drop("date")

In [28]:
from pyspark.sql.functions import when

# One-hot encoding for 'category' field
categories = data.select("category").distinct().rdd.flatMap(lambda x: x).collect()
for category in categories:
    data = data.withColumn(category, when(col("category") == category, 1).otherwise(0))

In [29]:
# Drop original 'category' field
data = data.drop("category")

In [30]:
# Show transformed data
data.show()

+-------------+----------+----------------+---------+---------+--------------------+----------+----------------+-------+----+-----+--------------+---------------+--------+-------------------+----------+---------------------+------------+------------------+------------------+
|     province|  district|          market| latitude|longitude|           commodity|      unit|       priceflag|  price|year|month|milk and dairy|pulses and nuts|non-food|meat, fish and eggs|#item+type|vegetables and fruits|oil and fats|cereals and tubers|miscellaneous food|
+-------------+----------+----------------+---------+---------+--------------------+----------+----------------+-------+----+-----+--------------+---------------+--------+-------------------+----------+---------------------+------------+------------------+------------------+
|   #adm1+name|#adm2+name|#loc+market+name| #geo+lat| #geo+lon|          #item+name|#item+unit|#item+price+flag| #value|NULL| NULL|             0|              0|       0| 

In [31]:
# Save the cleaned and preprocessed data to CSV
output_path = "drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/cleaned_data.csv"
data.write.csv(output_path, header=True)

# GIt Version Control Setup

# brc0d3s (dev Branch)

In [None]:
%cd /content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets

/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets


In [None]:
!git pull origin main

remote: Enumerating objects: 3, done.[K
remote: Counting objects:  33% (1/3)[Kremote: Counting objects:  66% (2/3)[Kremote: Counting objects: 100% (3/3)[Kremote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects:  33% (1/3)[Kremote: Compressing objects:  66% (2/3)[Kremote: Compressing objects: 100% (3/3)[Kremote: Compressing objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects:  33% (1/3)Unpacking objects:  66% (2/3)Unpacking objects: 100% (3/3)Unpacking objects: 100% (3/3), 2.60 KiB | 45.00 KiB/s, done.
From https://github.com/brc0d3s/Distributed-Food-Price-Prediction-for-Kenyan-Markets
 * branch            main       -> FETCH_HEAD
   cdc847b..900cd52  main       -> origin/main
Updating 3056fb5..900cd52
Fast-forward


In [None]:
!git add .

In [None]:
!git config --global user.email "brc0d3s@gmail.com"
!git config --global user.name "brc0d3s"

In [None]:
!git commit -m "Data Cleaning"

[dev 44b1be2] Data Cleaning
 16 files changed, 12827 insertions(+), 13640 deletions(-)
 rewrite Food_Data_Cleaning.ipynb (94%)
 create mode 100644 Modelling.ipynb
 delete mode 100644 clean_data/climate_data.csv/.part-00000-dfa16bb0-c078-4d82-8353-fe1ecc67d3df-c000.csv.crc
 delete mode 100644 clean_data/climate_data.csv/part-00000-dfa16bb0-c078-4d82-8353-fe1ecc67d3df-c000.csv
 rename clean_data/{climate_data.csv => food_combined_data.csv}/._SUCCESS.crc (100%)
 create mode 100644 clean_data/food_combined_data.csv/.part-00000-6bee3114-8903-45e5-8d6d-e6fc9b73c4ec-c000.csv.crc
 rename clean_data/{climate_data.csv => food_combined_data.csv}/_SUCCESS (100%)
 create mode 100644 clean_data/food_combined_data.csv/part-00000-6bee3114-8903-45e5-8d6d-e6fc9b73c4ec-c000.csv
 delete mode 100644 clean_data/food_prices.csv/._SUCCESS.crc
 delete mode 100644 clean_data/food_prices.csv/.part-00000-934ad9ce-ad52-4d51-b689-edae1cf98961-c000.csv.crc
 delete mode 100644 clean_data/food_prices.csv/_SUCCESS
 del

In [None]:
!git push origin dev

Enumerating objects: 13, done.
Counting objects:   7% (1/13)Counting objects:  15% (2/13)Counting objects:  23% (3/13)Counting objects:  30% (4/13)Counting objects:  38% (5/13)Counting objects:  46% (6/13)Counting objects:  53% (7/13)Counting objects:  61% (8/13)Counting objects:  69% (9/13)Counting objects:  76% (10/13)Counting objects:  84% (11/13)Counting objects:  92% (12/13)Counting objects: 100% (13/13)Counting objects: 100% (13/13), done.
Delta compression using up to 2 threads
Compressing objects: 100% (7/7), done.
Writing objects: 100% (10/10), 123.97 KiB | 1.82 MiB/s, done.
Total 10 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/brc0d3s/Distributed-Food-Price-Prediction-for-Kenyan-Markets.git
   3056fb5..44b1be2  dev -> dev


# barth123 (barth Branch)

In [None]:
%cd /content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets

/content/drive/MyDrive


In [None]:
!git pull

'Barth ATS standard Resume (1).pdf'	      housing.csv
'Barth ATS standard Resume.pdf'		      housing.gsheet
'barth cv.docx'				      IMG_20250121_172040_147.jpg
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (1).docx'   Lab1.ipynb
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (2).docx'   Relizane_Data.xlsx
 BATHOLOMEN-OGUTU-NYONGESA-REPORT.docx	     'Scan 01 Dec 21 · 03·59·05.pdf'
'Batholomew Nyongesa cv.docx'		     'Transcript-1046075 (2).pdf'
'Batholomew Nyongesa cv.pdf'		     'Transcript-1046075 (3).pdf'
 CoinbaseWalletBackups			      TrustWalletBackup
'Colab Notebooks'			      Untitled0.ipynb
 DMLLabworks.ipynb			     'Untitled document.gdoc'
'Getting started.pdf'			     'Untitled spreadsheet.gsheet'


In [None]:
!git add .

In [None]:
!git commit -m "Data Cleaning"

[dev 3056fb5] Data Cleaning
 3 files changed, 1 insertion(+), 1 deletion(-)
 create mode 100644 Abstract/ABSTRACT_GROUP20.docx
 create mode 100644 Abstract/ABSTRACT_GROUP20.pdf


In [None]:
!git push origin dev

Enumerating objects: 8, done.
Counting objects:  12% (1/8)Counting objects:  25% (2/8)Counting objects:  37% (3/8)Counting objects:  50% (4/8)Counting objects:  62% (5/8)Counting objects:  75% (6/8)Counting objects:  87% (7/8)Counting objects: 100% (8/8)Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects:  16% (1/6)Compressing objects:  33% (2/6)Compressing objects:  50% (3/6)Compressing objects:  66% (4/6)Compressing objects:  83% (5/6)Compressing objects: 100% (6/6)Compressing objects: 100% (6/6), done.
Writing objects:  16% (1/6)Writing objects:  33% (2/6)Writing objects:  50% (3/6)Writing objects:  66% (4/6)Writing objects:  83% (5/6)Writing objects: 100% (6/6)Writing objects: 100% (6/6), 114.97 KiB | 3.48 MiB/s, done.
Total 6 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas:   0% (0/2)[Kremote: Resolving deltas:  50% (1/2)[Kremote: Resolving deltas: 100% (2/2)[Kremote: Resolving deltas: 100%