# Data Source

[WFP Food Prices Kenya Dataset](https://data.humdata.org/dataset/e0d3fba6-f9a2-45d7-b949-140c455197ff/resource/517ee1bf-2437-4f8c-aa1b-cb9925b9d437/download/wfp_food_prices_ken.csv)

# -------------------------------------------------------------------------------------------

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions  import col,to_date,regexp_replace

spark = SparkSession.builder.appName('FoodPricePrediction').master('local[*]').getOrCreate()

spark.sparkContext.appName

'FoodPricePrediction'

In [None]:
food_prices = spark.read.csv("drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets/data/wfp_food_prices_ken.csv",inferSchema=True,header=True)
food_prices.printSchema()

root
 |-- date: string (nullable = true)
 |-- admin1: string (nullable = true)
 |-- admin2: string (nullable = true)
 |-- market: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- category: string (nullable = true)
 |-- commodity: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- priceflag: string (nullable = true)
 |-- pricetype: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: string (nullable = true)
 |-- usdprice: string (nullable = true)



In [None]:
food_prices.count()

12821

In [None]:
food_prices.show(30)

+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|      date|     admin1|     admin2|              market| latitude|longitude|          category|           commodity|      unit|       priceflag|       pricetype| currency|  price|  usdprice|
+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|     #date| #adm1+name| #adm2+name|    #loc+market+name| #geo+lat| #geo+lon|        #item+type|          #item+name|#item+unit|#item+price+flag|#item+price+type|#currency| #value|#value+usd|
|2006-01-15|      Coast|    Mombasa|             Mombasa|    -4.05|39.666667|cereals and tubers|               Maize|        KG|          actual|       Wholesale|      KES|  16.13|    0.2235|
|2006-01-15|      Coast|    Mombasa|    

## Filter rows where any column is NULL or an empty string

In [None]:
missing_rows = food_prices.filter(
    (col("date").isNull()) | (col("date") == "") |
    (col("admin1").isNull()) | (col("admin1") == "") |
    (col("admin2").isNull()) | (col("admin2") == "") |
    (col("price").isNull()) | (col("price") == "") |
    (col("usdprice").isNull()) | (col("usdprice") == "")
)

missing_rows.show(truncate=False)

+----------+------+------+-----------------+--------+---------+------------------+-------------+----+---------+---------+--------+-----+--------+
|date      |admin1|admin2|market           |latitude|longitude|category          |commodity    |unit|priceflag|pricetype|currency|price|usdprice|
+----------+------+------+-----------------+--------+---------+------------------+-------------+----+---------+---------+--------+-----+--------+
|2015-05-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |38.0 |0.3967  |
|2015-06-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |36.0 |0.3708  |
|2015-07-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |42.0 |0.4126  |
|2015-08-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES

In [None]:
#Drop NuLL values

food_data = food_prices.dropna()

In [None]:
food_data.show(50)

+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|      date|     admin1|     admin2|              market| latitude|longitude|          category|           commodity|      unit|       priceflag|       pricetype| currency|  price|  usdprice|
+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|     #date| #adm1+name| #adm2+name|    #loc+market+name| #geo+lat| #geo+lon|        #item+type|          #item+name|#item+unit|#item+price+flag|#item+price+type|#currency| #value|#value+usd|
|2006-01-15|      Coast|    Mombasa|             Mombasa|    -4.05|39.666667|cereals and tubers|               Maize|        KG|          actual|       Wholesale|      KES|  16.13|    0.2235|
|2006-01-15|      Coast|    Mombasa|    

In [None]:
food_data.count()

12772

In [None]:
food_data.show(40)

+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|      date|     admin1|     admin2|              market| latitude|longitude|          category|           commodity|      unit|       priceflag|       pricetype| currency|  price|  usdprice|
+----------+-----------+-----------+--------------------+---------+---------+------------------+--------------------+----------+----------------+----------------+---------+-------+----------+
|     #date| #adm1+name| #adm2+name|    #loc+market+name| #geo+lat| #geo+lon|        #item+type|          #item+name|#item+unit|#item+price+flag|#item+price+type|#currency| #value|#value+usd|
|2006-01-15|      Coast|    Mombasa|             Mombasa|    -4.05|39.666667|cereals and tubers|               Maize|        KG|          actual|       Wholesale|      KES|  16.13|    0.2235|
|2006-01-15|      Coast|    Mombasa|    

# GIt Version Control Setup

In [None]:
!ls

drive  sample_data


In [3]:
%cd /content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets
!pwd

/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets
/content/drive/MyDrive/Distributed-Food-Price-Prediction-for-Kenyan-Markets


In [4]:
!git pull origin main

From https://github.com/brc0d3s/Distributed-Food-Price-Prediction-for-Kenyan-Markets
 * branch            main       -> FETCH_HEAD
Already up to date.


In [5]:
!git add .

In [6]:
!git commit -m "Data Cleaning"

Author identity unknown

*** Please tell me who you are.

Run

  git config --global user.email "you@example.com"
  git config --global user.name "Your Name"

to set your account's default identity.
Omit --global to set the identity only in this repository.

fatal: unable to auto-detect email address (got 'root@7183e658e9b7.(none)')


In [8]:
!git config --global user.email "brc0d3s@gmail.com"

In [None]:
 git config --global user.name "Your Name"

In [7]:
!git push origin dev

Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/brc0d3s/Distributed-Food-Price-Prediction-for-Kenyan-Markets.git
   6e8ea55..cdc847b  dev -> dev


In [None]:
!ls

drive  sample_data


In [None]:
%cd drive/MyDrive/

/content/drive/MyDrive


In [None]:
!ls

'Barth ATS standard Resume (1).pdf'	      housing.csv
'Barth ATS standard Resume.pdf'		      housing.gsheet
'barth cv.docx'				      IMG_20250121_172040_147.jpg
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (1).docx'   Lab1.ipynb
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (2).docx'   Relizane_Data.xlsx
 BATHOLOMEN-OGUTU-NYONGESA-REPORT.docx	     'Scan 01 Dec 21 · 03·59·05.pdf'
'Batholomew Nyongesa cv.docx'		     'Transcript-1046075 (2).pdf'
'Batholomew Nyongesa cv.pdf'		     'Transcript-1046075 (3).pdf'
 CoinbaseWalletBackups			      TrustWalletBackup
'Colab Notebooks'			      Untitled0.ipynb
 DMLLabworks.ipynb			     'Untitled document.gdoc'
'Getting started.pdf'			     'Untitled spreadsheet.gsheet'


In [None]:
!git clone https://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@github.com/Barth123/Distributed-Food-Price-Prediction-for-Kenyan-Markets.git

Cloning into 'Distributed-Food-Price-Prediction-for-Kenyan-Markets'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 27 (delta 4), reused 23 (delta 2), pack-reused 0 (from 0)[K
Receiving objects: 100% (27/27), 339.03 KiB | 4.64 MiB/s, done.
Resolving deltas: 100% (4/4), done.


In [None]:
!ls

'Barth ATS standard Resume (1).pdf'		        housing.csv
'Barth ATS standard Resume.pdf'			        housing.gsheet
'barth cv.docx'					        IMG_20250121_172040_147.jpg
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (1).docx'	        Lab1.ipynb
'BATHOLOMEN-OGUTU-NYONGESA-REPORT (2).docx'	        Relizane_Data.xlsx
 BATHOLOMEN-OGUTU-NYONGESA-REPORT.docx		       'Scan 01 Dec 21 · 03·59·05.pdf'
'Batholomew Nyongesa cv.docx'			        token.txt
'Batholomew Nyongesa cv.pdf'			       'Transcript-1046075 (2).pdf'
 CoinbaseWalletBackups				       'Transcript-1046075 (3).pdf'
'Colab Notebooks'				        TrustWalletBackup
 Distributed-Food-Price-Prediction-for-Kenyan-Markets   Untitled0.ipynb
 DMLLabworks.ipynb				       'Untitled document.gdoc'
'Getting started.pdf'				       'Untitled spreadsheet.gsheet'
