# Data Source

[WFP Food Prices Kenya Dataset](https://data.humdata.org/dataset/e0d3fba6-f9a2-45d7-b949-140c455197ff/resource/517ee1bf-2437-4f8c-aa1b-cb9925b9d437/download/wfp_food_prices_ken.csv)


# -------------------------------------------------------------------------------------------

In [12]:
!pip install pyspark



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions  import col,to_date,regexp_replace

spark = SparkSession.builder.appName('FoodPricePrediction').master('local[*]').getOrCreate()

spark.sparkContext.appName

'FoodPricePrediction'

In [34]:
food_prices = spark.read.csv("drive/MyDrive/Project/data/wfp_food_prices_ken.csv",inferSchema=True,header=True)
food_prices.printSchema()

root
 |-- date: string (nullable = true)
 |-- admin1: string (nullable = true)
 |-- admin2: string (nullable = true)
 |-- market: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- category: string (nullable = true)
 |-- commodity: string (nullable = true)
 |-- unit: string (nullable = true)
 |-- priceflag: string (nullable = true)
 |-- pricetype: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- price: string (nullable = true)
 |-- usdprice: string (nullable = true)



In [27]:
food_prices.count()

12865

In [31]:
food_prices.show(5)

+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|      date|    admin1|    admin2|          market|latitude|longitude|          category|    commodity|      unit|       priceflag|       pricetype| currency| price|  usdprice|
+----------+----------+----------+----------------+--------+---------+------------------+-------------+----------+----------------+----------------+---------+------+----------+
|     #date|#adm1+name|#adm2+name|#loc+market+name|#geo+lat| #geo+lon|        #item+type|   #item+name|#item+unit|#item+price+flag|#item+price+type|#currency|#value|#value+usd|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|        Maize|        KG|          actual|       Wholesale|      KES| 16.13|    0.2235|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|Maize (white)|     90 KG| 

## Filter rows where any column is NULL or an empty string

In [24]:
missing_rows = food_prices.filter(
    (col("date").isNull()) | (col("date") == "") |
    (col("admin1").isNull()) | (col("admin1") == "") |
    (col("admin2").isNull()) | (col("admin2") == "") |
    (col("price").isNull()) | (col("price") == "") |
    (col("usdprice").isNull()) | (col("usdprice") == "")
)

missing_rows.show(truncate=False)

+----------+------+------+-----------------+--------+---------+------------------+-------------+----+---------+---------+--------+-----+--------+
|date      |admin1|admin2|market           |latitude|longitude|category          |commodity    |unit|priceflag|pricetype|currency|price|usdprice|
+----------+------+------+-----------------+--------+---------+------------------+-------------+----+---------+---------+--------+-----+--------+
|2015-05-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |38.0 |0.3967  |
|2015-06-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |36.0 |0.3708  |
|2015-07-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES     |42.0 |0.4126  |
|2015-08-15|NULL  |NULL  |Hola (Tana River)|NULL    |NULL     |cereals and tubers|Maize (white)|KG  |actual   |Retail   |KES

In [25]:
#Drop NuLL values

food_data = food_prices.dropna()

In [26]:
food_data.show(2)

+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+----------------+---------+------+----------+
|      date|    admin1|    admin2|          market|latitude|longitude|          category| commodity|      unit|       priceflag|       pricetype| currency| price|  usdprice|
+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+----------------+---------+------+----------+
|     #date|#adm1+name|#adm2+name|#loc+market+name|#geo+lat| #geo+lon|        #item+type|#item+name|#item+unit|#item+price+flag|#item+price+type|#currency|#value|#value+usd|
|2006-01-15|     Coast|   Mombasa|         Mombasa|   -4.05|39.666667|cereals and tubers|     Maize|        KG|          actual|       Wholesale|      KES| 16.13|    0.2235|
+----------+----------+----------+----------------+--------+---------+------------------+----------+----------+----------------+--

In [28]:
food_data.count()

12825

# GIt Version Control Setup

In [51]:
!ls

drive  sample_data


In [57]:
%cd /content/drive/MyDrive/Project
!pwd

/content/drive/MyDrive/Project
/content/drive/MyDrive/Project


In [58]:
!ls

data  env  external_factors_data  Food_Data_Cleaning.ipynb  README.md


In [59]:
!git init

[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/drive/MyDrive/Project/.git/


In [60]:
!git add .

In [61]:
!git branch -M main

In [None]:
!git remote add origin https://brc0d3s:<YO@github.com/brc0d3s/Distributed-Food-Price-Prediction-for-Kenyan-Markets.git