In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
# Load the WFP Philippine food-price records from the CSV file that sits next
# to the notebook, then preview the first five rows as the cell output.
data = pd.read_csv(filepath_or_buffer='wfp_food_prices_phl.csv')
data.head(n=5)

Unnamed: 0,date,admin1,admin2,market,latitude,longitude,category,commodity,unit,priceflag,pricetype,currency,price,usdprice,inflation
0,1/15/00,National Capital region,Metropolitan Manila,Metro Manila,14.604167,120.982222,cereals and tubers,Maize flour (yellow),KG,actual,Retail,PHP,15.0,0.3717,3.98
1,1/15/00,National Capital region,Metropolitan Manila,Metro Manila,14.604167,120.982222,cereals and tubers,"Rice (milled, superior)",KG,actual,Retail,PHP,20.0,0.4957,3.98
2,1/15/00,National Capital region,Metropolitan Manila,Metro Manila,14.604167,120.982222,cereals and tubers,"Rice (milled, superior)",KG,actual,Wholesale,PHP,18.35,0.4548,3.98
3,1/15/00,National Capital region,Metropolitan Manila,Metro Manila,14.604167,120.982222,cereals and tubers,"Rice (regular, milled)",KG,actual,Retail,PHP,18.0,0.4461,3.98
4,1/15/00,National Capital region,Metropolitan Manila,Metro Manila,14.604167,120.982222,cereals and tubers,"Rice (regular, milled)",KG,actual,Wholesale,PHP,16.35,0.4052,3.98


In [2]:
# Show the column labels of the loaded frame so the available fields are visible
# before any column is selected or dropped.
data.columns


Index(['date', 'admin1', 'admin2', 'market', 'latitude', 'longitude',
       'category', 'commodity', 'unit', 'priceflag', 'pricetype', 'currency',
       'price', 'usdprice', 'inflation'],
      dtype='object')

### Data preprocessing
Perform data preprocessing and identify columns to be used. You may or may not use all the columns. Prepare the features and target data.
Prepare the train and test data.

In [3]:
# Show the dtype of every column; this reveals which columns were read as text
# (object) and which as numbers.
data.dtypes

date          object
admin1        object
admin2        object
market        object
latitude     float64
longitude    float64
category      object
commodity     object
unit          object
priceflag     object
pricetype     object
currency      object
price        float64
usdprice     float64
inflation    float64
dtype: object

In [4]:
# Convert the 'date' strings to timestamps and derive numeric year/month
# features from them.
# The dates are written as month/day/two-digit-year (e.g. '1/15/00'), so the
# format is stated explicitly: parsing then never depends on pandas' format
# inference, and a malformed date raises instead of being silently misread.
data['date'] = pd.to_datetime(data['date'], format='%m/%d/%y')
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month

In [5]:
# Remove every row that has a missing value in any column, then remove the
# rows whose price is exactly 0.
# `data` is rebound to the filtered result instead of being mutated with
# inplace=True, and the zero-price rows are excluded with a boolean mask rather
# than by index label, so only the matching rows are removed even if index
# labels were ever to repeat. The surviving rows keep their original labels.
data = data.dropna()
data = data[data['price'] != 0]

In [6]:
from sklearn.model_selection import train_test_split

# Discard the columns that are not used as model inputs.
data = data.drop(
    columns=['date', 'admin1', 'admin2', 'market', 'category', 'unit',
             'currency', 'usdprice']
)

# One-hot encode the remaining categorical columns, omitting the first level
# of each one.
data = pd.get_dummies(data, drop_first=True)

# Features are every column except the price; the price is the target.
X = data.drop(columns='price')
y = data['price'].to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=26
)

# Report the shape of each split (one per line) and show the training features.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')
display(X_train)

(109930, 81)
(27483, 81)
(109930,)
(27483,)


Unnamed: 0,latitude,longitude,inflation,year,month,commodity_Bananas (lakatan),commodity_Bananas (latundan),commodity_Bananas (saba),"commodity_Beans (green, fresh)",commodity_Beans (mung),...,commodity_Sugar (white),commodity_Sweet Potato leaves,commodity_Sweet potatoes,commodity_Taro,commodity_Tomatoes,commodity_Water spinach,"priceflag_actual,aggregate",priceflag_aggregate,pricetype_Retail,pricetype_Wholesale
15312,16.016667,120.233333,3.03,2012,11,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
51810,14.604167,120.982222,2.39,2020,9,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
34435,16.486093,121.146518,2.39,2020,5,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
35778,11.706772,122.370090,2.39,2020,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
49991,8.040911,123.799419,2.39,2020,8,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73408,13.146926,123.750464,3.93,2021,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
137475,10.667360,122.946930,5.80,2022,7,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
77256,10.132101,124.834680,3.93,2021,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
59971,13.137222,123.734444,2.39,2020,10,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


### Perform regression on the dataset.
The goal is to fit a regression model to the dataset and achieve the best result you can.

In [7]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression model on the training split. The fit call is left as
# the final expression so the fitted estimator is shown as the cell output.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

### Show results.

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict prices for the held-out test rows.
test_predictions = regressor.predict(X_test)

# Score the predictions against the true test prices: mean absolute error,
# mean squared error, its square root (RMSE), and the R2 coefficient.
MAE = mean_absolute_error(y_test, test_predictions)
MSE = mean_squared_error(y_test, test_predictions)
RMSE = np.sqrt(MSE)
r2 = r2_score(y_test, test_predictions)

# Put actual and predicted prices side by side for a visual comparison.
comparison_df = pd.DataFrame(data={"Actual": y_test, "Predicted": test_predictions})
display(comparison_df)

print(f"MAE = {MAE}\nMSE = {MSE}\nRMSE = {RMSE}\nr2 = {r2}")

Unnamed: 0,Actual,Predicted
0,238.36,236.943115
1,40.00,49.252930
2,37.50,35.770264
3,42.77,23.841309
4,138.00,205.296997
...,...,...
27478,62.00,47.890259
27479,102.75,141.856934
27480,207.50,187.665771
27481,36.74,27.844360


MAE = 24.161776901715374
MSE = 1476.6919726712076
RMSE = 38.42775003394302
r2 = 0.871490381729406
