## BAM VS. AIRQO

### Stationary monitors

1.     Hourly comparison

2.     Daily  comparison

### Importing relevant packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.dates as mdates
%matplotlib inline
import re
import os
import datetime
from sklearn.linear_model import LinearRegression  
import seaborn as seabornInstance 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from scipy import stats
from scipy.stats import pearsonr
import joblib
import seaborn as sns 
from scipy.optimize import curve_fit
import uncertainties.unumpy as unp
import uncertainties as unc

### Changing default colormap - matplotlib

In [2]:
# to change default colormap
plt.rcParams["image.cmap"] = "Set1"
# to change default color cycle
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=plt.cm.Set1.colors)

### display data frame info

In [3]:
def show_dataframe_info(df):
    """Print a quick structural summary of ``df``.

    Prints, in order: the shape, the list of column names, the
    ``DataFrame.info()`` report and the per-column dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    None
    """
    print(df.shape)
    print(list(df.columns.values))
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be passed to print(): that emitted the report *before* the header
    # and then printed a stray "None".
    print('\n dataframe info: \n')
    df.info()
    print('\n dataframe column datatypes:\n', df.dtypes)

### Reading data

In [4]:
# Load the four raw CSV exports. Paths are relative, so the files must sit in
# the notebook's working directory (otherwise pd.read_csv raises FileNotFoundError).
muk_lowcost_data = pd.read_csv('AQ_88.csv')  # ThingSpeak channel 88: low-cost device co-located with the MUK BAM
us_embassy_lowcost_data = pd.read_csv('AQ_51.csv')  # ThingSpeak channel 51: low-cost device co-located with the US embassy BAM
muk_bam_data = pd.read_csv('MUK-BAM.csv')  # MUK BAM reference monitor
us_embassy_bam_data = pd.read_csv('Kampala_PM2.5_2017_2018_2019_2020.csv')  # US embassy BAM reference monitor
muk_lowcost_data.head(10)

FileNotFoundError: [Errno 2] File AQ_88.csv does not exist: 'AQ_88.csv'

### Renaming columns

In [None]:
# Give the generic ThingSpeak fields descriptive names, then discard the
# bookkeeping / location columns that the comparison does not use.
muk_lowcost_data = (
    muk_lowcost_data
    .rename(columns={
        'field1': 'Sensor1 PM2.5_CF_1_ug/m3',
        'field2': 'Sensor1 PM10_CF_1_ug/m3',
        'field3': 'Sensor2PM2.5_CF_1_ug/m3',
        'field4': 'Sensor2 PM10_CF_1_ug/m3',
        'field5': 'Latitude',
        'field6': 'Longitude',
        'field7': 'Battery Voltage',
        'field8': 'GpsData',
    })
    .drop(columns=['entry_id', 'Latitude', 'Longitude', 'Battery Voltage', 'GpsData',
                   'latitude', 'longitude', 'elevation', 'status'])
)

# Row-wise mean of the two on-board sensors, rounded to 2 decimal places.
muk_lowcost_data['Average_PM2.5_CF_1_ug/m3'] = (
    muk_lowcost_data.loc[:, ['Sensor1 PM2.5_CF_1_ug/m3', 'Sensor2PM2.5_CF_1_ug/m3']]
    .mean(axis=1)
    .round(2)
)
muk_lowcost_data['Average_PM10_CF_1_ug/m3'] = (
    muk_lowcost_data.loc[:, ['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3']]
    .mean(axis=1)
    .round(2)
)

muk_lowcost_data.tail()


In [None]:
# Same clean-up as for the MUK device: descriptive column names, then drop
# the bookkeeping / location columns.
us_embassy_lowcost_data = (
    us_embassy_lowcost_data
    .rename(columns={
        'field1': 'Sensor1 PM2.5_CF_1_ug/m3',
        'field2': 'Sensor1 PM10_CF_1_ug/m3',
        'field3': 'Sensor2PM2.5_CF_1_ug/m3',
        'field4': 'Sensor2 PM10_CF_1_ug/m3',
        'field5': 'Latitude',
        'field6': 'Longitude',
        'field7': 'Battery Voltage',
        'field8': 'GpsData',
    })
    .drop(columns=['entry_id', 'Latitude', 'Longitude', 'Battery Voltage', 'GpsData',
                   'latitude', 'longitude', 'elevation', 'status'])
)

# Row-wise mean of the two on-board sensors, rounded to 2 decimal places.
us_embassy_lowcost_data['Average_PM2.5_CF_1_ug/m3'] = (
    us_embassy_lowcost_data.loc[:, ['Sensor1 PM2.5_CF_1_ug/m3', 'Sensor2PM2.5_CF_1_ug/m3']]
    .mean(axis=1)
    .round(2)
)
us_embassy_lowcost_data['Average_PM10_CF_1_ug/m3'] = (
    us_embassy_lowcost_data.loc[:, ['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3']]
    .mean(axis=1)
    .round(2)
)

us_embassy_lowcost_data.tail()


### Data exploration

In [None]:
show_dataframe_info(muk_lowcost_data)

In [None]:
show_dataframe_info(us_embassy_lowcost_data)

In [None]:
show_dataframe_info(muk_bam_data)

In [None]:
show_dataframe_info(us_embassy_bam_data)

# Makerere low cost device

### Setting timestamp as index 

In [None]:
# Parse created_at into a TimeStamp column, keep the first row for each
# timestamp, index by it and discard the original text column.
muk_lowcost_data = (
    muk_lowcost_data
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame["created_at"]))
    .drop_duplicates(subset="TimeStamp", keep="first")
    .set_index("TimeStamp")
    .drop(columns=["created_at"])
)
muk_lowcost_data.head()

### Filter out outliers: keep values greater than 0 and up to 500.4

In [None]:
# Keep only rows whose averaged PM2.5 reading lies in the interval (0, 500.4].
muk_lowcost_data = muk_lowcost_data.loc[
    lambda frame: (frame['Average_PM2.5_CF_1_ug/m3'] > 0)
    & (frame['Average_PM2.5_CF_1_ug/m3'] <= 500.4)
]

In [None]:
muk_lowcost_data = muk_lowcost_data.loc['2020-07-15 13:00:00':'2020-08-26 09:00:00']

In [None]:
muk_lowcost_data.shape

### Get hourly concentration values

In [None]:
# Aggregate the raw readings into hourly bins: per-bin standard deviation and
# per-bin mean of every column, each rounded to 2 decimal places.
muk_lowcost_hourly_std  = muk_lowcost_data.resample('H').std().round(2)
muk_lowcost_hourly_mean = muk_lowcost_data.resample('H').mean().round(2)

In [None]:
# Line chart of the hourly mean PM2.5 measured by the MUK low-cost device.
fig = plt.figure(figsize=(14,6))
plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
plt.plot(muk_lowcost_hourly_mean.index, 
         muk_lowcost_hourly_mean['Average_PM2.5_CF_1_ug/m3'], linestyle='solid', label='PM2.5 Average Values(lowcst)') #marker='o', 

chart_title =  'Hourly PM2.5 Concentration 15th Jul to 26th Aug 2020 for MUK lowcost'
plt.title(chart_title,fontsize=20)
plt.ylabel('PM2.5', fontsize=20)
plt.xlabel('Date', fontsize=20)
plt.xticks(rotation=0)
plt.legend()
# Lay out and save BEFORE plt.show(): once the figure has been shown,
# plt.tight_layout() no longer targets it, so the original call order never
# tightened the saved image.
plt.tight_layout()
fig.savefig('hourly_Muk_lowcost.png')
plt.show()

### Get daily concentration values

In [None]:
# Aggregate the raw readings into daily bins: per-bin standard deviation and
# per-bin mean of every column, each rounded to 2 decimal places.
muk_lowcost_daily_std    = muk_lowcost_data.resample('D').std().round(2)
muk_lowcost_daily_mean   = muk_lowcost_data.resample('D').mean().round(2)

In [None]:
    # Line chart of the daily mean PM2.5 measured by the MUK low-cost device.
    fig = plt.figure(figsize=(14,6))
    plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
    plt.plot(muk_lowcost_daily_mean.index, 
             muk_lowcost_daily_mean['Average_PM2.5_CF_1_ug/m3'], marker='o', linestyle='solid', label='PM2.5 Average Values')

    # The title previously read "15th July to 26th 2020" (month missing).
    chart_title =  'Daily PM2.5 Concentration 15th July to 26th Aug 2020 for MUK lowcost'
    plt.title(chart_title,fontsize=20)
    plt.ylabel('PM2.5 Concentration', fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.xticks(rotation=0)
    plt.legend()
    # Save before showing so the file is written from the fully laid-out figure;
    # os.path.join() with a single argument was a no-op and has been dropped.
    plt.tight_layout()
    fig.savefig('daily_concentration_muk_lowcost.png')
    plt.show()

## MUK BAM

In [None]:
muk_bam_data.columns

### Drop extra columns

In [None]:
#print(airquality_data_copy.columns)
muk_bam_data = muk_bam_data.drop(['Flow(lpm)', 'WS(m/s)', 'WD(Deg)', 'BP(mmHg)', 'FT(C)', 'FRH(%)', 'Status'], axis=1)

muk_bam_data.head()

### Set timestamp as index

In [None]:
# Parse the Time column into a TimeStamp column, keep the first row for each
# timestamp, index by it and discard the original text column.
muk_bam_data = (
    muk_bam_data
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame["Time"]))
    .drop_duplicates(subset="TimeStamp", keep="first")
    .set_index("TimeStamp")
    .drop(columns=["Time"])
)
muk_bam_data.tail()

### Filter out outliers

In [None]:
# Keep only rows whose hourly BAM concentration lies in the interval (0, 500.4].
muk_bam_data = muk_bam_data.loc[
    lambda frame: (frame['ConcHR(ug/m3)'] > 0) & (frame['ConcHR(ug/m3)'] <= 500.4)
]

### Get data from same date range for both devices - hourly

In [None]:
## get the lower boundary date 
# First and last timestamps of the hourly low-cost series, rendered as
# 'YYYY-MM-DD HH:MM:SS' strings for slicing the BAM frame.
# index.values is used on purpose: it yields plain numpy datetime64 values.
muk_lowcost_hourly_mean_lower_date = pd.Timestamp(
    muk_lowcost_hourly_mean.index.values[0]
).strftime('%Y-%m-%d %H:%M:%S')

print(muk_lowcost_hourly_mean_lower_date)

muk_lowcost_hourly_mean_upper_date = pd.Timestamp(
    muk_lowcost_hourly_mean.index.values[-1]
).strftime('%Y-%m-%d %H:%M:%S')
print(muk_lowcost_hourly_mean_upper_date)

In [None]:
## get the lower boundary date 
# First and last timestamps of the MUK BAM series, rendered as
# 'YYYY-MM-DD HH:MM:SS' strings (printed for inspection).
muk_bam_data_lower_date = pd.Timestamp(
    muk_bam_data.index.values[0]
).strftime('%Y-%m-%d %H:%M:%S')

print(muk_bam_data_lower_date)

muk_bam_data_upper_date = pd.Timestamp(
    muk_bam_data.index.values[-1]
).strftime('%Y-%m-%d %H:%M:%S')
print(muk_bam_data_upper_date)

In [None]:
# Restrict the BAM data to the period covered by the hourly low-cost series
# (label-based slice between the two boundary strings computed earlier).
hourly_same_daterange = muk_bam_data[muk_lowcost_hourly_mean_lower_date:muk_lowcost_hourly_mean_upper_date]
hourly_same_daterange.head()

In [None]:
hourly_same_daterange.shape

### Add time column for merging  BAM and Low cost data - hourly

In [None]:
# Only PM2.5 is compared, so the PM10 columns are removed; the index is then
# mirrored into a regular "Time" column that serves as the merge key.
muk_lowcost_hourly_mean = muk_lowcost_hourly_mean.drop(
    columns=['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3', 'Average_PM10_CF_1_ug/m3']
)
hourly_timestamp = muk_lowcost_hourly_mean.index.values
muk_lowcost_hourly_mean["Time"] = pd.to_datetime(hourly_timestamp)
muk_lowcost_hourly_mean.head()

In [None]:
# hourly_same_daterange is itself a slice of muk_bam_data. Work on an explicit
# copy so that adding the "Time" merge key neither triggers pandas'
# SettingWithCopyWarning nor silently adds a column to the frame that the
# daily resampling cell reads later.
hourly_BAM_data = hourly_same_daterange.copy()
hourly_timestamp = hourly_BAM_data.index.values
hourly_BAM_data["Time"] = pd.to_datetime(hourly_timestamp)

hourly_BAM_data.head()

In [None]:
    # Line chart of the hourly BAM PM2.5 series for the comparison window.
    plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
    fig = plt.figure(figsize=(14,6))
    plt.plot(
        hourly_BAM_data.index,
        hourly_BAM_data['ConcHR(ug/m3)'],
        label='PM2.5 Values(BAM)',
        linestyle='solid',
    )

    chart_title = 'AirQo-BAM PM2.5 Hourly Concentration 15th July to 26th Aug 2020'
    plt.title(chart_title, fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('PM2.5', fontsize=20)
    plt.xticks(rotation=0)
    plt.legend()
    plt.tight_layout()
    plt.show()
    fig.savefig('hourly_pm25_concentration_MUK_BAM')

### Combining BAM and lowcost datasets

In [None]:
# Inner-join the two hourly frames on "Time" and give the two PM2.5 columns
# device-specific names, then persist the merged table.
hourly_combined_dataset = (
    muk_lowcost_hourly_mean
    .merge(hourly_BAM_data, on='Time')
    .rename(columns={
        'Average_PM2.5_CF_1_ug/m3': 'muk_lowcost_hourly_PM',
        'ConcHR(ug/m3)': 'muk_bam_hourly_PM',
    })
)

hourly_combined_dataset.to_csv('hourly_pm25_concentration_MUK_lowcost_vs_MUK_BAM.csv')

hourly_combined_dataset.head()



### Accounting for time difference between the lowcost and BAM

In [None]:
# Move every BAM value up by one row, so each row now holds the BAM reading
# of the following row (the last row becomes NaN).
# NOTE(review): the shift is positional, not time-based - it equals a one-hour
# shift only if the merged frame has no missing hours; confirm. The AT(C) and
# RH(%) columns are not shifted with it.
hourly_combined_dataset['muk_bam_hourly_PM'] = hourly_combined_dataset['muk_bam_hourly_PM'].shift(-1)

### check the difference in values

In [None]:
hourly_combined_dataset['PM25_Difference'] = hourly_combined_dataset['muk_lowcost_hourly_PM'] - hourly_combined_dataset['muk_bam_hourly_PM']

### Visualise BAM vs lowcost data for MUK

In [None]:
    # Overlay the hourly low-cost and BAM PM2.5 series on a single axis.
    plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
    fig = plt.figure(figsize=(14,6))
    for column_name, legend_label in (
        ('muk_lowcost_hourly_PM', 'PM2.5 Average Values(Lowcost)'),
        ('muk_bam_hourly_PM', 'PM2.5 Values(BAM)'),
    ):
        plt.plot(hourly_combined_dataset.Time,
                 hourly_combined_dataset[column_name],
                 linestyle='solid', label=legend_label)

    chart_title = 'Lowcost Vs BAM PM2.5 Measurements for MUK'
    plt.title(chart_title, fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('PM2.5', fontsize=20)
    plt.xticks(rotation=0)
    plt.legend()
    plt.tight_layout()
    plt.show()
    fig.savefig('hourly_pm25_concentration_comparison_MUK.png')

### Daily Comparisons

In [None]:
# Daily standard deviation and mean of the BAM data for the comparison window,
# each rounded to 2 decimal places.
muk_bam_daily_std    = hourly_same_daterange.resample('D').std().round(2)
muk_bam_daily_mean   = hourly_same_daterange.resample('D').mean().round(2)

In [None]:
    # Line chart (with point markers) of the daily mean BAM PM2.5 values.
    plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
    fig = plt.figure(figsize=(14,6))
    plt.plot(
        muk_bam_daily_mean.index,
        muk_bam_daily_mean['ConcHR(ug/m3)'],
        label='PM2.5 Average Values',
        linestyle='solid',
        marker='o',
    )

    chart_title = 'Daily BAM PM2.5 Concentration for MUK'
    plt.title(chart_title, fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('PM2.5 Concentration', fontsize=20)
    plt.xticks(rotation=0)
    plt.legend()
    plt.show()
    fig.savefig('daily_pm25_concentration_BAM_.png')

### Add time column for merging  BAM and Low cost data - daily

In [None]:
# Daily low-cost means without the PM10 columns, plus a "Time" column copied
# from the index to act as the merge key.
daily_lowcost_data = muk_lowcost_daily_mean.drop(
    columns=['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3', 'Average_PM10_CF_1_ug/m3']
)
daily_timestamp = daily_lowcost_data.index.values
daily_lowcost_data["Time"] = pd.to_datetime(daily_timestamp)
daily_lowcost_data.head()

In [None]:
# daily_bam_data is another name for muk_bam_daily_mean (no copy is made);
# the index is mirrored into a "Time" column to act as the merge key.
daily_bam_data = muk_bam_daily_mean
daily_timestamp = daily_bam_data.index.values
daily_bam_data["Time"] = pd.to_datetime(daily_timestamp)
daily_bam_data.head()

In [None]:
# Inner-join the two daily frames on "Time" and give the two PM2.5 columns
# device-specific names, then persist the merged table.
daily_combined_dataset = (
    daily_lowcost_data
    .merge(daily_bam_data, on='Time')
    .rename(columns={
        'Average_PM2.5_CF_1_ug/m3': 'muk_lowcost_daily_PM',
        'ConcHR(ug/m3)': 'muk_bam_daily_PM',
    })
)

daily_combined_dataset.to_csv('Daily_pm25_concentration_airqo_vs_BAM_MUK.csv')

daily_combined_dataset.head()

In [None]:
daily_combined_dataset['PM25_difference'] = daily_combined_dataset['muk_lowcost_daily_PM'] - daily_combined_dataset['muk_bam_daily_PM']
daily_combined_dataset['PM25_difference'].head(10)

In [None]:
daily_combined_dataset['muk_lowcost_daily_PM'].head(10)

In [None]:
daily_combined_dataset['muk_bam_daily_PM'].head(10)

### Visualising daily concentration: MUK BAM vs low-cost

In [None]:
    # Overlay the daily low-cost and BAM PM2.5 series on a single axis.
    plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
    fig = plt.figure(figsize=(14,6))
    for column_name, legend_label in (
        ('muk_lowcost_daily_PM', 'PM2.5 Average Values(Low cost)'),
        ('muk_bam_daily_PM', 'PM2.5 Values(BAM)'),
    ):
        plt.plot(daily_combined_dataset.Time,
                 daily_combined_dataset[column_name],
                 linestyle='solid', label=legend_label)

    chart_title = 'Low cost Vs BAM PM2.5 Daily Measurements for MUK'
    plt.title(chart_title, fontsize=20)
    plt.xlabel('Date', fontsize=20)
    plt.ylabel('PM2.5', fontsize=20)
    plt.xticks(rotation=0)
    plt.legend()
    plt.tight_layout()
    plt.show()
    fig.savefig('daily_pm25_concentration_comparison.png')

# Linear Regression model - Scikit-Learn - Hourly

In [None]:
# take only rows where hourly_PM is not null
# Keep only the hours for which both the low-cost and the BAM value exist.
hourly_combined_dataset = hourly_combined_dataset.dropna(
    subset=['muk_lowcost_hourly_PM', 'muk_bam_hourly_PM']
)

In [None]:
# Feature matrix of shape (n, 1) holding the BAM values, and the 1-D target
# vector holding the low-cost values.
X_muk = hourly_combined_dataset[['muk_bam_hourly_PM']].values
y_muk = hourly_combined_dataset['muk_lowcost_hourly_PM'].values

In [None]:
# Distribution of the hourly low-cost PM2.5 values (the regression target).
plt.figure(figsize=(15,10))
seabornInstance.distplot(y_muk)
# tight_layout() only has something to arrange once the axes exist, so it is
# called after drawing rather than on the still-empty figure.
plt.tight_layout()

In [None]:
X_train_muk, X_test_muk, y_train_muk, y_test_muk = train_test_split(X_muk, y_muk, test_size=0.2, random_state=0)

In [None]:
regressor_muk = LinearRegression()  
regressor_muk.fit(X_train_muk, y_train_muk)

In [None]:
#To retrieve the intercept:
print(regressor_muk.intercept_)
#For retrieving the slope:
print(regressor_muk.coef_)

### How do we get the calibrated value from the raw value using the fitted equation?
### What are we comparing: raw vs. calibrated, or BAM vs. calibrated?

In [None]:
y_pred_muk = regressor_muk.predict(X_test_muk)

In [None]:
df_muk = pd.DataFrame({'Actual': y_test_muk.flatten(), 'Predicted': y_pred_muk.flatten()})
df_muk

In [None]:
# Bar chart of the first 50 test rows: actual vs. predicted low-cost PM2.5.
df1 = df_muk.head(50)
df1.plot(kind='bar',figsize=(16,10)) 
title =  'Hourly Actual Vs Predicted PM2.5 Values for MUK 15th Jul - 26th Aug 2020'
# The title string was previously assigned but never applied to the chart.
plt.title(title, fontsize=20)
plt.show()

In [None]:
# Test-set scatter (blue) with the fitted regression line (red).
plt.plot(X_test_muk, y_pred_muk, color='crimson', linewidth=2)
plt.scatter(X_test_muk, y_test_muk,  color='steelblue')
title =  'Hourly Actual Vs Predicted PM2.5 Values 15th Jul - 26th Aug 2020'
# The title string was previously assigned but never applied to the chart;
# axis labels are added so the figure can be read on its own.
plt.title(title)
plt.xlabel('BAM PM2.5')
plt.ylabel('Low-cost PM2.5')
plt.show()

In [None]:
# Correlation matrix of the Actual and Predicted columns, shown as an
# annotated heatmap; `correlations` holds the single Actual-vs-Predicted
# coefficient (computed here but not displayed in this cell).
actual_predicted_correlations =df_muk.corr()
correlations = df_muk['Actual'].corr(df_muk['Predicted'])
all_features_corr_plot = sns.heatmap(actual_predicted_correlations, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Test-set error metrics for the hourly simple linear regression.
mae_hourly = metrics.mean_absolute_error(y_test_muk, y_pred_muk)
mse_hourly = metrics.mean_squared_error(y_test_muk, y_pred_muk)
print('Mean Absolute Error:', mae_hourly)
print('Mean Squared Error:', mse_hourly)
print('Root Mean Squared Error:', np.sqrt(mse_hourly))

# Linear Regression model - Scikit-Learn - Daily

In [None]:
# Keep only the days for which both the low-cost and the BAM daily value
# exist, then show how many days remain.
daily_combined_dataset = daily_combined_dataset[ daily_combined_dataset['muk_lowcost_daily_PM'].notna()]
daily_combined_dataset = daily_combined_dataset[daily_combined_dataset['muk_bam_daily_PM'].notna()]
daily_combined_dataset['muk_bam_daily_PM'].shape

In [None]:
# Feature matrix of shape (n, 1) holding the daily BAM values, and the 1-D
# target vector holding the daily low-cost values; the cell displays the
# number of target samples.
X_daily = daily_combined_dataset[['muk_bam_daily_PM']].values
y_daily = daily_combined_dataset['muk_lowcost_daily_PM'].values
y_daily.shape[0]

In [None]:
# Distribution of the daily low-cost PM2.5 values (the regression target).
plt.figure(figsize=(15,10))
seabornInstance.distplot(y_daily)
# tight_layout() only has something to arrange once the axes exist, so it is
# called after drawing rather than on the still-empty figure.
plt.tight_layout()

In [None]:
X_train_daily, X_test_daily, y_train_daily, y_test_daily = train_test_split(X_daily, y_daily, test_size=0.2, random_state=0)

In [None]:
regressor_daily = LinearRegression()  
regressor_daily.fit(X_train_daily, y_train_daily) #training the algorithm

In [None]:
#To retrieve the intercept:
print(regressor_daily.intercept_)
#For retrieving the slope:
print(regressor_daily.coef_)

In [None]:
y_pred_daily = regressor_daily.predict(X_test_daily)

In [None]:
df_daily = pd.DataFrame({'Actual': y_test_daily.flatten(), 'Predicted': y_pred_daily.flatten()})
df_daily

In [None]:
# Bar chart of every daily test row: actual vs. predicted low-cost PM2.5.
# df1_daily is simply another name for df_daily (no copy is made).
df1_daily = df_daily
df1_daily.plot.bar(figsize=(16,10))
plt.show()

In [None]:
plt.plot(X_test_daily, y_pred_daily, color='crimson', linewidth=2)
plt.scatter(X_test_daily, y_test_daily,  color='steelblue')
plt.show()

In [None]:
# Residual plot for the daily model: predicted value on the x axis and
# residual (actual - predicted) on the y axis, with a zero reference line.
# The original drew the axes the other way round and left them unlabelled.
residuals = y_test_daily - y_pred_daily
plt.scatter(y_pred_daily, residuals)
plt.axhline(0, color='grey', linewidth=1)
plt.xlabel('Predicted low-cost PM2.5')
plt.ylabel('Residual (actual - predicted)')
plt.title('Daily model: residuals vs. predicted values')
plt.show()

In [None]:
# Correlation matrix of the daily Actual and Predicted columns, shown as an
# annotated heatmap; `correlations` holds the single Actual-vs-Predicted
# coefficient (computed here but not displayed in this cell).
actual_predicted_correlations =df_daily.corr()
correlations = df_daily['Actual'].corr(df_daily['Predicted'])
all_features_corr_plot = sns.heatmap(actual_predicted_correlations, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Test-set error metrics for the daily simple linear regression.
mae_daily = metrics.mean_absolute_error(y_test_daily, y_pred_daily)
mse_daily = metrics.mean_squared_error(y_test_daily, y_pred_daily)
print('Mean Absolute Error:', mae_daily)
print('Mean Squared Error:', mse_daily)
print('Root Mean Squared Error:', np.sqrt(mse_daily))

### saving and loading models

In [None]:
# Persist the hourly regressor to disk and read it straight back.
joblib.dump(regressor_muk, 'regressor_muk.pkl') 
regressor_loaded = joblib.load('regressor_muk.pkl')

In [None]:
# Persist the DAILY regressor. The original dumped regressor_muk (the hourly
# model) under the daily file name, so 'regressor_daily.pkl' contained the
# wrong model.
joblib.dump(regressor_daily, 'regressor_daily.pkl')
regressor_loaded = joblib.load('regressor_daily.pkl')

### Multiple linear regression

In [None]:
# Predictors for the multiple regression: BAM PM2.5, AT(C) and RH(%).
# X_MLRx keeps the labelled frame (its column names are reused later);
# X_MLR_muk is the same data as a plain array.
X_MLRx = hourly_combined_dataset[['muk_bam_hourly_PM','AT(C)','RH(%)']]
X_MLR_muk = X_MLRx.values
y_MLR_muk = hourly_combined_dataset['muk_lowcost_hourly_PM'].values
y_MLR_muk.shape

In [None]:
X_train_MLR_muk, X_test_MLR_muk, y_train_MLR_muk, y_test_MLR_muk = train_test_split(X_MLR_muk, y_MLR_muk, test_size=0.2, random_state=0)

In [None]:
regressor_MLR_muk = LinearRegression()  
regressor_MLR_muk.fit(X_train_MLR_muk, y_train_MLR_muk)

In [None]:
# A single-target LinearRegression has ONE scalar intercept. Passing it with
# the feature names as the index broadcast that scalar to every row, which
# read as if each feature had its own intercept; show it as one row instead.
intercept_df_muk = pd.DataFrame({'Intercept': [regressor_MLR_muk.intercept_]})
intercept_df_muk

In [None]:
# One fitted coefficient per predictor, labelled with the predictor names.
coeff_df_muk = pd.DataFrame(regressor_MLR_muk.coef_, X_MLRx.columns, columns=['Coefficient'])  
coeff_df_muk

In [None]:
y_pred_mlr_muk = regressor_MLR_muk.predict(X_test_MLR_muk)

In [None]:
df_mlr_muk = pd.DataFrame({'Actual': y_test_MLR_muk, 'Predicted': y_pred_mlr_muk})

In [None]:
# y_pred_mlr_muk

In [None]:
# plt.figure(figsize=(10,6))
# plt.plot(X_test_MLR_muk[:,0], y_pred_mlr_muk, color='crimson', linewidth=2)
# plt.scatter(X_test_MLR_muk[:,0], y_test_MLR_muk,  color='steelblue')
# title = 'Actual Vs Predicted PM2.5 Values 15th Jul - 26th Aug 2020'
# plt.title(title,fontsize=20)
# plt.show()


In [None]:
# Bar chart of the first 50 test rows of the hourly multiple regression.
df1_mlr_muk = df_mlr_muk.head(50)
ax = df1_mlr_muk.plot(kind='bar', figsize=(14,8))
title = 'Actual Vs Predicted PM2.5 Values 15th Jul - 26th Aug 2020'
plt.title(title, fontsize=20)
plt.show()

In [None]:
# Correlation matrix of Actual vs. Predicted for the hourly multiple
# regression, shown as an annotated heatmap; `correlations_x` holds the single
# coefficient (computed here but not displayed in this cell).
actual_predicted_correlationsx =df_mlr_muk.corr()
correlations_x= df_mlr_muk['Actual'].corr(df_mlr_muk['Predicted'])
all_features_corr_plot = sns.heatmap(actual_predicted_correlationsx, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Test-set error metrics for the hourly multiple linear regression.
mae_mlr_hourly = metrics.mean_absolute_error(y_test_MLR_muk, y_pred_mlr_muk)
mse_mlr_hourly = metrics.mean_squared_error(y_test_MLR_muk, y_pred_mlr_muk)
print('Mean Absolute Error:', mae_mlr_hourly)
print('Mean Squared Error:', mse_mlr_hourly)
print('Root Mean Squared Error:', np.sqrt(mse_mlr_hourly))

### Multiple linear regression - Daily, MUK

In [None]:
# Predictors for the daily multiple regression: BAM PM2.5, AT(C) and RH(%).
# X_MLRx_daily keeps the labelled frame (its column names are reused later);
# X_MLR_daily is the same data as a plain array.
X_MLRx_daily = daily_combined_dataset[['muk_bam_daily_PM','AT(C)','RH(%)']]
X_MLR_daily = X_MLRx_daily.values
y_MLR_daily = daily_combined_dataset['muk_lowcost_daily_PM'].values
y_MLR_daily.shape

In [None]:
X_train_MLR_daily, X_test_MLR_daily, y_train_MLR_daily, y_test_MLR_daily = train_test_split(X_MLR_daily, y_MLR_daily, test_size=0.2, random_state=0)

In [None]:
regressor_MLR_daily = LinearRegression()  
regressor_MLR_daily.fit(X_train_MLR_daily, y_train_MLR_daily)

In [None]:
# A single-target LinearRegression has ONE scalar intercept. Passing it with
# the feature names as the index broadcast that scalar to every row, which
# read as if each feature had its own intercept; show it as one row instead.
intercept_df_daily = pd.DataFrame({'Intercept': [regressor_MLR_daily.intercept_]})
intercept_df_daily

In [None]:
coeff_df_daily = pd.DataFrame(regressor_MLR_daily.coef_, X_MLRx_daily.columns, columns=['Coefficient'])  
coeff_df_daily

In [None]:
y_pred_mlr_daily = regressor_MLR_daily.predict(X_test_MLR_daily)

In [None]:
df_mlr_daily = pd.DataFrame({'Actual': y_test_MLR_daily, 'Predicted': y_pred_mlr_daily})

In [None]:
# Bar chart of (up to) the first 50 test rows of the daily multiple regression.
df1_mlr_daily = df_mlr_daily.head(50)
ax = df1_mlr_daily.plot(kind='bar', figsize=(14,8))
title = 'Actual Vs Predicted PM2.5 Values 15th Jul - 26th Aug 2020'
plt.title(title, fontsize=20)
plt.show()

In [None]:
# Correlation of Actual vs. Predicted for the DAILY multiple regression.
# The original cell was copied from the hourly section and still read
# df_mlr_muk, so it re-plotted the hourly correlations.
actual_predicted_correlationsx = df_mlr_daily.corr()
correlations_x = df_mlr_daily['Actual'].corr(df_mlr_daily['Predicted'])
all_features_corr_plot = sns.heatmap(actual_predicted_correlationsx, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
# Test-set error metrics for the DAILY multiple linear regression.
# The original cell was copied from the hourly section and still evaluated
# y_test_MLR_muk / y_pred_mlr_muk, so it reported the hourly errors again.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_MLR_daily, y_pred_mlr_daily))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_MLR_daily, y_pred_mlr_daily))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_MLR_daily, y_pred_mlr_daily)))

### Uncertainty - hourly

In [None]:
# def f(x):
#     return 1.20927538 * x + 8.516051947603252

In [None]:
# y_test_muk.shape

In [None]:
# xdata = X_test_muk.flatten()
# ydata = y_test_muk

In [None]:
# popt, pcov = curve_fit(func, xdata, ydata)
# popt

In [None]:
# plt.scatter(xdata, ydata, label='data')
# popt, pcov = curve_fit(func, xdata, ydata)
# popt
# plt.plot(xdata, func(xdata, *popt),
#          label="fit: a={}  , b={} ".format(popt[0],popt[1]))
# popt, pcov = curve_fit(func, xdata, ydata)
# popt
# plt.plot(xdata, func(xdata, *popt),
#          label="fit: a={}  , b={} ".format(popt[0],popt[1]))
# plt.xlabel('x')
# plt.ylabel('y')
# plt.legend()
# plt.show()

In [None]:
x = hourly_combined_dataset['muk_bam_hourly_PM'].values
y = hourly_combined_dataset['muk_lowcost_hourly_PM'].values
x.shape

In [None]:
# Number of paired observations (used for the R^2 computation).
n = len(y)

def f(x, a, b):
    """Straight-line model ``a * x + b`` used as the curve_fit target."""
    return a * x + b

# Least-squares fit of the line to (x, y): popt holds the fitted (a, b) and
# pcov their estimated covariance matrix.
popt, pcov = curve_fit(f, x, y)

In [None]:
# retrieve parameter values
a = popt[0]
b = popt[1]
print('Optimal Values')
print('a: ' + str(a))
print('b: ' + str(b))

In [None]:
# compute r^2
r2 = 1.0-(sum((y-f(x,a,b))**2)/((n-1.0)*np.var(y,ddof=1)))
print('R^2: ' + str(r2))

# calculate parameter confidence interval
a,b = unc.correlated_values(popt, pcov)
print('Uncertainty')
print('a: ' + str(a))
print('b: ' + str(b))

In [None]:
# plot data
plt.scatter(x, y, s=3, label='Data')

In [None]:
# calculate regression confidence interval
px = np.linspace(0, 200, 500)
py = a*px+b
nom = unp.nominal_values(py)
std = unp.std_devs(py)

In [None]:
nom

In [None]:
def predband(x, xd, yd, p, func, conf=0.95):
    """Lower and upper prediction band of a fitted model.

    Parameters
    ----------
    x : array-like
        Points at which the band is requested.
    xd, yd : numpy.ndarray
        Observed x and y data the model was fitted to.
    p : sequence
        Fitted parameters, passed to ``func`` as ``func(x, *p)``.
    func : callable
        Model function.
    conf : float, optional
        Confidence level of the band (default 0.95).

    Returns
    -------
    tuple
        ``(lower, upper)`` band values evaluated at ``x``.
    """
    n_obs = xd.size
    dof = n_obs - len(p)                      # residual degrees of freedom
    significance = 1.0 - conf
    t_crit = stats.t.ppf(1.0 - significance / 2.0, dof)

    # Residual standard error of the fit.
    residuals = yd - func(xd, *p)
    resid_se = np.sqrt(1. / dof * np.sum(residuals ** 2))

    # Squared distance of each requested point from the mean of the data,
    # relative to the total spread of the x data.
    x_mean = xd.mean()
    spread_ratio = ((x - x_mean) ** 2 / np.sum((xd - x_mean) ** 2))

    half_width = t_crit * resid_se * np.sqrt(1.0 + (1.0 / n_obs) + spread_ratio)
    best_fit = func(x, *p)
    return best_fit - half_width, best_fit + half_width

lpb, upb = predband(px, x, y, popt, f, conf=0.95)

In [None]:
# Scatter of the observed (x, y) pairs.
plt.figure(figsize=(10,6))
plt.scatter(x, y, s=3, label='Data')
# Fitted line: nominal (best-estimate) values on the px grid.
plt.plot(px, nom, c='black', label='y=a x + b')

# Confidence region of the fitted line: nominal value +/- 1.96 standard
# deviations of the propagated parameter uncertainty.
plt.plot(px, nom - 1.96 * std, c='steelblue',\
         label='95% Confidence Region')
plt.plot(px, nom + 1.96 * std, c='steelblue')
# Prediction band (lpb / upb), drawn as dashed black lines.
plt.plot(px, lpb, 'k--',label='95% Prediction Band')
plt.plot(px, upb, 'k--')
plt.ylabel('y')
plt.xlabel('x')
plt.legend(loc='best')

# Save first, then show, so the file is written from the finished figure.
plt.savefig('regression.png')
plt.show()

## US embassy low cost device


### Setting index as timestamp

In [None]:
# Parse created_at, move every timestamp forward by 3 hours, keep the first
# row for each resulting timestamp, index by it and drop the text column.
us_embassy_lowcost_data = (
    us_embassy_lowcost_data
    .assign(
        TimeStamp=lambda frame: pd.to_datetime(frame["created_at"])
        + datetime.timedelta(hours=3)
    )
    .drop_duplicates(subset="TimeStamp", keep="first")
    .set_index("TimeStamp")
    .drop(columns=["created_at"])
)
us_embassy_lowcost_data.head()

### Filtering outliers

In [None]:
# Keep only rows whose averaged PM2.5 reading lies in the interval (0, 500.4].
us_embassy_lowcost_data = us_embassy_lowcost_data.loc[
    lambda frame: (frame['Average_PM2.5_CF_1_ug/m3'] > 0)
    & (frame['Average_PM2.5_CF_1_ug/m3'] <= 500.4)
]

### Get data range

In [None]:
us_embassy_lowcost_data = us_embassy_lowcost_data.loc['2020-02-01':'2020-03-14']

### Hourly concentrations for the US embassy low-cost sensors

In [None]:
# Aggregate the raw readings into hourly bins: per-bin standard deviation and
# per-bin mean of every column, each rounded to 2 decimal places.
us_embassy_lowcost_hourly_std  = us_embassy_lowcost_data.resample('H').std().round(2)
us_embassy_lowcost_hourly_mean = us_embassy_lowcost_data.resample('H').mean().round(2)

In [None]:
# Line chart of the hourly mean PM2.5 measured by the US embassy low-cost device.
fig = plt.figure(figsize=(14,6))
plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
plt.plot(us_embassy_lowcost_hourly_mean.index, 
        us_embassy_lowcost_hourly_mean['Average_PM2.5_CF_1_ug/m3'],linestyle='solid', label='PM2.5 Average Values(Low cost)') #marker='o', 

chart_title =  'Hourly PM2.5 Concentrations 01st Feb to 14th Mar 2020 - US embassy lowcost device'
plt.title(chart_title,fontsize=20)
plt.ylabel('PM2.5', fontsize=20)
plt.xlabel('Date', fontsize=20)
plt.xticks(rotation=0)
plt.legend()
# Lay out and save BEFORE plt.show(): once the figure has been shown,
# plt.tight_layout() no longer targets it, so the original call order never
# tightened the saved image.
plt.tight_layout()
fig.savefig('hourly_lowcost_us_embassy.png')
plt.show()

### Daily concentrations - US embassy low-cost device

In [None]:
# Aggregate the raw readings into daily bins: per-bin standard deviation and
# per-bin mean of every column, each rounded to 2 decimal places.
us_embassy_lowcost_daily_std    = us_embassy_lowcost_data.resample('D').std().round(2)
us_embassy_lowcost_daily_mean   = us_embassy_lowcost_data.resample('D').mean().round(2)

In [None]:
# Line chart (with point markers) of the daily mean PM2.5 measured by the
# US embassy low-cost device.
plt.rcParams.update({'font.size':14, 'font.weight':'bold'})
fig = plt.figure(figsize=(14,6))
plt.plot(
    us_embassy_lowcost_daily_mean.index,
    us_embassy_lowcost_daily_mean['Average_PM2.5_CF_1_ug/m3'],
    label='PM2.5 Average Values',
    linestyle='solid',
    marker='o',
)

chart_title = 'Daily PM2.5 Concentration 01st Feb to 14 Mar 2020 - US embassy lowcost'
plt.title(chart_title, fontsize=20)
plt.xlabel('Date', fontsize=20)
plt.ylabel('PM2.5 Concentration', fontsize=20)
plt.xticks(rotation=0)
plt.legend()
plt.show()
fig.savefig('daily_lowcost_us_embassy.png')

## US embassy BAM 

In [None]:
us_embassy_bam_data.columns

In [None]:
# Drop the site / date-part / AQI / QC columns that the comparison does not use.
us_embassy_bam_data = us_embassy_bam_data.drop(['Site','Parameter','Year','Month','Day','Hour','AQI','AQI Category','NowCast Conc.','Conc. Unit','Duration','QC Name'], axis=1)
us_embassy_bam_data.head()

###  Set Timestamp as index

In [None]:
# Parse "Date (LT)" into a TimeStamp column, keep the first row for each
# timestamp, index by it and discard the original text column.
us_embassy_bam_data = (
    us_embassy_bam_data
    .assign(TimeStamp=lambda frame: pd.to_datetime(frame["Date (LT)"]))
    .drop_duplicates(subset="TimeStamp", keep="first")
    .set_index("TimeStamp")
    .drop(columns=["Date (LT)"])
)
us_embassy_bam_data.tail()

### Filter outliers

In [None]:
# Keep only rows whose raw BAM concentration lies in the interval (0, 500.4].
us_embassy_bam_data = us_embassy_bam_data.loc[
    lambda frame: (frame['Raw Conc.'] > 0) & (frame['Raw Conc.'] <= 500.4)
]

### Getting upper and lower date boundaries

In [None]:
# First and last timestamps of the hourly US embassy low-cost series,
# rendered as 'YYYY-MM-DD HH:MM:SS' strings for slicing the BAM frame.
us_embassy_lowcost_hourly_mean_lower_date = pd.Timestamp(
    us_embassy_lowcost_hourly_mean.index.values[0]
).strftime('%Y-%m-%d %H:%M:%S')

print(us_embassy_lowcost_hourly_mean_lower_date)

us_embassy_lowcost_hourly_mean_upper_date = pd.Timestamp(
    us_embassy_lowcost_hourly_mean.index.values[-1]
).strftime('%Y-%m-%d %H:%M:%S')
print(us_embassy_lowcost_hourly_mean_upper_date)

In [None]:
# First and last timestamps of the US embassy BAM series, rendered as
# 'YYYY-MM-DD HH:MM:SS' strings (printed for inspection).
us_embassy_bam_data_lower_date = pd.Timestamp(
    us_embassy_bam_data.index.values[0]
).strftime('%Y-%m-%d %H:%M:%S')

print(us_embassy_bam_data_lower_date)

us_embassy_bam_data_upper_date = pd.Timestamp(
    us_embassy_bam_data.index.values[-1]
).strftime('%Y-%m-%d %H:%M:%S')
print(us_embassy_bam_data_upper_date)

In [None]:
# Restrict the BAM data to the period covered by the hourly low-cost series
# (label-based slice between the two boundary strings computed earlier).
hourly_same_range_us = us_embassy_bam_data[us_embassy_lowcost_hourly_mean_lower_date:us_embassy_lowcost_hourly_mean_upper_date]
hourly_same_range_us.head()

In [None]:
hourly_same_range_us.shape

### Add Time column for merging the two datasets

In [None]:
# Only PM2.5 is compared, so the PM10 columns are removed; the index is then
# mirrored into a regular "Time" column that serves as the merge key.
us_embassy_lowcost_hourly_mean = us_embassy_lowcost_hourly_mean.drop(
    columns=['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3', 'Average_PM10_CF_1_ug/m3']
)
hourly_timestamp = us_embassy_lowcost_hourly_mean.index.values
us_embassy_lowcost_hourly_mean["Time"] = pd.to_datetime(hourly_timestamp)
us_embassy_lowcost_hourly_mean.head()

In [None]:
# hourly_same_range_us is a slice of us_embassy_bam_data. Take an explicit
# copy before adding the "Time" merge key so the assignment writes to an
# independent frame and does not trigger pandas' SettingWithCopyWarning.
hourly_same_range_us = hourly_same_range_us.copy()
hourly_timestamp = hourly_same_range_us.index.values
hourly_same_range_us["Time"] = pd.to_datetime(hourly_timestamp)

hourly_same_range_us.head()

In [None]:
# Inner-join the two hourly frames on "Time" and give the two PM2.5 columns
# device-specific names, then persist the merged table.
hourly_combined_dataset_us_embassy = (
    us_embassy_lowcost_hourly_mean
    .merge(hourly_same_range_us, on='Time')
    .rename(columns={
        'Average_PM2.5_CF_1_ug/m3': 'lowcost_PM2.5',
        'Raw Conc.': 'bam_PM2.5',
    })
)

hourly_combined_dataset_us_embassy.to_csv('hourly_US_embassy_lowcost.csv')

hourly_combined_dataset_us_embassy.head()

In [None]:
# hourly_combined_dataset_us_embassy['bam_PM2.5'] = hourly_combined_dataset_us_embassy['bam_PM2.5'].shift(-1)

In [None]:
# Hourly time series: low-cost average and BAM PM2.5 drawn against the merged
# "Time" column, then shown and written to disk.
plt.rcParams.update({'font.size': 14, 'font.weight': 'bold'})
fig, ax = plt.subplots(figsize=(14, 6))
for pm25_column, legend_label in (
    ('lowcost_PM2.5', 'PM2.5 Average Values(Low cost)'),
    ('bam_PM2.5', 'PM2.5 Values(BAM)'),
):
    ax.plot(hourly_combined_dataset_us_embassy['Time'],
            hourly_combined_dataset_us_embassy[pm25_column],
            linestyle='solid', label=legend_label)

chart_title = 'Lowcost Vs US Embassy BAM PM2.5 Measurements'
ax.set_title(chart_title, fontsize=20)
ax.set_ylabel('PM2.5', fontsize=20)
ax.set_xlabel('Date', fontsize=20)
plt.xticks(rotation=0)
ax.legend()
fig.tight_layout()
plt.show()
fig.savefig('hourly_US_lowcost')

### Daily concentrations - US embassy

In [None]:
# Aggregate the low-cost readings per calendar day: standard deviation and
# mean, each rounded to two decimals.
us_embassy_lowcost_by_day = us_embassy_lowcost_data.resample('D')
us_embassy_lowcost_daily_std = us_embassy_lowcost_by_day.std().round(2)
us_embassy_lowcost_daily_mean = us_embassy_lowcost_by_day.mean().round(2)

### Add Time column for merging the two datasets

In [None]:
# Drop the PM10 columns from the daily low-cost means, then expose the index
# as a datetime "Time" column so it can be used as the merge key.
# errors='ignore' keeps the cell re-runnable: the frame is rebound to its own
# name, so on a second execution the PM10 columns are already gone and a plain
# drop would raise KeyError.
us_embassy_lowcost_daily_mean = us_embassy_lowcost_daily_mean.drop(
    columns=['Sensor1 PM10_CF_1_ug/m3', 'Sensor2 PM10_CF_1_ug/m3', 'Average_PM10_CF_1_ug/m3'],
    errors='ignore',
)
us_embassy_lowcost_daily_mean["Time"] = pd.to_datetime(us_embassy_lowcost_daily_mean.index.values)

us_embassy_lowcost_daily_mean.head()

In [None]:
# Add a datetime "Time" column to the BAM frame, taken from its index.
us_embassy_bam_data["Time"] = pd.to_datetime(us_embassy_bam_data.index.values)

us_embassy_bam_data.head()

In [None]:
# The BAM frame is used as-is for the hourly comparison, so it holds hourly
# readings. Merging the daily low-cost means against it on an exact timestamp
# would keep only the BAM reading stamped 00:00 of each day, not a daily value.
# Average the numeric BAM columns per calendar day first (rounded to two
# decimals like the low-cost daily means) and drop days without any reading.
us_embassy_bam_daily_mean = (
    us_embassy_bam_data
    .select_dtypes(include='number')
    .resample('D')
    .mean()
    .round(2)
    .dropna(how='all')
)
us_embassy_bam_daily_mean["Time"] = pd.to_datetime(us_embassy_bam_daily_mean.index.values)

# Inner join on the daily timestamps, then give the two PM2.5 series short names.
daily_combined_dataset_us = pd.merge(
    us_embassy_lowcost_daily_mean, us_embassy_bam_daily_mean, on='Time'
).rename(
    columns={'Average_PM2.5_CF_1_ug/m3': 'lowcost_daily_PM2.5', 'Raw Conc.': 'bam_daily_PM2.5'}
)
daily_combined_dataset_us.to_csv('Daily_US_PMS.csv')

daily_combined_dataset_us.head()

In [None]:
# Daily time series: low-cost and BAM PM2.5 drawn against the merged "Time"
# column, then shown and written to disk.
plt.rcParams.update({'font.size': 14, 'font.weight': 'bold'})
fig, ax = plt.subplots(figsize=(14, 6))
for pm25_column, legend_label in (
    ('lowcost_daily_PM2.5', 'PM2.5 Average Values(Low cost)'),
    ('bam_daily_PM2.5', 'PM2.5 Values(BAM)'),
):
    ax.plot(daily_combined_dataset_us['Time'],
            daily_combined_dataset_us[pm25_column],
            linestyle='solid', label=legend_label)

chart_title = 'low cost Vs BAM PM2.5 Daily Measurements - US Embassy'
ax.set_title(chart_title, fontsize=20)
ax.set_ylabel('PM2.5', fontsize=20)
ax.set_xlabel('Date', fontsize=20)
plt.xticks(rotation=0)
ax.legend()
fig.tight_layout()
plt.show()
fig.savefig('daily_US_PMS.png')

## Linear Regression model - Scikit-Learn - Hourly

In [None]:
# Keep only rows where both PM2.5 series are present; the regression below
# cannot take missing values.
hourly_combined_dataset_us_embassy = hourly_combined_dataset_us_embassy.dropna(
    subset=['lowcost_PM2.5', 'bam_PM2.5']
)
print(hourly_combined_dataset_us_embassy['lowcost_PM2.5'])

In [None]:
# Feature matrix: BAM PM2.5 as a single column of shape (n, 1).
X_us_embassy = hourly_combined_dataset_us_embassy['bam_PM2.5'].values.reshape((-1, 1))
# Target vector: low-cost PM2.5 as a flat array.
y_us_embassy = hourly_combined_dataset_us_embassy['lowcost_PM2.5'].values
for regression_array in (X_us_embassy, y_us_embassy):
    print(regression_array)

In [None]:
# Distribution plot of the hourly low-cost PM2.5 values (the regression target).
plt.figure(figsize=(15,10))
# NOTE(review): tight_layout() runs before anything is drawn on the figure — confirm this ordering is intended.
plt.tight_layout()
# NOTE(review): distplot is reported as deprecated in newer seaborn releases (histplot/displot) — confirm against the installed version.
seabornInstance.distplot(y_us_embassy)

In [None]:
# Hold out 20% of the hourly pairs for evaluation; random_state=0 fixes the shuffle.
(X_train_us_embassy, X_test_us_embassy,
 y_train_us_embassy, y_test_us_embassy) = train_test_split(
    X_us_embassy,
    y_us_embassy,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Fit a linear regression on the training split: BAM PM2.5 is the single
# feature (X) and low-cost PM2.5 is the target (y).
regressor_us_embassy = LinearRegression()  
regressor_us_embassy.fit(X_train_us_embassy, y_train_us_embassy)

In [None]:
# Fitted parameters, one per line: the intercept first, then the slope array.
print(regressor_us_embassy.intercept_, regressor_us_embassy.coef_, sep='\n')

In [None]:
# Model output for the held-out BAM values; compared with y_test_us_embassy below.
y_pred_us_embassy = regressor_us_embassy.predict(X_test_us_embassy)

In [None]:
# Side-by-side table of held-out low-cost values and the model's output.
actual_us_embassy = y_test_us_embassy.flatten()
predicted_us_embassy = y_pred_us_embassy.flatten()
df_us_embassy = pd.DataFrame({'Actual': actual_us_embassy, 'Predicted': predicted_us_embassy})
df_us_embassy

In [None]:
# Bar chart of the first 25 actual/predicted pairs.
df1_us_embassy = df_us_embassy.iloc[:25]
df1_us_embassy.plot.bar(figsize=(16, 10))
plt.show()

In [None]:
# Held-out points with the fitted line on top. X holds the BAM PM2.5 values
# and y the low-cost PM2.5 values, so the axes are labelled accordingly; the
# title and legend let the figure stand alone when the notebook is skimmed.
plt.scatter(X_test_us_embassy, y_test_us_embassy, color='steelblue', label='Test data')
plt.plot(X_test_us_embassy, y_pred_us_embassy, color='crimson', linewidth=2, label='Linear fit')
plt.xlabel('BAM PM2.5')
plt.ylabel('Low cost PM2.5')
plt.title('Low cost vs BAM PM2.5 - US Embassy (hourly, test split)')
plt.legend()
plt.show()

In [None]:
# Error of the fit on the held-out split; RMSE is derived from the MSE.
mae_us_embassy = metrics.mean_absolute_error(y_test_us_embassy, y_pred_us_embassy)
mse_us_embassy = metrics.mean_squared_error(y_test_us_embassy, y_pred_us_embassy)
print('Mean Absolute Error:', mae_us_embassy)
print('Mean Squared Error:', mse_us_embassy)
print('Root Mean Squared Error:', np.sqrt(mse_us_embassy))

In [None]:
# Correlation between held-out values and model output: the scalar
# coefficient, the full matrix, and an annotated heatmap of the matrix.
correlations_us = df_us_embassy.Actual.corr(df_us_embassy.Predicted)
correlations_us_embassy = df_us_embassy.corr()
all_features_corr = sns.heatmap(
    correlations_us_embassy,
    cmap=plt.cm.Reds,
    annot=True,
)
plt.show()

### Uncertainty - Hourly

In [None]:
# xx = hourly_combined_dataset_us_embassy['bam_PM2.5'].values
# yy = hourly_combined_dataset_us_embassy['lowcost_PM2.5'].values
# xx.shape

In [None]:
# n = len(yy)

# def f(x, a, b):
#     return a * x + b

# popt, pcov = curve_fit(f, xx, yy)

In [None]:
# # retrieve parameter values
# a = popt[0]
# b = popt[1]
# print('Optimal Values')
# print('a: ' + str(a))
# print('b: ' + str(b))

In [None]:
# # compute r^2
# r2 = 1.0-(sum((y-f(x,a,b))**2)/((n-1.0)*np.var(y,ddof=1)))
# print('R^2: ' + str(r2))

# # calculate parameter confidence interval
# a,b = unc.correlated_values(popt, pcov)
# print('Uncertainty')
# print('a: ' + str(a))
# print('b: ' + str(b))

In [None]:
# # plot data
# plt.scatter(x, y, s=3, label='Data')

In [None]:
# # calculate regression confidence interval
# px = np.linspace(0, 250, 1000)
# py = a*px+b
# nom = unp.nominal_values(py)
# std = unp.std_devs(py)

In [None]:
# def predband(x, xd, yd, p, func, conf=0.95):
#     # x = requested points
#     # xd = x data
#     # yd = y data
#     # p = parameters
#     # func = function name
#     alpha = 1.0 - conf    # significance
#     N = xd.size          # data sample size
#     var_n = len(p)  # number of parameters
#     # Quantile of Student's t distribution for p=(1-alpha/2)
#     q = stats.t.ppf(1.0 - alpha / 2.0, N - var_n)
#     # Stdev of an individual measurement
#     se = np.sqrt(1. / (N - var_n) * \
#                  np.sum((yd - func(xd, *p)) ** 2))
#     # Auxiliary definitions
#     sx = (x - xd.mean()) ** 2
#     sxd = np.sum((xd - xd.mean()) ** 2)
#     # Predicted values (best-fit model)
#     yp = func(x, *p)
#     # Prediction band
#     dy = q * se * np.sqrt(1.0+ (1.0/N) + (sx/sxd))
#     # Upper & lower prediction bands.
#     lpb, upb = yp - dy, yp + dy
#     return lpb, upb

# lpb, upb = predband(px, x, y, popt, f, conf=0.95)

In [None]:
# # plot data
# plt.figure(figsize=(10,6))
# plt.scatter(x, y, s=3, label='Data')
# # plot the regression
# plt.plot(px, nom, c='black', label='y=a x + b')

# # uncertainty lines (95% confidence)
# plt.plot(px, nom - 1.96 * std, c='steelblue',\
#          label='95% Confidence Region')
# plt.plot(px, nom + 1.96 * std, c='steelblue')
# # prediction band (95% confidence)
# plt.plot(px, lpb, 'k--',label='95% Prediction Band')
# plt.plot(px, upb, 'k--')
# plt.ylabel('y')
# plt.xlabel('x')
# plt.legend(loc='best')

# # save and show figure
# plt.savefig('regression.png')
# plt.show()

### Prediction of low cost MUK concentrations using US embassy model - hourly

In [None]:
# Apply a linear model with hard-coded intercept and slope to the US Embassy
# low-cost PM2.5 values (y_us_embassy).
# NOTE(review): the two literals are not read from a fitted regressor in this
# cell — confirm which model's intercept_/coef_ they were copied from, and
# whether y_us_embassy is the intended input for a result named prediction_muk.
prediction_muk = 1.3441848663111386 + 0.79561658 * y_us_embassy # prediction = regressor.intercept_ + regressor.coef_ * raw

In [None]:
# Pair each input value ('Actual', the US Embassy low-cost PM2.5) with the
# output of the hard-coded linear model ('Predicted').
df_predicted_muk = pd.DataFrame({'Actual': y_us_embassy.flatten(), 'Predicted': prediction_muk.flatten()})

In [None]:
# Keep the first 50 rows and draw them as grouped bars.
df_predicted_muk = df_predicted_muk.iloc[:50]
df_predicted_muk.plot.bar(figsize=(16, 10))

plt.show()

In [None]:
# Error between the US Embassy low-cost values and the hard-coded model's
# output; RMSE is derived from the MSE.
# NOTE(review): prediction_muk is itself computed from y_us_embassy — confirm
# this is the intended reference series.
mae_prediction_muk = metrics.mean_absolute_error(y_us_embassy, prediction_muk)
mse_prediction_muk = metrics.mean_squared_error(y_us_embassy, prediction_muk)
print('Mean Absolute Error:', mae_prediction_muk)
print('Mean Squared Error:', mse_prediction_muk)
print('Root Mean Squared Error:', np.sqrt(mse_prediction_muk))

### Prediction of low cost US embassy concentrations using MUK model - hourly

In [None]:
# Apply a linear model with hard-coded intercept and slope to y_muk.
# NOTE(review): y_muk is defined outside this section — presumably the MUK
# low-cost PM2.5 array; verify. The two literals are not read from a fitted
# regressor in this cell — confirm which model they were copied from and
# whether y_muk is the intended input for a result named prediction_us_embassy.
prediction_us_embassy = 8.516051947603252 + 1.20927538 * y_muk # prediction = regressor.intercept_ + regressor.coef_ * raw

In [None]:
# Pair each input value ('Actual', taken from y_muk) with the output of the
# hard-coded linear model ('Predicted').
df_predicted_us_embassy = pd.DataFrame({'Actual': y_muk.flatten(), 'Predicted': prediction_us_embassy.flatten()})

In [None]:
# Keep the first 50 rows and draw them as grouped bars.
df_predicted_us_embassy = df_predicted_us_embassy.iloc[:50]
df_predicted_us_embassy.plot.bar(figsize=(16, 10))
plt.show()

In [None]:
# Error between X_muk and the hard-coded model's output; RMSE is derived from
# the MSE.
# NOTE(review): the reference here is X_muk, while the 'Actual' column built
# for this section uses y_muk and the sibling MUK-prediction cell scores
# against its y array — confirm which reference series is intended.
mae_prediction_us_embassy = metrics.mean_absolute_error(X_muk, prediction_us_embassy)
mse_prediction_us_embassy = metrics.mean_squared_error(X_muk, prediction_us_embassy)
print('Mean Absolute Error:', mae_prediction_us_embassy)
print('Mean Squared Error:', mse_prediction_us_embassy)
print('Root Mean Squared Error:', np.sqrt(mse_prediction_us_embassy))