In [25]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Import data
df_mean = pd.read_csv("pm25_mean.csv")  


df_mean


Unnamed: 0,Country,Continent,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,Afghanistan,Asia,68.97,66.94,68.26,72.18,68.06,67.20,64.00,61.78,70.25,62.49
1,Albania,Europe,21.95,23.00,21.70,19.77,19.13,18.98,18.15,19.01,19.76,16.28
2,Algeria,Africa,21.27,21.56,22.76,21.19,22.64,22.49,23.06,23.33,22.73,22.68
3,Andorra,Europe,11.43,11.66,10.80,10.15,9.35,9.94,9.06,9.31,9.21,8.52
4,Angola,Africa,24.20,24.67,24.01,24.53,24.12,25.03,26.08,25.44,25.85,27.16
...,...,...,...,...,...,...,...,...,...,...,...,...
190,Venezuela (Bolivarian Republic of),America,20.67,20.71,19.87,21.00,19.84,20.12,19.65,18.71,17.06,16.21
191,Viet Nam,Asia,22.37,20.67,21.67,22.62,20.08,20.47,22.37,20.10,20.09,20.89
192,Yemen,Asia,38.83,41.03,42.49,41.02,38.27,41.06,39.91,41.32,41.62,41.61
193,Zambia,Africa,17.09,17.33,17.55,17.24,17.29,17.43,17.32,16.99,16.99,16.90


In [26]:
# Select all countries from 2010-2019
X = df_mean.iloc[:, 2:10]
y = df_mean.iloc[:, 11:12]


# Splitting into train and test sets
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

# Predict using Linear Regression
model_mean = LinearRegression()
model_mean.fit(x_train, y_train)

predict_2019 = model_mean.predict(x_test)

predict_2019

array([[18.36964342],
       [13.77129507],
       [15.29228188],
       [44.30810351],
       [20.92287077],
       [ 6.30507563],
       [ 5.71997653],
       [ 9.28603172],
       [17.92532375],
       [32.61266659],
       [14.85973382],
       [12.67018593],
       [10.65158057],
       [43.30061775],
       [40.13333984],
       [18.46083064],
       [40.518629  ],
       [ 7.12172199],
       [11.28127402],
       [ 8.95245788],
       [ 8.26645106],
       [21.69592888],
       [14.23649221],
       [17.50919074],
       [11.74642481],
       [18.88570503],
       [10.3115785 ],
       [12.32804461],
       [ 7.58044376],
       [36.09122955],
       [10.04883617],
       [42.87556939],
       [15.0997752 ],
       [49.53745937],
       [21.99547675],
       [60.46580273],
       [15.905424  ],
       [26.2155525 ],
       [24.6338537 ]])

In [27]:
# Testing the error of the model

mse_2019 = mean_squared_error(y_test, predict_2019)
mse_2019
# mse = 1.12, pretty low, so pretty accurate

1.017754313996736

In [4]:
r2 = r2_score(y_test, predict_2019)
r2
# r2 = 0.99. Very close to 1, so pretty accurate

0.9943970953123386

### Using the linear regression model to predict the levels of each country in 2020-2023

In [33]:
# Training a linear regression model and looping through each country to predict levels for 2020-2023

years = df_mean.columns[2:11]
X = np.array(years).reshape(-1, 1)


countries = df_mean['Country'].unique()

for country in countries:
    y_country = df_mean.loc[df_mean['Country'] == country, years]
    y_country = np.array(y_country).reshape(-1,1)
    
    model.fit(X, y_country)
    
    df_mean.loc[df_mean['Country'] == country, '2020'] = round(model.predict(np.array([[2020]]))[0][0], 2)
    df_mean.loc[df_mean['Country'] == country, '2021'] = round(model.predict(np.array([[2021]]))[0][0], 2)
    df_mean.loc[df_mean['Country'] == country, '2022'] = round(model.predict(np.array([[2022]]))[0][0], 2)
    df_mean.loc[df_mean['Country'] == country, '2023'] = round(model.predict(np.array([[2023]]))[0][0], 2)



In [34]:
df_mean

Unnamed: 0,Country,Continent,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
0,Afghanistan,Asia,68.97,66.94,68.26,72.18,68.06,67.20,64.00,61.78,70.25,62.49,65.13,64.73,64.33,63.94
1,Albania,Europe,21.95,23.00,21.70,19.77,19.13,18.98,18.15,19.01,19.76,16.28,17.30,16.82,16.35,15.87
2,Algeria,Africa,21.27,21.56,22.76,21.19,22.64,22.49,23.06,23.33,22.73,22.68,23.64,23.86,24.08,24.29
3,Andorra,Europe,11.43,11.66,10.80,10.15,9.35,9.94,9.06,9.31,9.21,8.52,8.14,7.81,7.49,7.16
4,Angola,Africa,24.20,24.67,24.01,24.53,24.12,25.03,26.08,25.44,25.85,27.16,26.24,26.46,26.69,26.91
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,Venezuela (Bolivarian Republic of),America,20.67,20.71,19.87,21.00,19.84,20.12,19.65,18.71,17.06,16.21,17.56,17.20,16.84,16.47
191,Viet Nam,Asia,22.37,20.67,21.67,22.62,20.08,20.47,22.37,20.10,20.09,20.89,20.00,19.81,19.62,19.42
192,Yemen,Asia,38.83,41.03,42.49,41.02,38.27,41.06,39.91,41.32,41.62,41.61,41.31,41.42,41.54,41.65
193,Zambia,Africa,17.09,17.33,17.55,17.24,17.29,17.43,17.32,16.99,16.99,16.90,17.08,17.05,17.02,16.99


In [None]:
### Creating a dataframe and csv of just predictions for 2020-2023

df_predict = df_mean.iloc[:, [0, 1] + list(range(12, 16))]
df_predict

df_predict.to_csv('predictions_pm25.csv', index=False)



### Predicting when each country will reach WHO's Air Quality Guidnance
in progress


In [32]:
# Assuming you have loaded your dataset into df_mean
from sklearn.linear_model import LinearRegression

# Extract the years as a list from the column headers (columns 2 to 11)
years = df_mean.columns[2:11]
X = np.array(years).reshape(-1, 1)

# Loop through each country
countries = df_mean['Country'].unique()

for country in countries:
    # Extract the PM2.5 values for the current country
    y_country = df_mean.loc[df_mean['Country'] == country, years]
    y_country = np.array(y_country).reshape(-1,1)
    
    # Train the model
    model.fit(X, y_country)
    
    # Create hypothetical data for 5.5 PM2.5 value
    y_hypothetical = np.array([[5.5]])
    
    # Predict the year
    predicted_year = round(model.predict(y_hypothetical)[0][0], 2)
    
    print(f"The predicted year for {country} to reach a PM2.5 value of 5.5 is {predicted_year}")


The predicted year for Afghanistan to reach a PM2.5 value of 5.5 is 866.23
The predicted year for Albania to reach a PM2.5 value of 5.5 is 978.22
The predicted year for Algeria to reach a PM2.5 value of 5.5 is -414.51
The predicted year for Andorra to reach a PM2.5 value of 5.5 is 666.88
The predicted year for Angola to reach a PM2.5 value of 5.5 is -428.71
The predicted year for Antigua and Barbuda to reach a PM2.5 value of 5.5 is -312.17
The predicted year for Argentina to reach a PM2.5 value of 5.5 is 12.76
The predicted year for Armenia to reach a PM2.5 value of 5.5 is 1893.19
The predicted year for Australia to reach a PM2.5 value of 5.5 is 448.85
The predicted year for Austria to reach a PM2.5 value of 5.5 is 919.58
The predicted year for Azerbaijan to reach a PM2.5 value of 5.5 is 1472.31
The predicted year for Bahamas to reach a PM2.5 value of 5.5 is -195.53
The predicted year for Bahrain to reach a PM2.5 value of 5.5 is 216.39
The predicted year for Bangladesh to reach a PM2.5