### 1. Get data ready

In [21]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("pollution-and-mortality.csv")
data.head()

Unnamed: 0,Country,Year,PM2.5,PM10,TotalDeaths,PMDeaths,CitiesCount
0,Albania,2015,21.79,32.415,42.299773,21.786089,2
1,Albania,2016,21.48,32.385,41.018788,20.810664,2
2,Argentina,2015,10.26,27.87,33.086924,29.159094,1
3,Australia,2010,8.04,15.323333,13.571713,13.14038,3
4,Australia,2011,7.316667,14.443333,13.727626,13.276676,3


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Country      333 non-null    object 
 1   Year         333 non-null    int64  
 2   PM2.5        333 non-null    float64
 3   PM10         333 non-null    float64
 4   TotalDeaths  333 non-null    float64
 5   PMDeaths     333 non-null    float64
 6   CitiesCount  333 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 18.3+ KB


In [4]:
data.describe()

Unnamed: 0,Year,PM2.5,PM10,TotalDeaths,PMDeaths,CitiesCount
count,333.0,333.0,333.0,333.0,333.0,333.0
mean,2014.18018,23.141074,45.414597,39.023378,26.177802,16.471471
std,2.196101,21.263261,46.694763,31.218198,14.313898,29.233531
min,2010.0,4.578462,6.03,8.401696,7.544358,1.0
25%,2013.0,11.256667,19.126667,18.114643,16.033366,2.0
50%,2014.0,16.46,27.8625,28.047017,22.430605,5.0
75%,2016.0,25.0,51.3,46.775579,33.505289,14.0
max,2017.0,132.0,395.333333,160.587443,73.723273,197.0


In [5]:
# Let's try to create a model that can predict the number of PMDeaths given the dataset above...
X = data.drop('PMDeaths', axis=1)
y = data['PMDeaths']

In [6]:
X.head(3)

Unnamed: 0,Country,Year,PM2.5,PM10,TotalDeaths,CitiesCount
0,Albania,2015,21.79,32.415,42.299773,2
1,Albania,2016,21.48,32.385,41.018788,2
2,Argentina,2015,10.26,27.87,33.086924,1


In [7]:
y.head(3)

0    21.786089
1    20.810664
2    29.159094
Name: PMDeaths, dtype: float64

In [8]:
# Country, Year and CitiesCount are all categorical and should be converted to numerical form...
# data['Year'].value_counts(), data['CitiesCount'].value_counts()

In [9]:
# Turn categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Country', 'Year', 'CitiesCount']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot',
                                  one_hot,
                                  categorical_features)],
                                  remainder='passthrough')
transformed_X = transformer.fit_transform(X)
pd.DataFrame(transformed_X)

Unnamed: 0,0
0,"(0, 0)\t1.0\n (0, 79)\t1.0\n (0, 83)\t1.0\..."
1,"(0, 0)\t1.0\n (0, 80)\t1.0\n (0, 83)\t1.0\..."
2,"(0, 1)\t1.0\n (0, 79)\t1.0\n (0, 82)\t1.0\..."
3,"(0, 2)\t1.0\n (0, 74)\t1.0\n (0, 84)\t1.0\..."
4,"(0, 2)\t1.0\n (0, 75)\t1.0\n (0, 84)\t1.0\..."
...,...
328,"(0, 72)\t1.0\n (0, 78)\t1.0\n (0, 115)\t1...."
329,"(0, 72)\t1.0\n (0, 79)\t1.0\n (0, 118)\t1...."
330,"(0, 72)\t1.0\n (0, 80)\t1.0\n (0, 115)\t1...."
331,"(0, 72)\t1.0\n (0, 81)\t1.0\n (0, 117)\t1...."


In [10]:
dummies = pd.get_dummies(data[['Country', 'Year', 'CitiesCount']])
dummies.head()

Unnamed: 0,Year,CitiesCount,Country_Albania,Country_Argentina,Country_Australia,Country_Austria,Country_Bahrain,Country_Bangladesh,Country_Belgium,Country_Bosnia and Herzegovina,...,Country_Spain,Country_Sweden,Country_Switzerland,Country_Thailand,Country_Trinidad and Tobago,Country_Turkey,Country_Uganda,Country_United Arab Emirates,Country_United Kingdom,Country_Uruguay
0,2015,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2016,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2015,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2010,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2011,3,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2. Choose model

In [11]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 3. Fit model to training data

In [23]:
np.random.seed(42)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9100855054609152

### 4. Evaluate model

In [25]:
y_preds = model.predict(X_test)
y_preds

array([48.12327653,  8.61286038, 29.54983865, 21.38650704, 20.51505876,
       45.11865002, 32.95165836, 33.04767513, 48.05651008, 34.48175607,
       28.05255604,  9.29611374, 28.89913865, 17.87244066, 31.11017124,
       43.70797235, 13.36365683, 60.84293661, 29.06489327, 35.93475216,
        8.65881876, 30.94005129, 11.53333203, 32.28712084, 11.26474787,
        8.7348445 , 54.89081151, 33.50792415, 34.77930764, 33.97679699,
       30.48647282, 13.30483733, 20.37468108, 28.80153813, 42.4043403 ,
       18.25684822, 30.90254219, 31.04792983, 42.61967601, 10.8627882 ,
       37.32291884, 41.3310097 , 29.01111618, 30.70800494,  7.89821163,
       29.88961635, 28.61616482, 43.03607006, 23.69585882, 12.70063902,
       32.12437312, 12.65679485, 16.40691378, 15.80177862, 11.16583986,
       37.55820393, 10.27152063, 22.08818606, 16.42875616, 33.43323107,
       47.02152752, 11.66928281, 35.81075125, 12.61986477, 25.10199953,
        8.49304346, 49.26936085])

In [27]:
model.score(X_train, y_train)

0.9770746635492066

In [29]:
model.score(X_test, y_test)

0.9100855054609152

### 5. Improve model

In [35]:
np.random.seed(42)
for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators...')
    model = RandomForestRegressor(n_estimators=i).fit(X_train, y_train)
    print(f'Coefficent of determination R^2 of the models prediction: {model.score(X_test, y_test):.4f}')
    print('')

Trying model with 10 estimators...
Coefficent of determination R^2 of the models prediction: 0.9007

Trying model with 20 estimators...
Coefficent of determination R^2 of the models prediction: 0.8900

Trying model with 30 estimators...
Coefficent of determination R^2 of the models prediction: 0.9013

Trying model with 40 estimators...
Coefficent of determination R^2 of the models prediction: 0.8986

Trying model with 50 estimators...
Coefficent of determination R^2 of the models prediction: 0.9059

Trying model with 60 estimators...
Coefficent of determination R^2 of the models prediction: 0.9056

Trying model with 70 estimators...
Coefficent of determination R^2 of the models prediction: 0.9093

Trying model with 80 estimators...
Coefficent of determination R^2 of the models prediction: 0.9060

Trying model with 90 estimators...
Coefficent of determination R^2 of the models prediction: 0.9089

