## Part 1: Preprocessing of the Database
Since the provided dataset is not "ready-to-use", it needs to be handled so it could be used further safely. 

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)


df_or = pd.read_excel('Cities.xls', index_col=0, skipinitialspace=True) # Read with excel index. 
# Skip all white-spaces.

df = df_or.set_index('cityID') # Remove the indices and use cityID as index.

# Drop unnecessary columns (that explicitly imply each other)
# cityID implies City, and Country
# clusterID implies Typology
# Pollution Index is restricted to be used in prediction.
col2drop = ['Country', 'City', 'Typology', 'Pollution Index ']


df.drop(columns=col2drop, inplace=True)# Typology <=> clusterID, City <=> cityID, 

In [2]:
df = df.replace(r'^\s*$', np.NaN, regex=True)

n_NaN = df.isna().sum().sum()
print(n_NaN) # As you can see the total number of NaNs in the database 3199, so we need to impute.

# Impute all NaNs over the database.

df_int = df.select_dtypes(include='integer')
df_float = df.select_dtypes(include='float')

df_int = df_int.fillna(df.mode().iloc[0]) # Try using mean/mod for different columns.
df_float = df_float.fillna(df.mean().iloc[0]) # Try using mean/mod for different columns.

df[df_int.columns.values.tolist()] = df_int # Use mod for integers,
df[df_float.columns.values.tolist()] = df_float # Use mean for floats


3040


In [3]:
# Normalize the database:

# df_train = df.iloc[0:tr_slice_ind]
# df_test = df.iloc[tr_slice_ind::]



# tr_mean = tr_inputs.mean() # min/max normalization
# tr_min_ = tr_inputs.min()
# tr_max_ = tr_inputs.max()

# tr_inputs = (tr_inputs-tr_mean)/(tr_max_-tr_min_)

df_targets = df['CO2 Emissions per Capita (metric tonnes)']
df_inputs = df.drop(columns=['CO2 Emissions per Capita (metric tonnes)'])

df_in_mean = df_inputs.mean()
df_in_min = df_inputs.min()
df_in_max = df_inputs.max()


df_in = (df_inputs - df_in_mean)/(df_in_max - df_in_min)

train_perc = 0.75
tr_slice_ind = int(len(df)*0.75)+1

tr_target = df_targets[0:tr_slice_ind]
tr_inputs = df_in.iloc[0:tr_slice_ind]

ts_target = df_targets[tr_slice_ind::]
ts_inputs = df_in.iloc[tr_slice_ind::]

In [4]:
from sklearn.decomposition import PCA

pca = PCA(n_components=10)
comps = pca.fit_transform(tr_inputs)

p_df = pd.DataFrame(comps)

len(pca.components_)

10

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score


# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 100, num = 20)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [3, 5, 8]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]


regr = RandomForestRegressor()

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

regr = RandomizedSearchCV(estimator = regr, 
                               param_distributions = random_grid, 
                               n_iter = 100, cv = 3, verbose=2, 



from sklearn.neural_network import MLPRegressor

clf = MLPRegressor(hidden_layer_sizes=(8, 32, 16), 
                   activation='relu',
                   solver = 'adam',
                   alpha = 1e-1,
                   learning_rate='adaptive',
                   learning_rate_init=2.25e-5,
                   max_iter=35000,
                   random_state = 42,
                   shuffle=False).fit(p_df, tr_target)

pca_test = pca.transform(ts_inputs)
p_test = pd.DataFrame(pca_test)

est_out = clf.predict(p_test)

R_2 = r2_score(ts_target, est_out)

R_2

0.6041078218919904

In [6]:
clf.get_params()

{'activation': 'relu',
 'alpha': 0.1,
 'batch_size': 'auto',
 'beta_1': 0.9,
 'beta_2': 0.999,
 'early_stopping': False,
 'epsilon': 1e-08,
 'hidden_layer_sizes': (8, 32, 16),
 'learning_rate': 'adaptive',
 'learning_rate_init': 2.25e-05,
 'max_fun': 15000,
 'max_iter': 35000,
 'momentum': 0.9,
 'n_iter_no_change': 10,
 'nesterovs_momentum': True,
 'power_t': 0.5,
 'random_state': 42,
 'shuffle': False,
 'solver': 'adam',
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': False,
 'warm_start': False}