In [43]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import utils

DATA_PATH = "../data/final_pca_dataset.csv"
LABEL_PATH = "../data/sustainability_w_regions.csv"
LABEL_COLS = ["Country Name", "Year", "region", "sub-region"]

In [8]:
data = pd.read_csv(DATA_PATH)
labels = pd.read_csv(LABEL_PATH, usecols=LABEL_COLS)
labels = (
    labels.sort_values(by=["Country Name", "Year"])
    .drop_duplicates(subset="Country Name", keep="last")
    .reset_index(drop=True)
)

In [9]:
data.head(3)

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA17,PCA18,PCA19,PCA20,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26
0,-0.643805,2.191136,0.321808,-1.441855,0.17122,0.149858,-0.471543,-0.206151,-0.060449,0.342254,...,-0.186154,0.606148,-0.352419,0.535856,-0.562325,-0.357243,0.17139,0.092316,0.249222,-0.075066
1,-1.299059,-1.662512,2.966026,-1.47772,1.362689,-0.444011,0.486824,-0.045,-0.117723,-1.032082,...,0.289182,-1.16164,0.132648,0.682834,-0.442228,-0.555521,0.166413,0.002465,-0.102565,-0.001728
2,4.710351,-1.789952,-1.155978,0.597963,2.05544,-1.476613,0.893746,-0.615902,-0.899434,0.717401,...,-0.320642,-0.087188,0.267852,0.364075,-0.853258,-0.222514,-0.51928,-0.291666,-0.978907,0.64445


In [10]:
labels.head(3)

Unnamed: 0,Country Name,Year,region,sub-region
0,Albania,2018,Europe,Southern Europe
1,Algeria,2018,Africa,Northern Africa
2,Angola,2018,Africa,Sub-Saharan Africa


Encode region names as integers

In [33]:
labels['region'].unique()

array(['Europe', 'Africa', 'Americas', 'Asia', 'Oceania'], dtype=object)

In [34]:
labels["sub-region"].unique()

array(['Southern Europe', 'Northern Africa', 'Sub-Saharan Africa',
       'Latin America and the Caribbean', 'Western Asia',
       'Australia and New Zealand', 'Western Europe', 'Southern Asia',
       'Eastern Europe', 'South-eastern Asia', 'Northern America',
       'Eastern Asia', 'Northern Europe', 'Melanesia', 'Central Asia',
       'Polynesia'], dtype=object)

In [59]:
TARGETS = ["region"]
TARGET_MAP = {
    "region": {"Europe": 1, "Africa": 2, "Americas": 3, "Asia": 4, "Oceania": 5},
    "sub-region": {
        "Southern Europe": 1,
        "Northern Africa": 2,
        "Sub-Saharan Africa": 3,
        "Latin America and the Caribbean": 4,
        "Western Asia": 5,
        "Southern Asia": 6,
        "Eastern Europe": 7,
        "South-eastern Asia": 8,
        "Northern America": 9,
        "Eastern Asia": 10,
        "Northern Europe": 11,
        "Melanesia": 12,
        "Central Asia": 13,
        "Polynesia": 14,
        "Australia and New Zealand": 15,
        "Western Europe": 16,
    },
}

for target in TARGETS:
    X = data
    y = labels[target].replace(TARGET_MAP[target])
    y = pd.get_dummies(y, columns=target, drop_first=True)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    predictions = linear_model.predict(X_test)
    # converted_pred = pd.DataFrame(predictions).replace(
    #    {v: k for k, v in TARGET_MAP[target].items()}
    # )
    predicted_countries = predictions.argmax(axis=1)

    linear_mse = mean_squared_error(y_test, predictions)
    print(f"Linear Regression MSE: {linear_mse}")
    print(f"R^2 {1 - (linear_mse / np.var(y))}")
    # print(utils.viz_regression(y_test, linear_predictions, y_test))

Linear Regression MSE: 0.11534232198649047
R^2 2    0.417075
3    0.269556
4    0.417075
5   -2.445190
dtype: float64


  return var(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [5]:
data = data.merge(labels, how='outer', left_index=True, right_index=True)

In [6]:
data.head(3)

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA8,PCA9,PCA10,...,PCA21,PCA22,PCA23,PCA24,PCA25,PCA26,Country Name,Year,region,sub-region
0,-0.643805,2.191136,0.321808,-1.441855,0.17122,0.149858,-0.471543,-0.206151,-0.060449,0.342254,...,-0.562325,-0.357243,0.17139,0.092316,0.249222,-0.075066,Albania,2018,Europe,Southern Europe
1,-1.299059,-1.662512,2.966026,-1.47772,1.362689,-0.444011,0.486824,-0.045,-0.117723,-1.032082,...,-0.442228,-0.555521,0.166413,0.002465,-0.102565,-0.001728,Algeria,2018,Africa,Northern Africa
2,4.710351,-1.789952,-1.155978,0.597963,2.05544,-1.476613,0.893746,-0.615902,-0.899434,0.717401,...,-0.853258,-0.222514,-0.51928,-0.291666,-0.978907,0.64445,Angola,2018,Africa,Sub-Saharan Africa
