In [1]:
# !pip3 install tune-sklearn "ray[tune]"
# !pip3 install cloudpickle imbalanced-learn scikit-optimize

In [3]:
import sys
import os
import math

from imblearn.over_sampling import SMOTE
import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import r2_score, roc_auc_score, classification_report
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.svm import SVC
import time

sys.path.append('/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction')
from utils.get_data_loader import SustainBenchTextDataset

In [4]:
CLASSIFICATION_THRESHOLD_DICT = {'asset_index': 0, 'sanitation_index': 3, 'water_index': 3, 'women_edu': 5}
TARGETS = ['asset_index', 'sanitation_index', 'water_index', 'women_edu']
FEATURES = ['target_sentence', 'document', 'target_sentence_document']

### Regression: ridge regression, random forest regressor

In [4]:
def regression(X_train, y_train, X_test, y_test):   
    base_estimator = KernelRidge(kernel='rbf')
    param_grid = {'kernel': ['linear', 'rbf'], 'alpha': np.logspace(-2,3,50)}
    sklearn_search = HalvingGridSearchCV(
        base_estimator, 
        param_grid, 
        cv=3,
        factor=2,
        resource='n_samples',
        error_score=-np.inf
    )

    start = time.time()
    sklearn_search.fit(X_train, y_train)
    end = time.time()
    y_pred = sklearn_search.predict(X_test)

    print(f'sklearn HalvingGridSearchCV fit time: {round(end - start, 3)}s, r^2: {round(r2_score(y_test, y_pred), 3)}')
    print(f'best estimator: {sklearn_search.best_estimator_}\n')

In [None]:
for target in TARGETS:
    for feature in FEATURES:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature=feature, 
            target=target,
            model_type='regression',
            classification_threshold=CLASSIFICATION_THRESHOLD_DICT[target]
        )

        print(target, feature)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(f'train data shape: {X_train.shape}, test data shape: {X_test.shape}')
        
        regression(X_train, y_train, X_test, y_test)
    print()

asset_index target_sentence
train data shape: (2017, 384), test data shape: (401, 384)
sklearn HalvingGridSearchCV fit time: 1.774s, r^2: 0.393
best estimator: KernelRidge(alpha=0.5428675439323859)

asset_index document
train data shape: (25200, 300), test data shape: (7821, 300)
sklearn HalvingGridSearchCV fit time: 378.59s, r^2: -0.001
best estimator: KernelRidge(alpha=1000.0)

asset_index target_sentence_document
train data shape: (2017, 684), test data shape: (401, 684)
sklearn HalvingGridSearchCV fit time: 2.326s, r^2: 0.161
best estimator: KernelRidge(alpha=1000.0)


sanitation_index target_sentence
train data shape: (2619, 384), test data shape: (620, 384)
sklearn HalvingGridSearchCV fit time: 3.163s, r^2: 0.193
best estimator: KernelRidge(alpha=0.01, kernel='rbf')

sanitation_index document
train data shape: (25213, 300), test data shape: (9496, 300)
sklearn HalvingGridSearchCV fit time: 391.626s, r^2: -0.878
best estimator: KernelRidge(alpha=244.205309454865)

sanitation_index

  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coef = linalg.solve(K, y, sym_pos=True, overwrite_a=False)
  dual_coe

### Classification: SVM, logistic regression, random forest classifier

In [1]:
def classification(X_train, y_train, X_test, y_test):
    base_estimator = Pipeline(steps=[("scaler", MaxAbsScaler()), ("lr", LogisticRegression(max_iter=500))])
    param_grid = {'lr__C': np.logspace(-1,2,20)}
    sklearn_search = HalvingGridSearchCV(
        base_estimator, 
        param_grid, 
        cv=3,
        factor=2,
        max_resources=100,
        error_score=0
    )

    start = time.time()
    sklearn_search.fit(X_train, y_train)
    end = time.time()
    y_pred = sklearn_search.predict(X_test)

    print(f'sklearn HalvingGridSearchCV fit time: {round(end - start, 3)}s, roc auc: {round(roc_auc_score(y_test, y_pred), 3)}')
    print(f'best estimator: {sklearn_search.best_estimator_}')
    print(classification_report(y_test, y_pred))

    print()

In [5]:
for target in TARGETS:
    for feature in FEATURES:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature=feature, 
            target=target,
            model_type='classification',
            classification_threshold=CLASSIFICATION_THRESHOLD_DICT[target]
        )

        print(target, feature)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(f'train data shape: {X_train.shape}, test data shape: {X_test.shape}\n')

        classification(X_train, y_train, X_test, y_test) 
        print()

asset_index target_sentence
train data shape: (2017, 384), test data shape: (401, 384)

sklearn HalvingGridSearchCV fit time: 0.475s, roc auc: 0.742
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.14384498882876628, max_iter=500))])
              precision    recall  f1-score   support

           0       0.78      0.82      0.80       237
           1       0.72      0.66      0.69       164

    accuracy                           0.76       401
   macro avg       0.75      0.74      0.74       401
weighted avg       0.75      0.76      0.75       401



asset_index document
train data shape: (25200, 300), test data shape: (7821, 300)

sklearn HalvingGridSearchCV fit time: 0.832s, roc auc: 0.508
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.29763514416313175, max_iter=500))])
              precision    recall  f1-score   support

     

20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 0.633s, roc auc: 0.682
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.14384498882876628, max_iter=500))])
              precision    recall  f1-score   support

           0       0.63      0.78      0.70       296
           1       0.75      0.58      0.65       324

    accuracy                           0.68       620
   macro avg       0.69      0.68      0.68       620
weighted avg       0.69      0.68      0.67       620



water_index target_sentence
train data shape: (3214, 384), test data shape: (632, 384)



20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 0.476s, roc auc: 0.515
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.20691380811147897, max_iter=500))])
              precision    recall  f1-score   support

           0       0.74      0.96      0.84       467
           1       0.38      0.07      0.12       165

    accuracy                           0.73       632
   macro avg       0.56      0.51      0.48       632
weighted avg       0.65      0.73      0.65       632



water_index document
train data shape: (26660, 300), test data shape: (7821, 300)



40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 0.702s, roc auc: 0.499
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr', LogisticRegression(C=0.8858667904100825, max_iter=500))])
              precision    recall  f1-score   support

           0       0.84      0.98      0.90      6589
           1       0.15      0.02      0.04      1232

    accuracy                           0.83      7821
   macro avg       0.50      0.50      0.47      7821
weighted avg       0.73      0.83      0.77      7821



water_index target_sentence_document
train data shape: (3214, 684), test data shape: (632, 684)



40 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 0.542s, roc auc: 0.535
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.29763514416313175, max_iter=500))])
              precision    recall  f1-score   support

           0       0.75      0.93      0.83       467
           1       0.42      0.14      0.21       165

    accuracy                           0.72       632
   macro avg       0.59      0.54      0.52       632
weighted avg       0.67      0.72      0.67       632



women_edu target_sentence
train data shape: (4338, 384), test data shape: (1395, 384)



20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 0.461s, roc auc: 0.601
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr', LogisticRegression(C=0.1, max_iter=500))])
              precision    recall  f1-score   support

           0       0.45      0.94      0.61       545
           1       0.88      0.26      0.40       850

    accuracy                           0.53      1395
   macro avg       0.66      0.60      0.50      1395
weighted avg       0.71      0.53      0.48      1395



women_edu document
train data shape: (51295, 300), test data shape: (13139, 300)



20 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to 0.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/lib/python3.9/site-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/l

sklearn HalvingGridSearchCV fit time: 1.359s, roc auc: 0.504
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.29763514416313175, max_iter=500))])
              precision    recall  f1-score   support

           0       0.42      0.96      0.58      5498
           1       0.62      0.05      0.09      7641

    accuracy                           0.43     13139
   macro avg       0.52      0.50      0.34     13139
weighted avg       0.54      0.43      0.30     13139



women_edu target_sentence_document
train data shape: (4338, 684), test data shape: (1395, 684)

sklearn HalvingGridSearchCV fit time: 0.802s, roc auc: 0.554
best estimator: Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.14384498882876628, max_iter=500))])
              precision    recall  f1-score   support

           0       0.42      0.94      0.58       545
           1       0.81   

### Classification: basic feedforward network

In [6]:
import torch
import torch.nn as nn
import torchvision.datasets as dsets
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

In [7]:
class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Net, self).__init__()                    # Inherited from the parent class nn.Module
        self.fc1 = nn.Linear(input_size, hidden_size)  # 1st Full-Connected Layer: 784 (input data) -> 500 (hidden node)
        self.relu = nn.ReLU()                          # Non-Linear ReLU Layer: max(0,x)
        self.fc2 = nn.Linear(hidden_size, num_classes) # 2nd Full-Connected Layer: 500 (hidden node) -> 10 (output class)
    
    def forward(self, x):                              # Forward pass: stacking each layer together
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [8]:
def get_data_loader(feature, target):
    ds = SustainBenchTextDataset(
                data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
                feature=feature, 
                target=target,
                model_type='classification',
                classification_threshold=CLASSIFICATION_THRESHOLD_DICT[target],
                use_smote=False
            )

    X_train, y_train = ds.get_data('train')

    X_train = torch.Tensor(X_train)
    y_train = torch.from_numpy(y_train)

    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset)

    X_test, y_test = ds.get_data('test')

    X_test = torch.Tensor(X_test)
    y_test = torch.from_numpy(y_test)

    test_dataset = TensorDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset)
    
    return train_loader, test_loader

In [9]:
def train(train_loader, net, learning_rate, logging=False):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for i, (embeddings, labels) in enumerate(train_loader):
            embeddings = Variable(embeddings)
            labels = Variable(labels)

            optimizer.zero_grad()                  
            outputs = net(embeddings)                   
            loss = criterion(outputs, labels)        
            loss.backward()                            
            optimizer.step() 

            if logging and (i+1) % 100 == 0: 
                print(f'epoch: {epoch+1}/{num_epochs}, step: {int((i+1) / 100)}/{len(train_dataset)//batch_size}, loss: {loss.data}')

    return net

In [10]:
def evaluate(test_loader, net, time_taken):
    y_true = []
    y_pred = []
    for i, (embeddings, labels) in enumerate(test_loader):
        embeddings = Variable(embeddings)
        outputs = net(embeddings)
        _, predicted = torch.max(outputs.data, 1)
        y_true.append(labels.item())
        y_pred.append(predicted.item())

    print(f'basic neural network classifier fit time: {round(time_taken, 3)}s, roc auc: {round(roc_auc_score(y_true, y_pred), 3)}')
    print(classification_report(y_true, y_pred))

In [11]:
hidden_size = 100
num_classes = 2
num_epochs = 5
batch_size = 100
learning_rate = 0.001

In [12]:
FEATURE_INPUT_SIZE_DICT = {'target_sentence': 384, 'document': 300, 'target_sentence_document': 684}
for target in TARGETS:
    for feature in FEATURES:
        print(target, feature)
        input_size = FEATURE_INPUT_SIZE_DICT[feature]

        train_loader, test_loader = get_data_loader(feature, target)
        net = Net(input_size, hidden_size, num_classes)
        start = time.time()
        net = train(train_loader, net, learning_rate)
        end = time.time()
        evaluate(test_loader, net, end - start)
        print()

asset_index target_sentence
basic neural network classifier fit time: 5.431s, roc auc: 0.715
              precision    recall  f1-score   support

           0       0.90      0.51      0.65       237
           1       0.57      0.91      0.70       164

    accuracy                           0.68       401
   macro avg       0.73      0.71      0.68       401
weighted avg       0.76      0.68      0.67       401


asset_index document
basic neural network classifier fit time: 56.376s, roc auc: 0.525
              precision    recall  f1-score   support

           0       0.63      0.54      0.58      4742
           1       0.42      0.51      0.46      3079

    accuracy                           0.53      7821
   macro avg       0.52      0.53      0.52      7821
weighted avg       0.55      0.53      0.53      7821


asset_index target_sentence_document
basic neural network classifier fit time: 6.761s, roc auc: 0.661
              precision    recall  f1-score   support

       