In [1]:
!pip3 install tune-sklearn "ray[tune]"
!pip3 install cloudpickle imbalanced-learn scikit-optimize

You should consider upgrading via the '/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/bin/python3 -m pip install --upgrade pip' command.[0m


You should consider upgrading via the '/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import sys
import os
import math
sys.path.append('/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction')

from utils.get_data_loader import SustainBenchTextDataset
from sklearn.linear_model import Ridge, LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import time # Just to compare fit times
from sklearn.metrics import r2_score, roc_auc_score, classification_report

In [3]:
classification_cutoff_dict = {'asset_index': 0, 'sanitation_index': 3, 'water_index': 3, 'women_edu': 5}
TARGETS = ['asset_index', 'sanitation_index', 'water_index', 'women_edu']
FEATURE_TYPES = ['target_sentence', 'document', 'target_sentence_document']

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC


def evaluate_hyperparameter_optimization_regression(X_train, y_train, X_test, y_test):
    clf = Ridge()
    
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()

    y_pred = clf.predict(X_test)

    print(f'baseline fit time: {end - start}, r^2: {round(r2_score(y_test, y_pred), 5)}')

    
    parameters = {
       'alpha': np.logspace(-2,2,50)
    }
    # n_jobs=-1 enables use of all cores like Tune does
    sklearn_search = GridSearchCV(
       Ridge(),
       parameters,
       n_jobs=-1
    )

    start = time.time()
    sklearn_search.fit(X_train, y_train)
    end = time.time()
    y_pred = sklearn_search.predict(X_test)
    
    print(f'sklearn GridSearchCV fit time: {end - start}, r^2: {round(r2_score(y_test, y_pred), 5)}')
    print(sklearn_search.best_estimator_)
    print()

    
def evaluate_hyperparameter_optimization_classification(X_train, y_train, X_test, y_test, use_smote=True):
    if use_smote:
        oversample = SMOTE()
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    for model in ['svm', 'lr', 'rf']:
        print(model)
        if model == 'svm':
            base_estimator = Pipeline(steps=[("scaler", MaxAbsScaler()), ("svm", SVC(gamma='scale'))])
            param_grid = {'svm__kernel': ('linear', 'rbf'),'svm__C': np.logspace(-1,2,20)}
            sklearn_search = HalvingGridSearchCV(
                base_estimator, 
                param_grid, 
                cv=3,
                factor=2,
                max_resources=100,
                scoring='accuracy',
                error_score=0
            )
        elif model == 'lr':
            base_estimator = Pipeline(steps=[("scaler", MaxAbsScaler()), ("lr", LogisticRegression(max_iter=500))])
            param_grid = {'lr__C': np.logspace(-1,2,20)}
            sklearn_search = HalvingGridSearchCV(
                base_estimator, 
                param_grid, 
                cv=3,
                factor=2,
                max_resources=100,
                scoring='accuracy',
                error_score=0
            )
        else:
            base_estimator = RandomForestClassifier(random_state=0)
            param_grid = {'max_depth': [3, 5, 10], 'min_samples_split': [2, 5, 10]}
            sklearn_search = HalvingGridSearchCV(
                base_estimator, 
                param_grid, 
                cv=3,
                factor=2,
                resource='n_estimators',
                max_resources=100,
                scoring='accuracy',
                error_score=0
            )

        start = time.time()
        sklearn_search.fit(X_train, y_train)
        end = time.time()
        y_pred = sklearn_search.predict(X_test)

        print(f'sklearn HalvingGridSearchCV fit time: {end - start}, roc auc: {round(roc_auc_score(y_test, y_pred), 5)}')
        print(sklearn_search.best_estimator_)
        print(classification_report(y_test, y_pred))

        print()



In [5]:
for target in TARGETS:
    for feature_type in [
        'target_sentence', 
#         'document', 
        'target_sentence_document'
    ]:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature_type=feature_type, 
            target=target,
            model_type='classification',
            classification_cutoff=classification_cutoff_dict[target]
        )

        print(target, feature_type)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(X_train.shape, X_test.shape)

        evaluate_hyperparameter_optimization_classification(X_train, y_train, X_test, y_test) 

asset_index target_sentence
(2017, 384) (401, 384)
svm
sklearn HalvingGridSearchCV fit time: 0.8768939971923828, roc auc: 0.73741
Pipeline(steps=[('scaler', MaxAbsScaler()), ('svm', SVC(C=1.8329807108324356))])
              precision    recall  f1-score   support

           0       0.82      0.70      0.75       237
           1       0.64      0.77      0.70       164

    accuracy                           0.73       401
   macro avg       0.73      0.74      0.73       401
weighted avg       0.75      0.73      0.73       401


lr
sklearn HalvingGridSearchCV fit time: 0.8643078804016113, roc auc: 0.6603
Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr', LogisticRegression(C=33.59818286283781, max_iter=500))])
              precision    recall  f1-score   support

           0       0.81      0.49      0.61       237
           1       0.53      0.84      0.65       164

    accuracy                           0.63       401
   macro avg       0.67      0.66      0.6

sklearn HalvingGridSearchCV fit time: 0.9061110019683838, roc auc: 0.62241
Pipeline(steps=[('scaler', MaxAbsScaler()),
                ('lr',
                 LogisticRegression(C=0.14384498882876628, max_iter=500))])
              precision    recall  f1-score   support

           0       0.83      0.58      0.68       467
           1       0.36      0.67      0.47       165

    accuracy                           0.60       632
   macro avg       0.59      0.62      0.57       632
weighted avg       0.71      0.60      0.63       632


rf
sklearn HalvingGridSearchCV fit time: 68.34827589988708, roc auc: 0.54021
RandomForestClassifier(max_depth=10, min_samples_split=5, n_estimators=96,
                       random_state=0)
              precision    recall  f1-score   support

           0       0.76      0.85      0.80       467
           1       0.35      0.23      0.28       165

    accuracy                           0.69       632
   macro avg       0.55      0.54      0.54  

In [21]:
for target in TARGETS:
    for feature_type in [
        'target_sentence', 
#         'document', 
        'target_sentence_document'
    ]:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature_type=feature_type, 
            target=target,
            model_type='regression',
            classification_cutoff=classification_cutoff_dict[target]
        )

        print(target, feature_type)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(X_train.shape, X_test.shape)
        
        evaluate_hyperparameter_optimization_regression(X_train, y_train, X_test, y_test) 

asset_index target_sentence
(2017, 384) (401, 384)
baseline fit time: 0.025072097778320312, r^2: 0.39316
sklearn GridSearchCV fit time: 1.3284289836883545, r^2: 0.3891
Ridge(alpha=1.0985411419875584)

asset_index target_sentence_document
(2017, 684) (401, 684)
baseline fit time: 0.036002159118652344, r^2: 0.33216
sklearn GridSearchCV fit time: 3.220874071121216, r^2: 0.13692
Ridge(alpha=100.0)

sanitation_index target_sentence
(2619, 384) (620, 384)
baseline fit time: 0.01630687713623047, r^2: 0.21119
sklearn GridSearchCV fit time: 1.4730379581451416, r^2: 0.20939
Ridge(alpha=1.0985411419875584)

sanitation_index target_sentence_document
(2619, 684) (620, 684)
baseline fit time: 0.04382205009460449, r^2: 0.17058
sklearn GridSearchCV fit time: 4.094425916671753, r^2: 0.15565
Ridge(alpha=1.5998587196060574)

water_index target_sentence
(3214, 384) (632, 384)
baseline fit time: 0.020295143127441406, r^2: 0.09664
sklearn GridSearchCV fit time: 1.7417972087860107, r^2: 0.10047
Ridge(alpha=1