In [1]:
!pip3 install tune-sklearn "ray[tune]"
!pip3 install cloudpickle imbalanced-learn

You should consider upgrading via the '/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/bin/python3 -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/venvs/sustainbench/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import sys
import os
import math
sys.path.append('/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction')

from utils.get_data_loader import SustainBenchTextDataset
from sklearn.linear_model import Ridge, LogisticRegression
import numpy as np
from tune_sklearn import TuneGridSearchCV
import time # Just to compare fit times
from sklearn.metrics import r2_score, classification_report

In [3]:
classification_cutoff_dict = {'asset_index': 0, 'sanitation_index': 3, 'water_index': 3, 'women_edu': 5}
TARGETS = ['asset_index', 'sanitation_index', 'water_index', 'women_edu']
FEATURE_TYPES = ['target_sentence', 'document', 'target_sentence_document']

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.exceptions import ConvergenceWarning
from imblearn.over_sampling import SMOTE

def evaluate_hyperparameter_optimization_regression(X_train, y_train, X_test, y_test):
    clf = Ridge()
    
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()

    y_pred = clf.predict(X_test)

    print(f'baseline fit time: {end - start}, score: {round(r2_score(y_test, y_pred), 5)}')

    
    parameters = {
       'alpha': np.logspace(-2,1,20)
    }
    # n_jobs=-1 enables use of all cores like Tune does
    sklearn_search = GridSearchCV(
       Ridge(),
       parameters,
       n_jobs=-1
    )

    start = time.time()
    sklearn_search.fit(X_train, y_train)
    end = time.time()
    y_pred = sklearn_search.predict(X_test)
    
    print(f'sklearn GridSearchCV fit time: {end - start}, score: {round(r2_score(y_test, y_pred), 5)}')
    print()

    
def evaluate_hyperparameter_optimization_classification(X_train, y_train, X_test, y_test, use_smote=True):

    if use_smote:
        oversample = SMOTE()
        X_train, y_train = oversample.fit_resample(X_train, y_train)

    clf = LogisticRegression()
    
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()

    y_pred = clf.predict(X_test)

    print(f'baseline fit time: {end - start}')
    print(classification_report(y_test, y_pred))
    
    pipe = Pipeline(steps=[("scaler", MaxAbsScaler()), ("logistic", LogisticRegression(max_iter=500))])
    parameters = {
        "logistic__C": np.logspace(0,2,10)
    }
    
    # n_jobs=-1 enables use of all cores like Tune does
    sklearn_search = GridSearchCV(
       pipe,
       parameters,
       n_jobs=-1
    )

    start = time.time()
    sklearn_search.fit(X_train, y_train)
    end = time.time()
    y_pred = sklearn_search.predict(X_test)
    
    print(f'sklearn GridSearchCV fit time: {end - start}')
    print(classification_report(y_test, y_pred))

    print()


In [5]:
for target in TARGETS:
    for feature_type in FEATURE_TYPES:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature_type=feature_type, 
            target=target,
            model_type='regression',
            classification_cutoff=classification_cutoff_dict[target]
        )

        print(target, feature_type)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(X_train.shape, X_test.shape)
        
        evaluate_hyperparameter_optimization_regression(X_train, y_train, X_test, y_test) 

asset_index target_sentence
(1944, 384) (390, 384)
baseline fit time: 0.017325162887573242, score: 0.14537
sklearn GridSearchCV fit time: 1.3264899253845215, score: 0.11894

asset_index document
(25200, 300) (7821, 300)
baseline fit time: 0.06153297424316406, score: -0.09273
sklearn GridSearchCV fit time: 2.04484486579895, score: -0.09275

asset_index target_sentence_document
(1944, 684) (390, 684)
baseline fit time: 0.0319671630859375, score: 0.16005
sklearn GridSearchCV fit time: 1.5049638748168945, score: 0.15871

sanitation_index target_sentence
(2452, 384) (597, 384)
baseline fit time: 0.015386104583740234, score: -0.11944
sklearn GridSearchCV fit time: 0.6899731159210205, score: -0.15576

sanitation_index document
(25213, 300) (9496, 300)
baseline fit time: 0.06171083450317383, score: -0.13138
sklearn GridSearchCV fit time: 2.1331188678741455, score: -0.13139

sanitation_index target_sentence_document
(2452, 684) (597, 684)
baseline fit time: 0.042877912521362305, score: 0.03846


In [6]:
for target in TARGETS:
    for feature_type in FEATURE_TYPES:
        ds = SustainBenchTextDataset(
            data_dir=f'/Users/caravanuden/git-repos/Multimodal-deep-learning-for-poverty-prediction/data/', 
            feature_type=feature_type, 
            target=target,
            model_type='classification',
            classification_cutoff=classification_cutoff_dict[target]
        )

        print(target, feature_type)
        X_train, y_train = ds.get_data('train')
        X_test, y_test = ds.get_data('test')
        print(X_train.shape, X_test.shape)

        evaluate_hyperparameter_optimization_classification(X_train, y_train, X_test, y_test) 

asset_index target_sentence
(1944, 384) (390, 384)
baseline fit time: 0.05393409729003906
              precision    recall  f1-score   support

           0       0.74      0.46      0.56       235
           1       0.48      0.76      0.59       155

    accuracy                           0.58       390
   macro avg       0.61      0.61      0.58       390
weighted avg       0.64      0.58      0.57       390



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

sklearn GridSearchCV fit time: 5.8797290325164795
              precision    recall  f1-score   support

           0       0.69      0.52      0.59       235
           1       0.47      0.65      0.55       155

    accuracy                           0.57       390
   macro avg       0.58      0.59      0.57       390
weighted avg       0.61      0.57      0.58       390


asset_index document
(25200, 300) (7821, 300)
baseline fit time: 0.47937798500061035
              precision    recall  f1-score   support

           0       0.74      0.29      0.42      4742
           1       0.44      0.85      0.58      3079

    accuracy                           0.51      7821
   macro avg       0.59      0.57      0.50      7821
weighted avg       0.62      0.51      0.48      7821

sklearn GridSearchCV fit time: 10.936953067779541
              precision    recall  f1-score   support

           0       0.74      0.29      0.42      4742
           1       0.44      0.85      0.58      30

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


sklearn GridSearchCV fit time: 8.586397886276245
              precision    recall  f1-score   support

           0       0.61      0.59      0.60       235
           1       0.41      0.43      0.42       155

    accuracy                           0.53       390
   macro avg       0.51      0.51      0.51       390
weighted avg       0.53      0.53      0.53       390


sanitation_index target_sentence
(2452, 384) (597, 384)
baseline fit time: 0.0653529167175293
              precision    recall  f1-score   support

           0       0.53      0.49      0.51       275
           1       0.59      0.63      0.61       322

    accuracy                           0.56       597
   macro avg       0.56      0.56      0.56       597
weighted avg       0.56      0.56      0.56       597



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

sklearn GridSearchCV fit time: 7.579280853271484
              precision    recall  f1-score   support

           0       0.47      0.58      0.52       275
           1       0.55      0.44      0.49       322

    accuracy                           0.51       597
   macro avg       0.51      0.51      0.51       597
weighted avg       0.52      0.51      0.51       597


sanitation_index document
(25213, 300) (9496, 300)
baseline fit time: 0.49903368949890137
              precision    recall  f1-score   support

           0       0.58      0.30      0.39      4818
           1       0.52      0.78      0.62      4678

    accuracy                           0.53      9496
   macro avg       0.55      0.54      0.51      9496
weighted avg       0.55      0.53      0.51      9496

sklearn GridSearchCV fit time: 9.086345911026001
              precision    recall  f1-score   support

           0       0.58      0.30      0.39      4818
           1       0.52      0.78      0.62     

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


baseline fit time: 0.23687982559204102
              precision    recall  f1-score   support

           0       0.58      0.45      0.50       275
           1       0.60      0.72      0.66       322

    accuracy                           0.59       597
   macro avg       0.59      0.58      0.58       597
weighted avg       0.59      0.59      0.59       597



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


sklearn GridSearchCV fit time: 13.925727128982544
              precision    recall  f1-score   support

           0       0.53      0.53      0.53       275
           1       0.60      0.61      0.60       322

    accuracy                           0.57       597
   macro avg       0.57      0.57      0.57       597
weighted avg       0.57      0.57      0.57       597


water_index target_sentence
(2513, 384) (528, 384)
baseline fit time: 0.06635689735412598
              precision    recall  f1-score   support

           0       0.74      0.63      0.68       384
           1       0.30      0.42      0.35       144

    accuracy                           0.57       528
   macro avg       0.52      0.53      0.52       528
weighted avg       0.62      0.57      0.59       528



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


sklearn GridSearchCV fit time: 9.566956043243408
              precision    recall  f1-score   support

           0       0.72      0.72      0.72       384
           1       0.25      0.25      0.25       144

    accuracy                           0.59       528
   macro avg       0.49      0.49      0.49       528
weighted avg       0.59      0.59      0.59       528


water_index document
(26660, 300) (7821, 300)
baseline fit time: 0.762444019317627
              precision    recall  f1-score   support

           0       0.86      0.34      0.49      6589
           1       0.16      0.69      0.27      1232

    accuracy                           0.40      7821
   macro avg       0.51      0.52      0.38      7821
weighted avg       0.75      0.40      0.45      7821

sklearn GridSearchCV fit time: 15.611576795578003
              precision    recall  f1-score   support

           0       0.86      0.34      0.49      6589
           1       0.16      0.70      0.27      1232


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


baseline fit time: 0.2470536231994629
              precision    recall  f1-score   support

           0       0.73      0.62      0.67       384
           1       0.28      0.40      0.33       144

    accuracy                           0.56       528
   macro avg       0.51      0.51      0.50       528
weighted avg       0.61      0.56      0.58       528



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

sklearn GridSearchCV fit time: 14.733853340148926
              precision    recall  f1-score   support

           0       0.74      0.75      0.75       384
           1       0.32      0.31      0.31       144

    accuracy                           0.63       528
   macro avg       0.53      0.53      0.53       528
weighted avg       0.63      0.63      0.63       528


women_edu target_sentence
(4136, 384) (1096, 384)
baseline fit time: 0.18065595626831055
              precision    recall  f1-score   support

           0       0.62      0.55      0.58       506
           1       0.65      0.71      0.68       590

    accuracy                           0.64      1096
   macro avg       0.64      0.63      0.63      1096
weighted avg       0.64      0.64      0.64      1096



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

sklearn GridSearchCV fit time: 16.330770015716553
              precision    recall  f1-score   support

           0       0.56      0.61      0.58       506
           1       0.64      0.58      0.61       590

    accuracy                           0.60      1096
   macro avg       0.60      0.60      0.60      1096
weighted avg       0.60      0.60      0.60      1096


women_edu document
(51295, 300) (13139, 300)
baseline fit time: 1.1526851654052734
              precision    recall  f1-score   support

           0       0.48      0.46      0.47      5498
           1       0.62      0.65      0.64      7641

    accuracy                           0.57     13139
   macro avg       0.55      0.55      0.55     13139
weighted avg       0.56      0.57      0.57     13139

sklearn GridSearchCV fit time: 30.702750205993652
              precision    recall  f1-score   support

           0       0.48      0.46      0.47      5498
           1       0.62      0.65      0.64      7641

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


baseline fit time: 0.38657569885253906
              precision    recall  f1-score   support

           0       0.55      0.59      0.57       506
           1       0.63      0.59      0.61       590

    accuracy                           0.59      1096
   macro avg       0.59      0.59      0.59      1096
weighted avg       0.59      0.59      0.59      1096



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

sklearn GridSearchCV fit time: 26.121206045150757
              precision    recall  f1-score   support

           0       0.54      0.69      0.61       506
           1       0.65      0.49      0.56       590

    accuracy                           0.59      1096
   macro avg       0.60      0.59      0.58      1096
weighted avg       0.60      0.59      0.58      1096


