# Effect of undersampling on logistic regression

In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from src.models.ordinal_regression import OrdinalClassifier
from src.utils.embeddings import retrieve_all_embeds
from src.utils.files import load_dfs

In [2]:
# Base logistic-regression estimator, seeded for reproducibility. The
# resampling cells below pass it to fit_test_OR, which wraps it in an
# OrdinalClassifier before fitting.
bin_clf = LogisticRegression(random_state=0, solver="lbfgs")

In [3]:
# Cleaned train / dev tables; the "Overall_sentiment" column of each is used
# as the target in the cells below.
df_train, df_dev = load_dfs(
    [
        "data/train_cleaned_final.csv",
        "data/dev_cleaned_final.csv",
    ]
)

# Pre-computed feature files, one tuple for the train split and one for the
# dev split. The result is iterated below with .items(), each value unpacking
# into a (train features, dev features) pair.
# NOTE(review): "use" / "xception" presumably name the text and image
# encoders respectively -- confirm against retrieve_all_embeds.
embed = retrieve_all_embeds(
    [
        ("data/features/use.pkl.train", "data/features/xception.pkl.train"),
        ("data/features/use.pkl.dev", "data/features/xception.pkl.dev"),
    ]
)

In [4]:
# Training target: the categorical "Overall_sentiment" column expressed as
# its integer category codes (requires the column to have a categorical dtype).
y = df_train["Overall_sentiment"].cat.codes

In [5]:
# Dev target, encoded the same way as the training target.
# NOTE(review): these codes come from df_dev's own category list; they only
# line up with the training codes if both frames share identical categories
# in the same order -- confirm in load_dfs.
y_dev = df_dev["Overall_sentiment"].cat.codes

In [6]:
# Category names of the training sentiment column, in category-code order.
# Not referenced by the cells visible in this notebook.
y_labels = list(df_train["Overall_sentiment"].cat.categories.values)

In [7]:
# Distinct class codes present in the training target. They are passed as
# `labels` to classification_report, so they fix the row order of every
# report; sort them explicitly because set iteration order is not guaranteed.
y_codes = sorted(set(y.values))

In [8]:
def compute_ordinal_clf(clf, X, Y):
    """Wrap ``clf`` in an ``OrdinalClassifier`` and fit it.

    Parameters
    ----------
    clf
        Base estimator handed to the ``OrdinalClassifier`` constructor.
    X
        Training features, forwarded unchanged to ``fit``.
    Y
        Training targets, forwarded unchanged to ``fit``.

    Returns
    -------
    OrdinalClassifier
        The wrapper instance after ``fit(X, Y)`` has been called on it.
    """
    fitted = OrdinalClassifier(clf)
    # Keep the reference to the wrapper itself instead of relying on
    # whatever fit() returns.
    fitted.fit(X, Y)
    return fitted

In [9]:
def test_ordinal_clf(clf, X, Y, labels):
    """Evaluate a fitted classifier on ``(X, Y)``.

    Parameters
    ----------
    clf
        Fitted classifier exposing ``predict_proba`` and ``predict``.
    X
        Features to predict on.
    Y
        True targets for ``X``.
    labels
        Label values forwarded to ``classification_report``.

    Returns
    -------
    dict
        ``"pred_cls"``: output of ``clf.predict(X)``;
        ``"pred_proba"``: output of ``clf.predict_proba(X)``;
        ``"report"``: the text report from ``classification_report``.
    """
    probabilities = clf.predict_proba(X)
    predictions = clf.predict(X)
    return {
        "pred_cls": predictions,
        "pred_proba": probabilities,
        "report": classification_report(Y, predictions, labels=labels),
    }

In [10]:
def print_res(res):
    """Print the text reports stored in a results mapping.

    Parameters
    ----------
    res : dict
        Maps a heading to an iterable of result dicts. Each result dict
        holds its text report under ``"report"`` (the key written by
        ``test_ordinal_clf``); a ``"report_str"`` entry is still honoured
        and takes precedence when present.

    For every heading, the heading is printed on its own line followed by
    all of its reports on one ``print`` call (space separated).
    """
    for k, v in res.items():
        print(k)
        # Looking up only "report_str" raised KeyError on the dicts produced
        # by test_ordinal_clf, which store the report under "report".
        print(*[r["report_str"] if "report_str" in r else r["report"] for r in v])
        

In [11]:
def fit_test_OR(X_train, y_train, X_test, y_test, labels, clf):
    """Fit an ordinal classifier on the training data and evaluate it.

    Parameters
    ----------
    X_train, y_train
        Data passed to ``compute_ordinal_clf`` for fitting.
    X_test, y_test
        Data passed to ``test_ordinal_clf`` for evaluation.
    labels
        Label values forwarded to ``test_ordinal_clf``.
    clf
        Base estimator forwarded to ``compute_ordinal_clf``.

    Returns
    -------
    dict
        The result dict from ``test_ordinal_clf``; its ``"report"`` entry
        is also printed as a side effect.
    """
    fitted = compute_ordinal_clf(clf, X_train, y_train)
    outcome = test_ordinal_clf(fitted, X_test, y_test, labels)
    print(outcome["report"])
    return outcome

In [12]:
from imblearn.under_sampling import InstanceHardnessThreshold

# Instance-hardness undersampler driven by its own logistic regression
# (separate from bin_clf, with a much higher iteration cap).
iht = InstanceHardnessThreshold(
    random_state=0,
    estimator=LogisticRegression(solver='lbfgs', multi_class='auto', max_iter=4000),
)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = iht.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only
              precision    recall  f1-score   support

           0       0.07      0.34      0.12        80
           1       0.31      0.47      0.38       302
           2       0.58      0.17      0.26       618

    accuracy                           0.27      1000
   macro avg       0.32      0.33      0.25      1000
weighted avg       0.46      0.27      0.29      1000

text only
              precision    recall  f1-score   support

           0       0.12      0.30      0.17        80
           1       0.37      0.50      0.42       302
           2       0.67      0.42      0.52       618

    accuracy                           0.44      1000
   macro avg       0.39      0.41      0.37      1000
weighted avg       0.54      0.44      0.46      1000

concatenated
              precision    recall  f1-score   support

           0       0.09      0.24      0.13        80
           1       0.37      0.51      0.43       302
           2       0.65      0.39      0.

In [13]:
from imblearn.under_sampling import RandomUnderSampler

# Seeded random undersampler.
rus = RandomUnderSampler(random_state=0)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = rus.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only
              precision    recall  f1-score   support

           0       0.06      0.21      0.09        80
           1       0.31      0.55      0.40       302
           2       0.65      0.18      0.28       618

    accuracy                           0.29      1000
   macro avg       0.34      0.31      0.26      1000
weighted avg       0.50      0.29      0.30      1000

text only
              precision    recall  f1-score   support

           0       0.13      0.44      0.20        80
           1       0.35      0.36      0.35       302
           2       0.65      0.44      0.52       618

    accuracy                           0.41      1000
   macro avg       0.38      0.41      0.36      1000
weighted avg       0.52      0.41      0.45      1000

concatenated
              precision    recall  f1-score   support

           0       0.12      0.41      0.19        80
           1       0.34      0.39      0.36       302
           2       0.66      0.42      0.

In [14]:
from imblearn.under_sampling import NearMiss

# NearMiss undersampler, variant 1.
nm1 = NearMiss(version=1)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = nm1.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only
              precision    recall  f1-score   support

           0       0.09      0.86      0.16        80
           1       0.34      0.11      0.17       302
           2       0.64      0.10      0.18       618

    accuracy                           0.17      1000
   macro avg       0.35      0.36      0.17      1000
weighted avg       0.50      0.17      0.17      1000

text only
              precision    recall  f1-score   support

           0       0.09      0.85      0.16        80
           1       0.32      0.13      0.18       302
           2       0.70      0.11      0.19       618

    accuracy                           0.18      1000
   macro avg       0.37      0.36      0.18      1000
weighted avg       0.54      0.18      0.19      1000

concatenated
              precision    recall  f1-score   support

           0       0.09      0.81      0.16        80
           1       0.34      0.19      0.24       302
           2       0.67      0.14      0.

In [15]:
# NearMiss undersampler, variant 2 (NearMiss is imported in the previous cell).
nm2 = NearMiss(version=2)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = nm2.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only
              precision    recall  f1-score   support

           0       0.07      0.38      0.12        80
           1       0.32      0.23      0.27       302
           2       0.62      0.36      0.46       618

    accuracy                           0.32      1000
   macro avg       0.34      0.32      0.28      1000
weighted avg       0.49      0.32      0.37      1000

text only
              precision    recall  f1-score   support

           0       0.09      0.68      0.16        80
           1       0.36      0.29      0.32       302
           2       0.66      0.17      0.28       618

    accuracy                           0.25      1000
   macro avg       0.37      0.38      0.25      1000
weighted avg       0.52      0.25      0.28      1000

concatenated
              precision    recall  f1-score   support

           0       0.08      0.45      0.14        80
           1       0.34      0.42      0.38       302
           2       0.68      0.22      0.

In [16]:
# NearMiss undersampler, variant 3 (NearMiss is imported in an earlier cell).
nm3 = NearMiss(version=3)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = nm3.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only
              precision    recall  f1-score   support

           0       0.08      0.33      0.12        80
           1       0.37      0.19      0.25       302
           2       0.61      0.50      0.55       618

    accuracy                           0.39      1000
   macro avg       0.35      0.34      0.31      1000
weighted avg       0.50      0.39      0.42      1000

text only
              precision    recall  f1-score   support

           0       0.10      0.41      0.16        80
           1       0.37      0.45      0.41       302
           2       0.66      0.32      0.43       618

    accuracy                           0.37      1000
   macro avg       0.38      0.40      0.33      1000
weighted avg       0.53      0.37      0.41      1000

concatenated
              precision    recall  f1-score   support

           0       0.12      0.50      0.19        80
           1       0.36      0.38      0.37       302
           2       0.64      0.35      0.

In [17]:
from imblearn.under_sampling import EditedNearestNeighbours

# Edited-nearest-neighbours undersampler with library defaults.
enn = EditedNearestNeighbours()

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = enn.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.06      0.06      0.06        80
           1       0.00      0.00      0.00       302
           2       0.62      0.91      0.73       618

    accuracy                           0.57      1000
   macro avg       0.22      0.32      0.27      1000
weighted avg       0.39      0.57      0.46      1000

text only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.12      0.29      0.17        80
           1       0.00      0.00      0.00       302
           2       0.64      0.83      0.72       618

    accuracy                           0.53      1000
   macro avg       0.25      0.37      0.30      1000
weighted avg       0.40      0.53      0.46      1000

concatenated
              precision    recall  f1-score   support

           0       0.12      0.28      0.17        80
           1       0.00      0.00      0.00       302
           2       0.62      0.82      0.70       618

    accuracy                           0.53      1000
   macro avg       0.25      0.36      0.29      1000
weighted avg       0.39      0.53      0.45      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
from imblearn.under_sampling import RepeatedEditedNearestNeighbours

# Repeated edited-nearest-neighbours undersampler with library defaults.
renn = RepeatedEditedNearestNeighbours()

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = renn.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.06      0.06      0.06        80
           1       0.00      0.00      0.00       302
           2       0.62      0.91      0.73       618

    accuracy                           0.57      1000
   macro avg       0.22      0.32      0.27      1000
weighted avg       0.39      0.57      0.46      1000

text only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.12      0.29      0.17        80
           1       0.00      0.00      0.00       302
           2       0.64      0.83      0.72       618

    accuracy                           0.53      1000
   macro avg       0.25      0.37      0.30      1000
weighted avg       0.40      0.53      0.46      1000

concatenated
              precision    recall  f1-score   support

           0       0.12      0.28      0.17        80
           1       0.00      0.00      0.00       302
           2       0.62      0.82      0.70       618

    accuracy                           0.53      1000
   macro avg       0.25      0.36      0.29      1000
weighted avg       0.39      0.53      0.45      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from imblearn.under_sampling import AllKNN

# AllKNN undersampler with library defaults.
aknn = AllKNN()

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = aknn.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.61      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.61      0.47      1000

text only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.14      0.04      0.06        80
           1       0.00      0.00      0.00       302
           2       0.62      0.98      0.76       618

    accuracy                           0.61      1000
   macro avg       0.25      0.34      0.27      1000
weighted avg       0.39      0.61      0.47      1000

concatenated
              precision    recall  f1-score   support

           0       0.17      0.05      0.08        80
           1       0.00      0.00      0.00       302
           2       0.62      0.98      0.76       618

    accuracy                           0.61      1000
   macro avg       0.26      0.34      0.28      1000
weighted avg       0.40      0.61      0.48      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
from imblearn.under_sampling import OneSidedSelection

# OneSidedSelection accepts a random_state; pin it to 0, as the other seeded
# samplers in this notebook do, so that Restart & Run All reproduces the
# reports printed below this cell.
oss = OneSidedSelection(random_state=0)

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = oss.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       1.00      0.00      0.01       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.54      0.33      0.26      1000
weighted avg       0.68      0.62      0.47      1000

text only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.57      0.03      0.05       302
           2       0.62      0.99      0.76       618

    accuracy                           0.62      1000
   macro avg       0.40      0.34      0.27      1000
weighted avg       0.56      0.62      0.49      1000

concatenated
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.57      0.04      0.07       302
           2       0.62      0.99      0.76       618

    accuracy                           0.62      1000
   macro avg       0.40      0.34      0.28      1000
weighted avg       0.56      0.62      0.49      1000



  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Neighbourhood-cleaning-rule undersampler with library defaults.
ncr = NeighbourhoodCleaningRule()

# One run per feature set in `embed`: resample the training split only, then
# fit on the resampled data and report on the untouched dev split.
for item, (X_train, X_test) in embed.items():
    print(item)
    X_resampled, y_resampled = ncr.fit_resample(X_train, y)
    fit_test_OR(
        X_train=X_resampled,
        y_train=pd.Series(y_resampled),
        X_test=X_test,
        y_test=y_dev,
        labels=y_codes,
        clf=bin_clf,
    )

image only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.00      0.00      0.00       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.21      0.33      0.25      1000
weighted avg       0.38      0.62      0.47      1000

text only


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        80
           1       0.50      0.01      0.01       302
           2       0.62      1.00      0.77       618

    accuracy                           0.62      1000
   macro avg       0.37      0.34      0.26      1000
weighted avg       0.53      0.62      0.48      1000

concatenated
              precision    recall  f1-score   support

           0       1.00      0.01      0.02        80
           1       1.00      0.00      0.01       302
           2       0.62      1.00      0.76       618

    accuracy                           0.62      1000
   macro avg       0.87      0.34      0.27      1000
weighted avg       0.76      0.62      0.48      1000

