In [1]:
import pandas as pd 

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

df = pd.read_csv("goemotions_1.csv")

In [2]:
df.columns

Index(['text', 'id', 'author', 'subreddit', 'link_id', 'parent_id',
       'created_utc', 'rater_id', 'example_very_unclear', 'admiration',
       'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion',
       'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust',
       'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy',
       'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief',
       'remorse', 'sadness', 'surprise', 'neutral'],
      dtype='object')

In [3]:
# Show the full comment text instead of truncating long strings in tables.
pd.set_option('display.max_colwidth', None)

# Peek at two rows labelled 0 for `excitement`. The fixed seed keeps the
# displayed rows stable across Restart & Run All.
(df[['text', 'excitement']]
 .loc[lambda d: d['excitement'] == 0]
 .sample(2, random_state=42))

Unnamed: 0,text,excitement
16171,Huh. So it really comes down to looking at her senate career vs her AG career.,0
909,"If you kill a spider instead of putting it outside, you are a grade A bitchmade babyman.",0


In [4]:
df['excitement'].value_counts()

0    68100
1     1900
Name: excitement, dtype: int64

In [5]:
# Raw comment text is the only feature; the binary `excitement` column is the target.
X = df['text']
y = df['excitement']

# Bag-of-words counts feeding a class-weighted logistic regression.
pipe = make_pipeline(CountVectorizer(), LogisticRegression(class_weight='balanced', max_iter=1000))

In [6]:
%%time 

pipe.fit(X, y)

CPU times: user 8.32 s, sys: 22.7 s, total: 31 s
Wall time: 3.21 s


Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

## Trick 1: Model Uncertainty

In [7]:
pipe.predict_proba(X)

array([[0.81905624, 0.18094376],
       [0.87339587, 0.12660413],
       [0.99887526, 0.00112474],
       ...,
       [0.95765091, 0.04234909],
       [0.89402035, 0.10597965],
       [0.97989268, 0.02010732]])

In [50]:
# Probability the model assigns to the first class (column 0) for every row.
probas = pipe.predict_proba(X)[:, 0]

# Look back at the rows the model is unsure about: probability strictly
# between 0.45 and 0.55, i.e. close to the 0.5 decision boundary.
(df
 .loc[(probas > 0.45) & (probas < 0.55), ['text', 'excitement']]
 .head(7))

Unnamed: 0,text,excitement
8,that's adorable asf,0
46,"If there’s a pattern, yes.",0
107,My fans on patreon will be rewarded soon,0
154,"Ones with close ties to SA, anyway. An escaped apostate won't exactly be itching to run home.",0
158,I really like this ring so I’m glad to hear that.,0
262,OMG THOSE TINY SHOES! *desire to boop snoot intensifies*,0
362,This. I relate to this. So much. Almost too much.,0


<br><br><br><br><br><br><br><br><br><br><br><br><br><br>

## Trick 2: Model Disagreement

In [10]:
# Size of the subset where the recorded label and the model's prediction differ.
df.loc[df['excitement'].ne(pipe.predict(X))].shape

(5315, 37)

In [11]:
def correct_class_confidence(X, y, mod):
    """
    Return the probability `mod` assigns to each sample's given label.

    Parameters
    ----------
    X : samples, passed unchanged to `mod.predict_proba`.
    y : iterable of labels, one per sample, aligned with `X` by *position*.
        A pandas Series with any index, a list or an array all work.
    mod : fitted classifier exposing `predict_proba` and `classes_`.

    Returns
    -------
    list
        Element `i` is the predicted probability of the i-th label in `y`
        for the i-th sample.

    Raises
    ------
    ValueError
        If `y` does not contain exactly one label per predicted row.
    KeyError
        If a label in `y` is not one of `mod.classes_`.
    """
    probas = mod.predict_proba(X)
    # Materialise the labels by position. Indexing a pandas Series with `y[i]`
    # is a label-based lookup and fails (or picks the wrong row) whenever the
    # index is not exactly 0..n-1, e.g. after filtering or sampling.
    labels = list(y)
    if len(labels) != len(probas):
        raise ValueError(
            f"y has {len(labels)} labels but predict_proba returned {len(probas)} rows"
        )
    # Column of `probas` that belongs to each class, computed once instead of
    # rebuilding a dict for every row.
    column_of = {label: j for j, label in enumerate(mod.classes_)}
    return [row[column_of[label]] for row, label in zip(probas, labels)]

In [12]:
# Rows the model gets "wrong" that are labelled 0, ordered so the rows where
# the model gives the recorded label the least probability come first.
(df
 .assign(confidence=correct_class_confidence(X, y, pipe))
 .loc[lambda d: d['excitement'] != pipe.predict(d['text'])]
 .loc[:, ['text', 'excitement', 'confidence']]
 .sort_values(by="confidence")
 .query("excitement == 0")
 .head(20))

Unnamed: 0,text,excitement,confidence
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0,0.000148
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0,0.000187
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0,0.000262
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0,0.000594
20823,"Wow, your posting history is a real... interesting ride.",0,0.000719
69395,"Wow, your posting history is a real... interesting ride.",0,0.000719
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0,0.000741
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0,0.000813
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0,0.001128


## Trick 3: Cleanlab Noise Indices

In [20]:
from cleanlab.pruning import get_noise_indices

# Ask cleanlab which rows look like label errors, given the recorded labels
# and the pipeline's predicted probabilities.
# NOTE(review): these probabilities come from the same data `pipe` was fitted
# on. cleanlab's documentation describes `psx` as out-of-sample
# (cross-validated) probabilities — confirm the in-sample shortcut is intended.
ordered_label_errors = get_noise_indices(
    s=y,
    psx=pipe.predict_proba(X),
    sorted_index_method='prob_given_label',
 )

In [24]:
# First 20 rows flagged by cleanlab, in the order the indices were returned.
(df[['text', 'excitement']]
 .iloc[ordered_label_errors]
 .head(20))

Unnamed: 0,text,excitement
5676,I am inexplicably excited by [NAME]. I get so excited by how he curls passes,0
28707,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
42757,Omg this is so amazing ! Keep up the awesome work and have a fantastic New Year !,0
24756,Sounds like a fun game. Our home game around here is .05/.10. Its fun but not very exciting.,0
44459,So no replays for arsenal penalty calls.. Cool cool cool cool cool cool cool cool,0
20823,"Wow, your posting history is a real... interesting ride.",0
69395,"Wow, your posting history is a real... interesting ride.",0
2001,No different than people making a big deal about their team winning the super bowl. People find it interesting.,0
30921,"Hey congrats!! That's amazing, you've done such amazing progress! Hope you have a great day :)",0
39475,"I just read your list and now I can't wait, either!! Hurry up with the happy, relieved and peaceful onward and upward!! Congratulations😎",0


## Trick 4: Cleanlab Predictions

In [31]:
from cleanlab.classification import LearningWithNoisyLabels
from sklearn.linear_model import LogisticRegression

# Wrap a fresh, unfitted classifier. Per the original note, cleanlab is meant
# for classifiers whose `fit` accepts `sample_weight`.
fresh_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight='balanced', max_iter=1000)
)
# Fix the seed so cleanlab's internal cross-validation split, and therefore
# the rows it prunes, is reproducible across Restart & Run All.
lnl = LearningWithNoisyLabels(clf=fresh_pipe, seed=42)

# The noisy labels are passed as a plain array via `.values`.
lnl.fit(X=X, s=y.values)

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [39]:
# Baseline for comparison: the same pipeline trained directly on the labels
# as given. `fit` returns the fitted pipeline, so building and training chain.
new_pipe = make_pipeline(
    CountVectorizer(),
    LogisticRegression(class_weight='balanced', max_iter=1000),
).fit(X=X, y=y)

new_pipe

Pipeline(steps=[('countvectorizer', CountVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight='balanced', max_iter=1000))])

In [45]:
# Rows where the cleanlab-wrapped model and the plain pipeline predict
# different classes. The fixed seed keeps the displayed sample stable.
(df
 .loc[lnl.predict(X) != new_pipe.predict(X), ['text', 'excitement']]
 .sample(5, random_state=42))

Unnamed: 0,text,excitement
9366,"Those are the [NAME], they sing backup for [NAME]. They’re awesome!",0
57511,Ooh I thoughy today was last day. Damn it I spent 10 hrs playing HvV today😑.,0
39613,Oh my [NAME]. The hand mixer instead of forks. Shredding poultry is like my least favorite thing on Earth,0
41001,Same. It's my favorite show to watch in the morning. The whole crew is so likeable.,0
21352,Interested in Carlton stuff.,0
