# Bag of Words

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics
import scipy
from scipy.sparse import hstack
from sklearn.ensemble import GradientBoostingClassifier
from collections import OrderedDict
import shap
from tqdm import tqdm

In [2]:
notes = pd.read_table('../data/notes-00000.tsv')
ratings = pd.read_table('../data/ratings-00000.tsv')
tweets = pd.read_csv('../data/tweet_text2021-11-04.csv')

## Splitting Data

In [3]:
english_tweet_ids = tweets.loc[tweets['lang'] == 'en', 'id']
print('Filtering', len(tweets) - len(english_tweet_ids), 'out of', len(tweets),
      'tweets written in languages other than English')

Filtering 761 out of 12874 tweets written in languages other than English


In [4]:
filtered_notes = notes[notes['tweetId'].isin(english_tweet_ids)].reset_index(drop=True)

In [5]:
groups = filtered_notes['tweetId']
group_kfold = GroupKFold(n_splits=10) #1/10 -> 10% for val and testing
group_kfold.get_n_splits(X=filtered_notes, groups=groups)
train_notes_list = []
for i, (_, test_index) in enumerate(group_kfold.split(X=filtered_notes, groups=groups)):
    if i == 0:
        test_notes = filtered_notes.iloc[test_index, :]
    elif i == 1:
        val_notes = filtered_notes.iloc[test_index, :]
    else:
        train_notes_list.append(filtered_notes.iloc[test_index, :])
train_notes = pd.concat(train_notes_list)

In [6]:
print('Training set size:', train_notes.shape[0])
print('Validation set set:', val_notes.shape[0])
print('Testing set size:', test_notes.shape[0])

Training set size: 12688
Validation set set: 1586
Testing set size: 1587


### Generating outputs

In [7]:
def get_helful_out(row):
    if pd.isna(row['helpfulnessLevel']):
        return row['helpful']
    else:
        if row['helpfulnessLevel'] == 'NOT_HELPFUL':
            return 0
        else:
            return 1

ratings['helpful_out'] = (ratings
                          .apply(get_helful_out, axis=1))

In [8]:
ratings['helpful_out'].value_counts()

1    79311
0    42128
Name: helpful_out, dtype: int64

In [9]:
ratings

Unnamed: 0,noteId,participantId,createdAtMillis,version,agree,disagree,helpful,notHelpful,helpfulnessLevel,helpfulOther,...,notHelpfulSourcesMissingOrUnreliable,notHelpfulOpinionSpeculationOrBias,notHelpfulMissingKeyPoints,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrInflammatory,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse,notHelpfulIrrelevantSources,helpful_out
0,1352796878438424576,7644DF3FD853416F0C96933CCC1BA9B7,1611796572477,1,0,1,0,1,,0,...,0,0,0,0,0,0,0,1,0,0
1,1352796878438424576,7585B8804A32416E91E51837F351F249,1611388222120,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,1
2,1352796878438424576,EFD7E04E740224D2DDB42A2C910B62C1,1611852744990,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,1
3,1352796878438424576,0D7ED07D5421118311EEED5E4ECF2968,1611860252442,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,0,1
4,1352796878438424576,628C786C63B5A4D32E13C6C442E1863D,1611623203338,1,0,0,0,1,,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121434,1453148814378881026,6B6272A356D3E171B6011882DD599017,1635293115823,2,0,0,0,0,SOMEWHAT_HELPFUL,0,...,1,0,0,0,0,0,0,0,0,1
121435,1453148814378881026,ECC278CC105C774E3721FA2813319644,1635294425373,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,0,1
121436,1453148814378881026,A40A90FA8878D9EF7D5F600B655BFC1E,1635292872727,2,0,0,0,0,NOT_HELPFUL,0,...,0,0,0,0,0,0,0,0,0,0
121437,1453148814378881026,4D0B9ED0987B8F06D3A7AC25CDD03DB1,1635293671093,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
# for downweighting overreviewed notes
ratings['scaling'] = (ratings.groupby(['noteId'])
                      ['participantId']
                      .transform(lambda x: 1/x.nunique()))

### Featurization

In [11]:
datasets = OrderedDict([('train', {'notes': train_notes}),
                        ('val', {'notes': val_notes}),
                        ('test', {'notes': test_notes})])

In [12]:
feature_names = []
for name, data_dict in datasets.items():
    notes_df = data_dict['notes']
    notes_df = notes_df[~notes_df['summary'].isna()].reset_index(drop=True)

    if name == 'train':
        mono_vectorizer_summary = CountVectorizer(max_features=4000)
        mono_vectorizer_summary.fit(notes_df['summary'])
        feature_names.extend(mono_vectorizer_summary.get_feature_names_out())
    mono_X_summary = mono_vectorizer_summary.transform(notes_df['summary'])

    if name == 'train':
        bi_vectorizer_summary = CountVectorizer(max_features=4000, ngram_range=(2, 2))
        bi_vectorizer_summary.fit(notes_df['summary'])
        feature_names.extend(bi_vectorizer_summary.get_feature_names_out())
    bi_X_summary = bi_vectorizer_summary.transform(notes_df['summary'])

    sparse_X = hstack([mono_X_summary, bi_X_summary])
    X_df = pd.DataFrame(sparse_X.todense())
    X_df['noteId'] = notes_df['noteId']
    merged_X_df = X_df.merge(ratings[['noteId', 'helpful_out', 'scaling']],
                             how='inner', on='noteId')
    X = scipy.sparse.csr_matrix(merged_X_df.drop(['noteId', 'helpful_out', 'scaling'], axis=1))
    datasets[name]['X'] = X
    datasets[name]['Y'] = merged_X_df['helpful_out']
    datasets[name]['scaling'] = merged_X_df['scaling']
    datasets[name]['noteId'] = merged_X_df['noteId']
    print(name, 'size', merged_X_df.shape[0])

train size 87153
val size 12582
test size 10598


In [13]:
model = GradientBoostingClassifier(random_state=7, verbose=1)
model.fit(datasets['train']['X'], datasets['train']['Y'],
          sample_weight=datasets['train']['scaling'])

      Iter       Train Loss   Remaining Time 
         1           1.2904           17.33s
         2           1.2804           15.39s
         3           1.2722           14.87s
         4           1.2655           14.47s
         5           1.2600           14.14s
         6           1.2554           13.96s
         7           1.2512           13.79s
         8           1.2480           13.57s
         9           1.2451           13.35s
        10           1.2425           13.16s
        20           1.2283           11.48s
        30           1.2199            9.90s
        40           1.2139            8.36s
        50           1.2085            6.91s
        60           1.2042            5.49s
        70           1.2000            4.10s
        80           1.1962            2.72s
        90           1.1924            1.35s
       100           1.1888            0.00s


GradientBoostingClassifier(random_state=7, verbose=1)

In [14]:
eval_dict = {'accuracy': metrics.accuracy_score,
             'f1': metrics.f1_score,
             'precision': metrics.precision_score,
             'recall': metrics.recall_score,
             'roc-auc': metrics.roc_auc_score}
for name, data_dict in datasets.items():
    predictions = model.predict(data_dict['X'])
    metrics_dict = {}
    for n, m in eval_dict.items():
        metrics_dict[n] = m(data_dict['Y'], predictions,
                            sample_weight=data_dict['scaling'])
    metrics_dict['name'] = name
    data_dict['Y_hat'] = predictions
    data_dict['performance'] = metrics_dict

In [15]:
performance_metrics = []
for name, data_dict in datasets.items():
    performance_metrics.append(data_dict['performance'])
performance_metrics_df = pd.DataFrame(performance_metrics)
performance_metrics_df

Unnamed: 0,accuracy,f1,precision,recall,roc-auc,name
0,0.687728,0.782799,0.708484,0.874531,0.61257,train
1,0.664332,0.760101,0.696864,0.83596,0.600131,val
2,0.654494,0.751024,0.692025,0.821019,0.59309,test


### Feature Importance

In [16]:
explainer = shap.TreeExplainer(model)
shap_values = explainer(datasets['train']['X'].todense())

In [17]:
shap_values_df = pd.DataFrame(shap_values.values, columns=feature_names)
feature_df = pd.DataFrame(shap_values.data, columns=feature_names)

In [18]:
feat_summary = (shap_values_df.abs().mean(axis=0)
                .reset_index(name='avg_abs_shap')
                .rename({'index': 'feat'}, axis=1)
                .sort_values('avg_abs_shap', ascending=False))
top_features = feat_summary.head(20).reset_index(drop=True)

In [19]:
for i, row in tqdm(top_features.iterrows(), total=len(top_features)):
    if (shap_values_df[row['feat']].nunique() > 1):
        top_features.at[i, 'r'] = np.corrcoef(feature_df[row['feat']],
                                              shap_values_df[row['feat']])[0, 1]
        top_features.at[i, 'n_nonzero'] = (feature_df[row['feat']] > 0).sum()
    else:
        top_features.at[i, 'r'] = 0

100%|██████████| 20/20 [00:25<00:00,  1.26s/it]


In [20]:
top_features


Unnamed: 0,feat,avg_abs_shap,r,n_nonzero
0,https,0.395061,0.729915,56181.0
1,this,0.058848,0.934329,27986.0
2,the,0.049621,0.69532,60968.0
3,according,0.016566,0.970726,1814.0
4,www,0.014184,0.800714,38489.0
5,article,0.013888,0.891142,7559.0
6,quot,0.011161,0.868692,10152.0
7,covid,0.010813,0.810689,12017.0
8,in,0.009827,0.646651,32519.0
9,from,0.008615,0.830583,10849.0
