# Split Notes Between Training, Validation, and Testing

In [67]:
import pandas as pd
from sklearn.model_selection import GroupKFold

In [68]:
# Load the raw inputs. Notes and ratings are tab-separated files; the tweet
# dump is a regular comma-separated file.
tweets = pd.read_csv('../data/tweet_text2021-11-04.csv')
notes = pd.read_csv('../data/notes-00000.tsv', sep='\t')
ratings = pd.read_csv('../data/ratings-00000.tsv', sep='\t')


In [69]:
# Collect the ids of tweets whose `lang` field is 'en' and report how many
# tweets are excluded by that filter.
is_english = tweets['lang'] == 'en'
english_tweet_ids = tweets.loc[is_english, 'id']
n_tweets = len(tweets)
print('Filtering', n_tweets - len(english_tweet_ids), 'out of', n_tweets,
      'tweets written in languages other than English')

Filtering 761 out of 12874 tweets written in languages other than English


In [70]:
# Keep only the notes attached to an English tweet and renumber rows from 0
# so positional indices line up with the frame's index.
on_english_tweet = notes['tweetId'].isin(english_tweet_ids)
filtered_notes = notes.loc[on_english_tweet].reset_index(drop=True)

In [71]:
# Split the notes into test / validation / train, grouping by tweetId so that
# every note written about the same tweet lands in the same split.
groups = filtered_notes['tweetId']
group_kfold = GroupKFold(n_splits=10)  # 10 folds -> each holds roughly 1/10 of the notes

# Only the held-out part of each of the 10 splits is used: those held-out
# parts are mutually disjoint and together cover every row, so they serve as
# a 10-way partition of the notes.
folds = [
    filtered_notes.iloc[fold_index, :]
    for _, fold_index in group_kfold.split(X=filtered_notes, groups=groups)
]

# Fold 0 -> test, fold 1 -> validation, folds 2..9 -> training.
test_notes, val_notes = folds[0], folds[1]
train_notes_list = folds[2:]
train_notes = pd.concat(train_notes_list)

In [72]:
# Show the columns available in the tweet and rating frames before choosing
# which ones to carry into the splits.
(tweets.columns, ratings.columns)

(Index(['lang', 'created_at', 'id', 'text', 'author_id', 'possibly_sensitive',
        'public_metrics.retweet_count', 'public_metrics.reply_count',
        'public_metrics.like_count', 'public_metrics.quote_count',
        'withheld.copyright', 'withheld.country_codes', 'withheld.scope'],
       dtype='object'),
 Index(['noteId', 'participantId', 'createdAtMillis', 'version', 'agree',
        'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther',
        'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic',
        'helpfulGoodSources', 'helpfulUniqueContext', 'helpfulAddressesClaim',
        'helpfulImportantContext', 'notHelpfulOther', 'notHelpfulIncorrect',
        'notHelpfulSourcesMissingOrUnreliable',
        'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
        'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
        'notHelpfulArgumentativeOrInflammatory', 'notHelpfulOffTopic',
        'notHelpfulSpamHarassmentOrAbuse', 'notHelpf

In [73]:
# Columns carried over from each source frame into the final splits.

# Note-level fields: identifiers, the author's classification answers, the
# "misleading" / "not misleading" tag columns, and the note text (`summary`).
notes_columns = (
    ['noteId', 'tweetId']
    + ['classification', 'believable', 'harmful', 'validationDifficulty']
    + [
        'misleadingOther',
        'misleadingFactualError',
        'misleadingManipulatedMedia',
        'misleadingOutdatedInformation',
        'misleadingMissingImportantContext',
        'misleadingUnverifiedClaimAsFact',
        'misleadingSatire',
    ]
    + [
        'notMisleadingOther',
        'notMisleadingFactuallyCorrect',
        'notMisleadingOutdatedButNotWhenWritten',
        'notMisleadingClearlySatire',
        'notMisleadingPersonalOpinion',
    ]
    + ['trustworthySources', 'summary']
)

# Tweet-level fields; `id` is the join key matched against the notes' tweetId.
tweets_columns = ['id', 'text', 'author_id', 'possibly_sensitive']

# Rating-level fields: the join key, the overall judgements, and the
# "helpful" / "not helpful" tag columns.
ratings_columns = (
    ['noteId', 'agree', 'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel']
    + [
        'helpfulOther',
        'helpfulInformative',
        'helpfulClear',
        'helpfulEmpathetic',
        'helpfulGoodSources',
        'helpfulUniqueContext',
        'helpfulAddressesClaim',
        'helpfulImportantContext',
    ]
    + [
        'notHelpfulOther',
        'notHelpfulIncorrect',
        'notHelpfulSourcesMissingOrUnreliable',
        'notHelpfulOpinionSpeculationOrBias',
        'notHelpfulMissingKeyPoints',
        'notHelpfulOutdated',
        'notHelpfulHardToUnderstand',
        'notHelpfulArgumentativeOrInflammatory',
        'notHelpfulOffTopic',
        'notHelpfulSpamHarassmentOrAbuse',
        'notHelpfulIrrelevantSources',
    ]
)

In [74]:
# Attach the selected tweet fields to every note in each split.
# The tweet lookup (indexed by tweet id) is built once and shared by all
# three joins instead of being rebuilt per split.
tweet_lookup = tweets[tweets_columns].set_index('id')


def _with_tweet_fields(split_notes):
    """Return the selected note columns of `split_notes` with tweet fields attached.

    Left join on the note's `tweetId` against the tweet `id`, so every note
    row is kept even when no matching tweet row exists (its tweet fields are
    then NaN).
    """
    return split_notes[notes_columns].join(
        tweet_lookup, on='tweetId', how='left', lsuffix='_x', rsuffix='_y')


train = _with_tweet_fields(train_notes)
test = _with_tweet_fields(test_notes)
val = _with_tweet_fields(val_notes)

In [75]:
# Report the number of rows in each split after attaching the tweet fields.
print('Training set size:', train.shape[0])
print('Validation set size:', val.shape[0])  # label previously read "set set"
print('Testing set size:', test.shape[0])

Training set size: 12688
Validation set set: 1586
Testing set size: 1587


In [76]:
# Attach ratings to each split. Rating rows with a missing value in any of
# the selected columns are dropped first. The cleaned lookup is built once
# rather than once per split.
ratings_lookup = ratings[ratings_columns].dropna().set_index('noteId')


def _with_ratings(split):
    """Left-join the cleaned ratings onto `split` via `noteId`.

    A note may match several rating rows, in which case it is repeated once
    per rating; a note with no matching rating keeps a single row whose
    rating columns are NaN.
    """
    return split.join(
        ratings_lookup, on='noteId', how='left', lsuffix='_x', rsuffix='_y')


# NOTE(review): train/test/val are rebound here, so this cell is not
# idempotent -- re-running it without first re-running the tweet join would
# join the ratings a second time (producing `_x` / `_y` suffixed columns).
train = _with_ratings(train)
test = _with_ratings(test)
val = _with_ratings(val)

In [77]:
# Report the number of rows in each split after attaching the ratings.
print('Training set size:', train.shape[0])
print('Validation set size:', val.shape[0])  # label previously read "set set"
print('Testing set size:', test.shape[0])

Training set size: 69205
Validation set set: 10382
Testing set size: 7555


In [78]:
# Leakage check: for each pair of splits, count the notes in the first whose
# tweetId also occurs in the second. Order: train/val, train/test, val/test.
split_pairs = [
    (train_notes, val_notes),
    (train_notes, test_notes),
    (val_notes, test_notes),
]
for first_split, second_split in split_pairs:
    shared = first_split['tweetId'].isin(second_split['tweetId'])
    print(shared.sum())

0
0
0


In [79]:
# Print the final column index of each split (train, test, val in that order).
for split_frame in (train, test, val):
    print(split_frame.columns)

Index(['noteId', 'tweetId', 'classification', 'believable', 'harmful',
       'validationDifficulty', 'misleadingOther', 'misleadingFactualError',
       'misleadingManipulatedMedia', 'misleadingOutdatedInformation',
       'misleadingMissingImportantContext', 'misleadingUnverifiedClaimAsFact',
       'misleadingSatire', 'notMisleadingOther',
       'notMisleadingFactuallyCorrect',
       'notMisleadingOutdatedButNotWhenWritten', 'notMisleadingClearlySatire',
       'notMisleadingPersonalOpinion', 'trustworthySources', 'summary', 'text',
       'author_id', 'possibly_sensitive', 'agree', 'disagree', 'helpful',
       'notHelpful', 'helpfulnessLevel', 'helpfulOther', 'helpfulInformative',
       'helpfulClear', 'helpfulEmpathetic', 'helpfulGoodSources',
       'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'notHelpfulOther', 'notHelpfulIncorrect',
       'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMi

In [81]:
# Write each split to its own CSV file, omitting the DataFrame index.
split_outputs = (
    (train, '../data/train_split_v2.csv'),
    (val, '../data/val_split_v2.csv'),
    (test, '../data/test_split_v2.csv'),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)