# Split Notes Between Training, Validation, and Testing

In [1]:
import pandas as pd
from sklearn.model_selection import GroupKFold

In [2]:
# Load the notes table (tab-separated) and the tweet text table (comma-separated).
# read_csv with sep='\t' parses the file the same way read_table does.
notes = pd.read_csv('../data/notes-00000.tsv', sep='\t')
tweets = pd.read_csv('../data/tweet_text2021-11-04.csv')

In [3]:
# Keep the ids of tweets whose 'lang' column is exactly 'en'.
english_tweet_ids = tweets.loc[tweets['lang'].eq('en'), 'id']

# Report how many tweets the language filter excludes.
n_tweets = len(tweets)
n_non_english = n_tweets - len(english_tweet_ids)
print('Filtering', n_non_english, 'out of', n_tweets,
      'tweets written in languages other than English')

Filtering 761 out of 12874 tweets written in languages other than English


In [4]:
# Keep only the notes whose tweetId belongs to an English tweet, then
# renumber the rows from 0 so positional and label indexing agree.
on_english_tweet = notes['tweetId'].isin(english_tweet_ids)
filtered_notes = notes.loc[on_english_tweet].reset_index(drop=True)

In [5]:
# Split the notes into train/validation/test, grouping by tweetId so that all
# notes written about the same tweet end up in the same split.
groups = filtered_notes['tweetId']
group_kfold = GroupKFold(n_splits=10)  # 10 folds -> each held-out fold is roughly 1/10 of the notes

# Only the held-out ("test") indices of each fold are used; the held-out parts
# of the ten folds are disjoint and together cover every row:
#   fold 0      -> testing set
#   fold 1      -> validation set
#   folds 2..9  -> concatenated (in fold order) into the training set
train_notes_list = []
for i, (_, test_index) in enumerate(group_kfold.split(X=filtered_notes, groups=groups)):
    held_out = filtered_notes.iloc[test_index, :]
    if i == 0:
        test_notes = held_out
    elif i == 1:
        val_notes = held_out
    else:
        train_notes_list.append(held_out)
train_notes = pd.concat(train_notes_list)

In [6]:
# Report the number of notes (rows) in each split.
print('Training set size:', train_notes.shape[0])
print('Validation set size:', val_notes.shape[0])  # label previously read "set set"
print('Testing set size:', test_notes.shape[0])

Training set size: 12688
Validation set set: 1586
Testing set size: 1587


In [11]:
# Leakage check: for each pair of splits, count the rows of the first split
# whose tweetId also occurs in the second. Because the split is grouped by
# tweetId, each count is expected to be 0.
split_pairs = (
    (train_notes, val_notes),
    (train_notes, test_notes),
    (val_notes, test_notes),
)
for first, second in split_pairs:
    print(first['tweetId'].isin(second['tweetId']).sum())

0
0
0


In [9]:
# Write each split to its own CSV file; the DataFrame index is not saved.
split_outputs = (
    (train_notes, '../data/train_split_v1.csv'),
    (val_notes, '../data/val_split_v1.csv'),
    (test_notes, '../data/test_split_v1.csv'),
)
for split_frame, out_path in split_outputs:
    split_frame.to_csv(out_path, index=False)