# Create Train/Dev Datasets for BERT Models

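This notebook reads three training-set files of labeled tweets from the SMM4H 2019 Shared Task, combines them into a single dataset, and splits it 80/20 into a train set and a dev set. Each split is written out as a header-less TSV file with the columns `id`, `label`, and `text`.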

## Load Data

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory prefix prepended to the input file names when they are read below.
prefix = 'data/'

In [2]:
# Load all three training files and combine them into one DataFrame.
# Column names are supplied explicitly via `names=`, so pandas treats the files
# as having no header row.
tweet_columns = ['tweet_id', 'user_id', 'label', 'tweet']
train1_df, train2_df, train3_df = (
    pd.read_csv(prefix + filename, sep="\t", names=tweet_columns)
    for filename in ('training_set_1.tsv', 'training_set_2.tsv', 'training_set_3.tsv')
)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it each
# file keeps its own row numbers, so the same index label appears once per file.
df = pd.concat([train1_df, train2_df, train3_df], axis=0, sort=False, ignore_index=True)
df.head()

Unnamed: 0,tweet_id,user_id,label,tweet
0,343909778008973312,464336224,0,i don't fucking need humira
1,352823276889837570,590337731,0,"my retake is next friday, if i bloody fail aga..."
2,339867818843594756,246979971,0,"@doctorchristian scared to start fluoxetine, w..."
3,349294537367236611,149749939,0,"@intuitivegal1 ok, if you stopped taking the l..."
4,354256195432882177,54516759,0,novartis announces secukinumab (ain457) demons...


## Split into Train/Dev (80/20 split)

In [3]:
# Hold out 20% of the combined tweets as the dev set; the remaining 80% is the train set.
# random_state pins the shuffle so that re-running the notebook on the same input
# files reproduces exactly the same split instead of a new random one each time.
# NOTE(review): if the label classes are imbalanced, consider adding
# stratify=df['label'] so both splits keep the same class ratio — confirm before enabling.
train_df, dev_df = train_test_split(df, test_size=0.2, random_state=42)

In [4]:
# Show the first five rows of the dev split as a quick sanity check.
dev_df.head(n=5)

Unnamed: 0,tweet_id,user_id,label,tweet
3985,339781959465193472,253181726,0,there's not enough prozac in the world to help...
7713,540744587925811200,551827811,0,a bizarre albuterol aroma and hedonistic marig...
2879,332542581760204800,75858565,1,@gastromom the only pt of mine who ever died w...
159,356795624592646144,15285788,0,glenmark confirms patent challenge for its gen...
133,526289586171817985,2502942848,0,impede marc anthony labor day advair diskus lo...


## Convert to format expected by preprocessor

In [5]:
# Rebuild the train split in the id / label / text layout:
#   id    - running integer 0..len-1 in the split's current row order
#   label - copied unchanged
#   text  - the tweet with every newline replaced by a single space
# Gotcha: this rebinds train_df to a frame without a 'tweet' column, so the cell
# cannot be re-run by itself; re-run the split cell first.
train_labels = train_df['label']
train_text = train_df['tweet'].replace(to_replace=r'\n', value=' ', regex=True)
train_df = pd.DataFrame({'id': range(len(train_labels)), 'label': train_labels, 'text': train_text})

train_df.head()

Unnamed: 0,id,label,text
1171,0,0,@nardsbbq hey bud! when is boniva again?
1928,1,0,wrecking everyone in trivia crack right now ca...
678,2,0,i need a few trazodone and a good nights sleep...
4371,3,0,@revelwoman so what do you take for the pn?? ...
2159,4,0,"@cthomse i take extavia think it is helping, m..."


In [6]:
# Rebuild the dev split in the id / label / text layout:
#   id    - running integer 0..len-1 in the split's current row order
#   label - copied unchanged
#   text  - the tweet with every newline replaced by a single space
# Gotcha: this rebinds dev_df to a frame without a 'tweet' column, so the cell
# cannot be re-run by itself; re-run the split cell first.
dev_labels = dev_df['label']
dev_text = dev_df['tweet'].replace(to_replace=r'\n', value=' ', regex=True)
dev_df = pd.DataFrame({'id': range(len(dev_labels)), 'label': dev_labels, 'text': dev_text})

dev_df.head()

Unnamed: 0,id,label,text
3985,0,0,there's not enough prozac in the world to help...
7713,1,0,a bizarre albuterol aroma and hedonistic marig...
2879,2,1,@gastromom the only pt of mine who ever died w...
159,3,0,glenmark confirms patent challenge for its gen...
133,4,0,impede marc anthony labor day advair diskus lo...


In [14]:
# Write both splits as tab-separated files with no header row and no index column.
# All columns are written in their existing order (id, label, text), so no explicit
# `columns=` argument is needed.
# Paths are built from `prefix` (defined in the first code cell) so the outputs go to
# the same directory the inputs were read from, instead of a second hard-coded 'data/'.
train_df.to_csv(prefix + 'train.tsv', sep='\t', index=False, header=False)
dev_df.to_csv(prefix + 'dev.tsv', sep='\t', index=False, header=False)