In [1]:
import os
import re

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Process the Training File

In [2]:
# Name of the raw CSV file (looked up under './twitter data/') that this notebook splits.
filename = 'binary_tweets_valid.csv'

In [3]:
# Load the raw tweets. Only '\n' is treated as a record terminator here, so a
# '\r' that appears inside the data is kept as part of the field value.
df = pd.read_csv(f'./twitter data/{filename}', lineterminator='\n')

In [4]:
# Discard the identifier / bookkeeping columns that are not needed downstream.
df = df.drop(columns=['index', 'tweet_id', 'user_id', 'url'])

In [5]:
# Switch to the column names used for the exported files:
# the tweet text becomes "sentence" and the "id" column becomes "label".
# NOTE(review): "id" appears to hold the 0/1 class (see the df.head() output) - confirm.
df = df.rename(columns={"tweet": "sentence", "id": "label"})

In [6]:
# Keep exactly two columns, text first and label second; this is also the
# column order of the TSV files written at the end of the notebook.
df = df[['sentence', 'label']]

In [7]:
# Preview the first five rows to sanity-check the renamed columns.
df.head(n=5)

Unnamed: 0,sentence,label
0,"@DoctorChristian scared to start fluoxetine, w...",0
1,"@IntuitiveGal1 ok, if you stopped taking the L...",0
2,Novartis announces secukinumab (AIN457) demons...,0
3,"""U wailed all night; now y'r disembodied sobbi...",1
4,@irapaps you're so fucking selfish. I've got L...,0


In [8]:
# Flatten each tweet onto a single line so it can be stored as one TSV field.
def preprocess(s):
    """Replace every line-break or tab character in ``s`` with a single space.

    The tweets are later exported as tab-separated, one-record-per-line files,
    so a raw newline, carriage return or tab inside the text would split or
    shift a record for simple line-based readers. Each offending character is
    replaced one-for-one (no collapsing), so text that only contains '\\n'
    is transformed exactly as before.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The text with each '\\n', '\\r' and '\\t' replaced by ' '.
    """
    # '\r' is included because the CSV is read with lineterminator='\n',
    # which can leave carriage returns inside field values.
    return re.sub(r'[\r\n\t]', ' ', s)
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
df['sentence'] = df['sentence'].apply(preprocess)

In [9]:
# Stratified 60/20/20 split with a fixed seed:
#   1. hold out 20% of all rows as the test set;
#   2. take 25% of the remaining 80% (= 20% of the total) as the dev set.
# Stratifying on "label" keeps the class ratio the same in every split.
train_dev_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
    stratify=df["label"],
)
train_data, dev_data = train_test_split(
    train_dev_data,
    test_size=0.25,
    random_state=0,
    stratify=train_dev_data["label"],
)

In [10]:
# Give every split a fresh 0..n-1 index; the old row labels from the
# unsplit frame are discarded (drop=True) rather than kept as a column.
train_data, dev_data, test_data = (
    part.reset_index(drop=True) for part in (train_data, dev_data, test_data)
)

In [11]:
# Row counts of the (train, dev, test) splits.
len(train_data), len(dev_data), len(test_data)

(2501, 834, 834)

In [12]:
def class_proportion(data):
    """Print the class balance of a data split.

    Three lines are printed: the number of rows labelled 0 (negative), the
    number of rows labelled 1 (positive), and the positive share of all rows.

    A class that does not occur in ``data`` is reported as 0 instead of
    raising ``KeyError``, and an empty frame reports a proportion of 0.0
    instead of failing.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``label`` column containing the values 0 and/or 1.
    """
    labels = data["label"]
    value_counts = labels.value_counts()

    # value_counts() only lists classes that actually occur, so use .get()
    # with a default of 0 rather than [] indexing.
    negatives = int(value_counts.get(0, 0))
    positives = int(value_counts.get(1, 0))

    # The denominator is the total row count, the same as data["label"].shape[0].
    total = len(labels)
    proportion = positives / total if total else 0.0

    print("number of negative class examples:", negatives)
    print("number of positive class examples:", positives)
    print("proportion of positive class examples:", proportion)

In [13]:
# Class balance of the training split (before any rebalancing).
class_proportion(data=train_data)

number of negative class examples: 2224
number of positive class examples: 277
proportion of positive class examples: 0.11075569772091164


In [14]:
# Class balance of the dev split.
class_proportion(data=dev_data)

number of negative class examples: 742
number of positive class examples: 92
proportion of positive class examples: 0.11031175059952038


In [15]:
# Class balance of the test split.
class_proportion(data=test_data)

number of negative class examples: 742
number of positive class examples: 92
proportion of positive class examples: 0.11031175059952038


In [16]:
# Oversample the minority class in the training split only: build 7 extra
# copies of every positive (label == 1) row and append them to the original
# rows, so each positive example ends up appearing 8 times in total.
# The factor 7 is hard-coded; the printed class counts show it brings the two
# classes to roughly equal size for this dataset.
train_data_duplicates = pd.concat(
    [train_data.loc[train_data["label"].eq(1)]] * 7,
    ignore_index=True,
)
train_data_rebalanced = pd.concat(
    [train_data, train_data_duplicates],
    ignore_index=True,
)

In [17]:
# Class balance of the training split after oversampling the positives.
class_proportion(data=train_data_rebalanced)

number of negative class examples: 2224
number of positive class examples: 2216
proportion of positive class examples: 0.4990990990990991


In [19]:
# Write the three splits as tab-separated files without header or index column.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first; otherwise a run on a fresh checkout fails here.
output_dir = './twitter data/twitter_binary_balanced'
os.makedirs(output_dir, exist_ok=True)

# Only the oversampled training set is shuffled (fixed seed): the duplicated
# positive rows were appended at the end of the frame and would otherwise all
# sit at the bottom of train.tsv. The dev and test splits are written as-is.
shuffle(train_data_rebalanced, random_state=0).to_csv(output_dir + '/train.tsv', sep='\t', index=False, header=False)
dev_data.to_csv(output_dir + '/dev.tsv', sep='\t', index=False, header=False)
test_data.to_csv(output_dir + '/test.tsv', sep='\t', index=False, header=False)