In [1]:
import pandas as pd

# Process the Training File

In [2]:
# Rating cut-offs for the sentiment labels: ratings >= POSITIVE_THRESHOLD map
# to 1, ratings <= NEGATIVE_THRESHOLD map to -1, everything in between to 0.
NEGATIVE_THRESHOLD = 3
POSITIVE_THRESHOLD = 8

# Raw training file, read from ../datasets/drugsCom_raw/.
filename = 'drugsComTrain_raw.tsv'

In [3]:
# Load the tab-separated raw training file into a DataFrame.
df = pd.read_csv('../datasets/drugsCom_raw/' + filename, sep='\t')

In [4]:
# Remove the beginning and ending quotation marks by dropping the first and
# last character of every review. The vectorised .str slice gives the same
# result as slicing each string, but leaves a missing review as NaN instead of
# raising TypeError (a NaN float is not subscriptable).
df['review'] = df['review'].str[1:-1]

In [5]:
def makeSentimentScore(x):
    """Map a numeric rating to a sentiment label.

    Returns 1 when ``x >= POSITIVE_THRESHOLD``, -1 when
    ``x <= NEGATIVE_THRESHOLD``, and 0 otherwise. The positive check is
    evaluated first.
    """
    if x >= POSITIVE_THRESHOLD:
        return 1
    if x <= NEGATIVE_THRESHOLD:
        return -1
    return 0

In [6]:
# Replace each numeric rating with its sentiment label (1, 0 or -1).
df['rating'] = df['rating'].map(makeSentimentScore)

In [7]:
# Discard the columns that are not exported.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [8]:
# Use "sentence" / "label" as the names of the text and target columns.
df = df.rename(columns={"review": "sentence", "rating": "label"})

In [9]:
# Seeded 80/20 split: sample 80% of the rows, then keep every row whose index
# label was not sampled as the dev set.
train = df.sample(frac=0.8, random_state=1234)
dev = df.drop(index=train.index)

In [20]:
# Number of rows in the train sample labelled 0.
int(train['label'].eq(0).sum())

23114

In [21]:
# Number of rows in the train sample labelled 1.
int(train['label'].eq(1).sum())

77907

In [22]:
# Number of rows in the train sample labelled -1.
int(train['label'].eq(-1).sum())

28017

In [18]:
# We wish to preserve all records for train: the file is written from the full
# frame `df`, not from the 80% `train` sample.
# NOTE(review): every row saved to dev.tsv is therefore also in train.tsv, so
# dev scores are not a held-out estimate -- confirm this is intended.
df.to_csv(
    '../datasets/SENT/train.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [19]:
# Write the dev rows as a tab-separated file with no header row and no index.
dev.to_csv(
    '../datasets/SENT/dev.tsv',
    sep='\t',
    header=False,
    index=False,
)

# Process the Test File

In [23]:
# Raw test file, read from ../datasets/drugsCom_raw/.
filename = 'drugsComTest_raw.tsv'

In [24]:
# Load the tab-separated raw test file; this rebinds `df` to the test data.
df = pd.read_csv('../datasets/drugsCom_raw/' + filename, sep='\t')

In [25]:
# Remove the beginning and ending quotation marks by dropping the first and
# last character of every review. The vectorised .str slice gives the same
# result as slicing each string, but leaves a missing review as NaN instead of
# raising TypeError (a NaN float is not subscriptable).
df['review'] = df['review'].str[1:-1]

In [26]:
def makeSentimentScore(x):
    """Map a numeric rating to a sentiment label.

    Returns 1 when ``x >= POSITIVE_THRESHOLD``, -1 when
    ``x <= NEGATIVE_THRESHOLD``, and 0 otherwise. The positive check is
    evaluated first.

    NOTE(review): this repeats the definition from the training section of
    this notebook; the two copies must be kept in sync.
    """
    if x >= POSITIVE_THRESHOLD:
        return 1
    if x <= NEGATIVE_THRESHOLD:
        return -1
    return 0

In [27]:
# Replace each numeric rating with its sentiment label (1, 0 or -1).
df['rating'] = df['rating'].map(makeSentimentScore)

In [28]:
# Discard the columns that are not exported.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [31]:
# Number of test rows labelled 0.
int(df['rating'].eq(0).sum())

9579

In [32]:
# Number of test rows labelled 1.
int(df['rating'].eq(1).sum())

32349

In [33]:
# Number of test rows labelled -1.
int(df['rating'].eq(-1).sum())

11838

In [26]:
# Add the index as a column
# NOTE(review): left disabled. With this off, and index=False on the export,
# test.tsv carries no index column.
#df = df.reset_index()

In [27]:
# Column rename left disabled: test.tsv is written with header=False, so the
# column names are never written out.
# df = df.rename(columns={
#     "review":"sentence",
#     "rating":"label"
# })

In [34]:
# Write the test rows as a tab-separated file with no header row and no index.
df.to_csv(
    '../datasets/SENT/test.tsv',
    sep='\t',
    header=False,
    index=False,
)