In [9]:
import pandas as pd

# Process the Training File

In [10]:
# Rating cut-offs used when converting ratings to sentiment labels:
# ratings at or below NEGATIVE_THRESHOLD become -1, ratings at or above
# POSITIVE_THRESHOLD become 1, and anything in between becomes 0.
NEGATIVE_THRESHOLD = 3
POSITIVE_THRESHOLD = 8

# Raw file to read from ../datasets/drugsCom_raw/.
filename = 'drugsComTrain_raw.tsv'

In [11]:
# Load the raw tab-separated file named by `filename` into a DataFrame.
df = pd.read_csv(f'../datasets/drugsCom_raw/{filename}', sep='\t')

In [12]:
# Remove Beginning and Ending Quotation Marks
# Strip only when a review really is wrapped in a pair of double quotes:
# a blind x[1:-1] eats real characters from unquoted text (for example when
# this cell is run a second time) and raises TypeError on missing (NaN) reviews.
df['review'] = df['review'].apply(
    lambda x: x[1:-1]
    if isinstance(x, str) and len(x) >= 2 and x[0] == '"' and x[-1] == '"'
    else x
)

In [13]:
def makeSentimentScore(x):
    """Map a numeric rating to a three-way sentiment label.

    Returns 1 when ``x >= POSITIVE_THRESHOLD``, -1 when
    ``x <= NEGATIVE_THRESHOLD``, and 0 for every other value
    (including values such as NaN that fail both comparisons).
    """
    # The positive check runs first, exactly as in an if/elif chain.
    if x >= POSITIVE_THRESHOLD:
        return 1
    return -1 if x <= NEGATIVE_THRESHOLD else 0

In [14]:
# Replace each numeric rating with its sentiment label (1, 0 or -1).
# Run this once per load: the result overwrites 'rating', so a second run
# would re-score the labels themselves.
df['rating'] = df['rating'].map(makeSentimentScore)

In [15]:
# Discard every column that is not written to the output files.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [16]:
# Rename the text and label columns: review -> sentence, rating -> label.
df = df.rename({"review": "sentence", "rating": "label"}, axis="columns")

In [17]:
# Draw a reproducible 80% sample (fixed seed); the rows that were not
# sampled form the dev split.
train = df.sample(frac=0.8, random_state=1234)
dev = df.drop(index=train.index)

In [18]:
# Write the complete frame (not only the 80% `train` sample) so that no
# record is left out of the training file.
# NOTE(review): the rows held out in `dev` are therefore also present in
# train.tsv - confirm that this overlap is intended.
df.to_csv(
    '../datasets/SENT/train.tsv',
    header=False,
    index=False,
    sep='\t',
)

In [19]:
# Write the dev split as a tab-separated file without header or index.
dev.to_csv(
    '../datasets/SENT/dev.tsv',
    header=False,
    index=False,
    sep='\t',
)

# Process the Test File

In [20]:
# Point `filename` at the raw test file for the cells that follow.
filename = 'drugsComTest_raw.tsv'

In [21]:
# Load the raw tab-separated file named by `filename` into a DataFrame.
df = pd.read_csv(f'../datasets/drugsCom_raw/{filename}', sep='\t')

In [22]:
# Remove Beginning and Ending Quotation Marks
# Strip only when a review really is wrapped in a pair of double quotes:
# a blind x[1:-1] eats real characters from unquoted text (for example when
# this cell is run a second time) and raises TypeError on missing (NaN) reviews.
df['review'] = df['review'].apply(
    lambda x: x[1:-1]
    if isinstance(x, str) and len(x) >= 2 and x[0] == '"' and x[-1] == '"'
    else x
)

In [23]:
def makeSentimentScore(x):
    """Map a numeric rating to a three-way sentiment label.

    Returns 1 when ``x >= POSITIVE_THRESHOLD``, -1 when
    ``x <= NEGATIVE_THRESHOLD``, and 0 for every other value
    (including values such as NaN that fail both comparisons).
    """
    # NOTE(review): this duplicates the definition in the training section;
    # consider keeping a single copy.
    if x >= POSITIVE_THRESHOLD:
        return 1
    return -1 if x <= NEGATIVE_THRESHOLD else 0

In [24]:
# Replace each numeric rating with its sentiment label (1, 0 or -1).
# Run this once per load: the result overwrites 'rating', so a second run
# would re-score the labels themselves.
df['rating'] = df['rating'].map(makeSentimentScore)

In [25]:
# Discard every column that is not written to the output file.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [26]:
# Disabled: add the index as a column
# df = df.reset_index()

In [27]:
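# Disabled: renaming is unnecessary here because test.tsv is written with
# header=False, so the column names never reach the output file.
# df = df.rename(columns={
#     "review":"sentence",
#     "rating":"label"
# })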

In [28]:
# Write the processed test frame as a tab-separated file without header or index.
df.to_csv(
    '../datasets/SENT/test.tsv',
    header=False,
    index=False,
    sep='\t',
)