In [27]:
import pandas as pd

# Process the Training File

In [28]:
# Ratings at or above POSITIVE_THRESHOLD are labelled positive and ratings at or
# below NEGATIVE_THRESHOLD are labelled negative (see makeSentimentScore).
NEGATIVE_THRESHOLD = 3
POSITIVE_THRESHOLD = 8

# Raw training split; it is read from the drugsCom_raw dataset folder below.
filename = "drugsComTrain_raw.tsv"

In [29]:
# Load the raw tab-separated file selected by `filename`.
df = pd.read_csv(
    '../datasets/drugsCom_raw/' + filename,
    sep='\t',
)

In [30]:
# Remove the quotation marks wrapping each review.
# Only string values that both start and end with '"' are trimmed, so:
#   * re-running this cell no longer chops real characters off reviews that
#     have already been unwrapped, and
#   * missing (non-string) reviews pass through unchanged instead of raising
#     a TypeError on the slice.
df['review'] = df['review'].apply(
    lambda text: text[1:-1]
    if isinstance(text, str)
    and len(text) >= 2
    and text.startswith('"')
    and text.endswith('"')
    else text
)

In [31]:
def makeSentimentScore(x):
    """Map a numeric rating to a three-way sentiment label.

    Returns 1 when ``x`` is at least ``POSITIVE_THRESHOLD``, -1 when ``x`` is
    at most ``NEGATIVE_THRESHOLD``, and 0 otherwise. The positive check is
    evaluated first.
    """
    if x >= POSITIVE_THRESHOLD:
        return 1
    if x <= NEGATIVE_THRESHOLD:
        return -1
    return 0

In [32]:
# Collapse each numeric rating into a sentiment label (1 / 0 / -1).
# Gotcha: 'rating' is overwritten in place, so running this cell a second time
# re-thresholds the labels themselves rather than the original ratings.
df['rating'] = df['rating'].map(makeSentimentScore)

In [33]:
# Discard the columns that are not needed for the exported files.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [34]:
# Rename the text and label columns to "sentence" / "label".
column_names = {"review": "sentence", "rating": "label"}
df = df.rename(columns=column_names)

In [35]:
# Reproducible 80/20 split: sample 80% of the rows with a fixed seed, then keep
# every row whose index label was not sampled as the dev set.
train = df.sample(frac=0.8, random_state=1234)
dev = df.drop(index=train.index)

In [36]:
# Number of rows labelled 0 in the sampled training subset.
int(train['label'].eq(0).sum())

23114

In [37]:
# Number of rows labelled 1 in the sampled training subset.
int(train['label'].eq(1).sum())

77907

In [38]:
# Number of rows labelled -1 in the sampled training subset.
int(train['label'].eq(-1).sum())

28017

In [39]:
# Display the frame after the column rename as a visual sanity check
# (pandas elides the middle rows of a long frame when rendering it).
df

Unnamed: 0,sentence,label
0,"It has no side effect, I take it in combinatio...",1
1,My son is halfway through his fourth week of I...,1
2,"I used to take another oral contraceptive, whi...",0
3,This is my first time using any form of birth ...,1
4,Suboxone has completely turned my life around....,1
...,...,...
161292,I wrote my first report in Mid-October of 2014...,1
161293,I was given this in IV before surgey. I immedi...,-1
161294,"Limited improvement after 4 months, developed ...",-1
161295,"I&#039;ve been on thyroid medication 49 years,...",1


In [13]:
# Write the complete labelled frame (not only the sampled `train` subset) as the
# training file, tab-separated and without header or index.
# NOTE(review): `dev` is carved out of this same frame, so every dev row is also
# present in train.tsv; confirm that this overlap is intended.
df.to_csv('../datasets/SENT/train.tsv', index=False, header=False, sep='\t')

In [14]:
# Write the held-out dev rows, tab-separated and without header or index.
dev.to_csv('../datasets/SENT/dev.tsv', index=False, header=False, sep='\t')

# Process the Test File

In [15]:
# Point `filename` at the raw test split; the next cell reads it to load the file.
filename = "drugsComTest_raw.tsv"

In [16]:
# Load the raw tab-separated file selected by `filename` (this replaces `df`).
df = pd.read_csv(
    '../datasets/drugsCom_raw/' + filename,
    sep='\t',
)

In [17]:
# Remove the quotation marks wrapping each review.
# Only string values that both start and end with '"' are trimmed, so:
#   * re-running this cell no longer chops real characters off reviews that
#     have already been unwrapped, and
#   * missing (non-string) reviews pass through unchanged instead of raising
#     a TypeError on the slice.
df['review'] = df['review'].apply(
    lambda text: text[1:-1]
    if isinstance(text, str)
    and len(text) >= 2
    and text.startswith('"')
    and text.endswith('"')
    else text
)

In [18]:
# NOTE(review): this repeats the makeSentimentScore definition from the training
# section; the later definition silently replaces the earlier one.
def makeSentimentScore(x):
    """Map a numeric rating to a three-way sentiment label.

    Returns 1 when ``x`` is at least ``POSITIVE_THRESHOLD``, -1 when ``x`` is
    at most ``NEGATIVE_THRESHOLD``, and 0 otherwise. The positive check is
    evaluated first.
    """
    if x >= POSITIVE_THRESHOLD:
        return 1
    if x <= NEGATIVE_THRESHOLD:
        return -1
    return 0

In [19]:
# Collapse each numeric rating into a sentiment label (1 / 0 / -1).
# Gotcha: 'rating' is overwritten in place, so running this cell a second time
# re-thresholds the labels themselves rather than the original ratings.
df['rating'] = df['rating'].map(makeSentimentScore)

In [20]:
# Discard the columns that are not needed for the exported file.
df = df.drop(columns=['date', 'Unnamed: 0', 'drugName', 'condition', 'usefulCount'])

In [21]:
# Number of test rows labelled 0.
int(df['rating'].eq(0).sum())

9579

In [22]:
# Number of test rows labelled 1.
int(df['rating'].eq(1).sum())

32349

In [23]:
# Number of test rows labelled -1.
int(df['rating'].eq(-1).sum())

11838

In [24]:
# Add the index as a column
#df = df.reset_index()

In [25]:
# df = df.rename(columns={
#     "review":"sentence",
#     "rating":"label"
# })

In [26]:
# Write the labelled test reviews, tab-separated and without header or index
# (the same options used for the train and dev files).
df.to_csv('../datasets/SENT/test.tsv', index=False, header=False, sep='\t')