# This script creates the different datasets that will be run on the models to explore the implications of duplicates

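1. Take the full original train and test sets as they are, clean them, and export them (used to build the baseline model).
2. Concatenate the original train and test sets, drop duplicate reviews, then re-split into train and test.
3. Take the de-duplicated data from step 2, add the scraped data, drop duplicate reviews again, then re-split into train and test.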


In [36]:
from google.colab import drive
drive.mount('/content/drive')
import os
import pandas as pd
from sklearn.model_selection import train_test_split

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
def create_cols(df):
    """
    Fill in missing conditions and add a binned 'rating_category' column.

    takes pandas df with 'condition' and 'rating' columns
    returns a new pandas df in which
      * missing 'condition' values are replaced by 'Not Entered'
      * 'rating_category' is 'Negative' for rating < 4, 'Neutral' for
        4 <= rating < 7 and 'Positive' otherwise (a missing rating also
        ends up 'Positive', because comparisons with NaN are False)

    The input frame is left untouched: all changes are made on a copy, so
    passing a subset of another frame neither alters that frame nor triggers
    pandas' SettingWithCopyWarning.
    """
    out = df.copy()

    # code NAs as a "Not Entered" category
    out.loc[out['condition'].isna(), 'condition'] = 'Not Entered'

    # creates ratings category by binning ratings; the rules run from the
    # widest threshold to the narrowest so that "< 4" overrides "< 7"
    out['rating_category'] = 'Positive'
    out.loc[out['rating'] < 7, 'rating_category'] = 'Neutral'
    out.loc[out['rating'] < 4, 'rating_category'] = 'Negative'

    return out

def replace_html(df):
    """
    Replace the HTML apostrophe entity "&#039;" in the 'review' column with "'".

    takes pandas df with a string 'review' column
    returns a new pandas df; the input frame is not modified
    """
    out = df.copy()
    # regex=False: the entity is a literal substring, not a pattern. Being
    # explicit keeps the result independent of the pandas default for
    # ``regex``, which has differed between pandas versions.
    out['review'] = out['review'].str.replace("&#039;", "'", regex=False)
    return out

In [38]:
# Folder that holds the raw inputs and receives the generated datasets.
PATH = './drive/MyDrive/drugproject/3_data'

# Raw inputs: the original train and test files plus the scraped reviews.
TRAIN, TEST, SCRAPE = (
    os.path.join(PATH, raw_name)
    for raw_name in (
        'drugsComTrain_raw.tsv',
        'drugsComTest_raw.tsv',
        'drugComScrapedData.tsv',
    )
)

In [39]:
def clean_export(file, outfile, tsv=True):
    """
    Read a raw data file, clean it and write it as CSV below PATH.

    file: path of the raw input file
    outfile: output location, relative to PATH
    tsv: if True the input is read as tab-separated, otherwise as
        comma-separated

    Inputs whose path does not contain "Scraped" lose their first column
    (presumably the saved row index of the raw files -- confirm against the
    data). The frame is then passed through replace_html and create_cols and
    written without its index.
    """
    df = pd.read_csv(file, sep='\t' if tsv else ',')
    if "Scraped" not in file:
        df = df[list(df.columns[1:])]
    df = create_cols(replace_html(df))

    out = os.path.join(PATH, outfile)
    # outfile may point into a sub-folder that does not exist yet, and
    # to_csv does not create missing directories
    out_dir = os.path.dirname(out)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_csv(out, index=False)

\#1

In [40]:
# Step 1: clean the original train and test files and export them unchanged
# otherwise (duplicates are kept).
for raw_path, out_name in (
    (TRAIN, 'duplicate_exploration/train_original.csv'),
    (TEST, 'duplicate_exploration/test_original.csv'),
):
    clean_export(raw_path, out_name)

\#2

In [41]:
# Step 2: pool the original train and test files and keep only the first
# occurrence of every review text.
raw_frames = [pd.read_csv(raw_path, sep='\t') for raw_path in (TRAIN, TEST)]
all_df = pd.concat(raw_frames).drop_duplicates('review')

In [42]:
# Fixed seed so that re-running the notebook regenerates the same train/test
# files; without it every run produces a different random split.
train, test = train_test_split(all_df, random_state=42)

In [43]:
# Keep every column except the first one (presumably the saved row index of
# the raw files -- confirm against the data).
train, test = (part[part.columns[1:]] for part in (train, test))

In [44]:
# Replace the HTML apostrophes first, then add the derived columns.
train = replace_html(train)
train = create_cols(train)

test = replace_html(test)
test = create_cols(test)

In [45]:
# Write the de-duplicated original split (without the index column).
train_clean_out = os.path.join(PATH, 'duplicate_exploration/train_original_clean.csv')
test_clean_out = os.path.join(PATH, 'duplicate_exploration/test_original_clean.csv')

train.to_csv(train_clean_out, index=False)
test.to_csv(test_clean_out, index=False)

\#3

In [46]:
# Step 3: add the scraped reviews to the de-duplicated originals and drop
# duplicate review texts again.
# NOTE(review): duplicates are detected on the raw text, before replace_html
# runs -- confirm that scraped and original reviews encode apostrophes the
# same way, otherwise identical reviews will not match.
scraped_df = pd.read_csv(SCRAPE, sep='\t')
all_df = pd.concat([all_df, scraped_df]).drop_duplicates('review')

In [47]:
# Fixed seed so that re-running the notebook regenerates the same train/test
# files; without it every run produces a different random split.
train_all, test_all = train_test_split(all_df, random_state=42)

In [48]:
# Keep every column except the first one (presumably the saved row index
# carried over from the raw original files -- confirm against the data).
kept_train_cols = train_all.columns[1:]
kept_test_cols = test_all.columns[1:]
train_all = train_all[kept_train_cols]
test_all = test_all[kept_test_cols]

In [49]:
# Replace the HTML apostrophes first, then add the derived columns.
train_all, test_all = (
    create_cols(replace_html(part)) for part in (train_all, test_all)
)

In [50]:
# Write the de-duplicated original + scraped split (without the index column).
for frame, out_name in (
    (train_all, 'duplicate_exploration/train_enhanced_clean.csv'),
    (test_all, 'duplicate_exploration/test_enhanced_clean.csv'),
):
    frame.to_csv(os.path.join(PATH, out_name), index=False)