In [1]:
import os
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from datetime import datetime

# Fetch the NLTK resources used for stopword filtering and tokenization.
# Newer NLTK releases (3.8.2+) make word_tokenize load 'punkt_tab', while
# older releases look for 'punkt'; requesting both keeps a fresh
# "Restart & Run All" working on either.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Base directories, relative to the notebook's working directory.
INPUT_DATA_DIR = "../data/input"
OUTPUT_DATA_DIR = "../data/output"

# Per-source review files, relative to INPUT_DATA_DIR.
GP_DATA_NAME = "gplay/gplay_reviews.csv"
AMAZON_DATA_NAME = "amazon/amazon_reviews.csv"

GP_INPUT_DATA = os.path.join(INPUT_DATA_DIR, GP_DATA_NAME)
AMAZON_INPUT_DATA = os.path.join(INPUT_DATA_DIR, AMAZON_DATA_NAME)

# The output file name is prefixed with today's date (yyyymmdd), so runs on
# different days write to different files.
current_date = datetime.today().strftime('%Y%m%d')
INTERMEDIARY_CSV = os.path.join(OUTPUT_DATA_DIR, current_date + "_merged_preprocessed.csv")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/beerphilipp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/beerphilipp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Merge the Google Play Store and Amazon datasets

In [2]:
# Google Play reviews: keep only the review body and rating, renamed onto the
# shared (text, rating) schema and tagged with their origin.
gp = (
    pd.read_csv(GP_INPUT_DATA, usecols=['CleanedStop_review_body1', 'current_rating'])
    .rename(columns={'CleanedStop_review_body1': 'text', 'current_rating': 'rating'})
    .assign(source='GP')
)

# Amazon reviews: same treatment, different source column names.
am = (
    pd.read_csv(AMAZON_INPUT_DATA, usecols=['Review_text', 'Rating'])
    .rename(columns={'Review_text': 'text', 'Rating': 'rating'})
    .assign(source='Amazon')
)

# Stack both sources into one frame with a fresh 0..n-1 index.
merged = pd.concat([gp, am], ignore_index=True)

# Filter stopwords using `nltk`

In [3]:
# English stopwords from NLTK, held in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

def clean_text(row):
    """Tokenize a review's text and drop English stopwords.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a 'text' entry.

    Returns
    -------
    list of str
        Tokens produced by ``word_tokenize`` whose lowercased form is not in
        ``stop_words``; the original casing of kept tokens is preserved.
        An empty list is returned when the text is missing.
    """
    text = row['text']
    # A missing review is NaN; str(NaN) would otherwise be tokenized into the
    # bogus token "nan" and pollute the cleaned output.
    if pd.isna(text):
        return []
    tokens = word_tokenize(str(text))
    return [w for w in tokens if w.lower() not in stop_words]

# Row-wise apply: each row yields its list of filtered tokens.
merged['cleaned_text'] = merged.apply(clean_text, axis=1)

# Save the result to `data/output/[yyyymmdd]_merged_preprocessed.csv`

In [4]:
# Make sure the target directory exists; to_csv does not create missing
# parent directories and would fail on a fresh checkout.
os.makedirs(os.path.dirname(INTERMEDIARY_CSV), exist_ok=True)

# NOTE: 'cleaned_text' holds Python lists, which CSV stores as their string
# representation; they need to be parsed back when the file is reloaded.
merged.to_csv(INTERMEDIARY_CSV, index=False)