In [1]:
"""Run this file one time to preprocess the data."""
import sys

if ".." not in sys.path:
    sys.path.insert(0, "..")

import numpy as np
import pandas as pd
import scipy.sparse as sp

from DataHandling.db_reader import Reader
from DataHandling.language_processing import preprocess

In [2]:
# load data
print("Reading data...")
reader = Reader()
titles = reader.get_titles()
bodies = reader.get_bodies()
urls = reader.get_urls()
reader.close()
print("Reading data done")

Reading data...
Reading data done


In [3]:
df = pd.DataFrame({"url": urls, "body": bodies, "title": titles})
df.shape

(127642, 3)

In [4]:
# check for duplicates
print("Number of duplicates:", df.duplicated().sum())
print("Number of duplicates:", df["body"].duplicated().sum())
print("Number of duplicates:", df["url"].duplicated().sum())
print("Number of duplicates:", df["title"].duplicated().sum())

Number of duplicates: 131
Number of duplicates: 59874
Number of duplicates: 131
Number of duplicates: 89609


In [5]:
# drop rows where tübingen is not contained
df = df[
    df["body"].str.contains("tübingen", case=False)
    | df["title"].str.contains("tübingen", case=False)
    | df["url"].str.contains("tübingen", case=False)
]
print(len(df))

100259


In [6]:
# drop rows with duplicates in column body
df.drop_duplicates(subset=["body"], inplace=True)
print(len(df))

# drop rows with duplicates in column url
df.drop_duplicates(subset=["url"], inplace=True)

# drop rows where body is empty string
df = df[df["body"] != ""]

# drop rows where title is empty string
df = df[df["title"] != ""]

# drop rows specific to tripadvisor user profiles
df = df[~df["url"].str.contains("UserReview")]
df = df[~df["url"].str.contains("tripadvisor.com/Profile")]
print(len(df))

60825
55550


In [7]:
np.save("../../../data/bodies.npy", df["body"].values)
np.save("../../../data/titles.npy", df["title"].values)
np.save("../../../data/urls.npy", df["url"].values)

In [8]:
# # preprocess data
# titles_processed = [preprocess(title) for title in df['title']]
# bodies_processed = [preprocess(body) for body in df['body']]

# load preprocessed data
titles_processed = np.load("../../../data/titles_processed.npy", allow_pickle=True)
bodies_processed = np.load("../../../data/bodies_processed.npy", allow_pickle=True)

In [9]:
bodies_processed = np.array(bodies_processed, dtype=object)
titles_processed = np.array(titles_processed, dtype=object)
bodies_processed.shape
titles_processed.shape

(67767,)

In [10]:
# store as npy-files
np.save("../../../data/titles_processed.npy", titles_processed)
np.save("../../../data/bodies_processed.npy", bodies_processed)
np.save("../../../data/urls.npy", df["url"])

In [11]:
assert (
    bodies_processed == np.load("../../../data/bodies_processed.npy", allow_pickle=True)
).all()
assert (
    titles_processed == np.load("../../../data/titles_processed.npy", allow_pickle=True)
).all()
assert (df["url"] == np.load("../../../data/urls.npy", allow_pickle=True)).all()