In [1]:
"""Run this file one time to preprocess the data."""
import sys
if ".." not in sys.path:
    sys.path.insert(0, "..")

import numpy as np
import pandas as pd
import scipy.sparse as sp

from DataHandling.db_reader import Reader
from DataHandling.language_processing import preprocess

from sklearn.feature_extraction.text import TfidfVectorizer



In [2]:
# load data
# Fetch titles, bodies and urls through the project's Reader. The getters run
# inside try/finally so the reader is closed even if one of them raises;
# previously an exception would have skipped reader.close() entirely.
print("Reading data...")
reader = Reader()
try:
    titles = reader.get_titles()
    bodies = reader.get_bodies()
    urls = reader.get_urls()
finally:
    reader.close()
print("Reading data done")

Reading data...
Reading data done


In [3]:
# Combine the three sequences read above into one frame (columns: url, body, title).
df = pd.DataFrame(dict(url=urls, body=bodies, title=titles))

In [4]:
# check for duplicates
# The four counts are printed in this order: fully identical rows, then
# repeated values within the 'body', 'url' and 'title' columns respectively.
duplicate_masks = [df.duplicated()]
duplicate_masks += [df[column].duplicated() for column in ('body', 'url', 'title')]
for duplicate_mask in duplicate_masks:
    print("Number of duplicates:", duplicate_mask.sum())


Number of duplicates: 131
Number of duplicates: 59874
Number of duplicates: 131
Number of duplicates: 89609


In [5]:
# drop rows with duplicates in column body
# Each step rebinds `df` to a new frame instead of using inplace=True, so no
# step mutates an object that an earlier cell may still be showing.
# drop_duplicates keeps the first occurrence of each repeated value by default.
df = df.drop_duplicates(subset=['body'])
print(len(df))

# drop rows with duplicates in column url
df = df.drop_duplicates(subset=['url'])
print(len(df))

# drop rows where body is empty string
df = df[df['body'] != '']
print(len(df))

67768
67768
67767


In [7]:
# Run the project's `preprocess` over every title and body left in the frame.
titles_processed = list(map(preprocess, df['title']))
bodies_processed = list(map(preprocess, df['body']))

# Alternative to recomputing: reload the arrays stored by an earlier run.
# titles_processed = np.load("../../../data/titles_processed.npy", allow_pickle=True)
# bodies_processed = np.load("../../../data/bodies_processed.npy", allow_pickle=True)

In [8]:
# Convert the preprocessed texts to object-dtype arrays so they can be stored
# with np.save.
bodies_processed = np.array(bodies_processed, dtype=object)
titles_processed = np.array(titles_processed, dtype=object)

# A bare `bodies_processed.shape` on its own line is evaluated and discarded
# (only the last expression of a cell is displayed), so the body shape was
# never actually inspected. Check it explicitly against the titles instead.
assert bodies_processed.shape == titles_processed.shape, "titles and bodies are misaligned"
titles_processed.shape

(67767,)

In [9]:
# store as npy-files
np.save("../../../data/titles_processed.npy", titles_processed)
np.save("../../../data/bodies_processed.npy", bodies_processed)
np.save("../../../data/urls.npy", urls)

In [10]:
assert (bodies_processed == np.load("../../../data/bodies_processed.npy", allow_pickle=True)).all()
assert (titles_processed == np.load("../../../data/titles_processed.npy", allow_pickle=True)).all()
assert (urls == np.load("../../../data/urls.npy", allow_pickle=True)).all()

In [11]:
# data embedding 
vectorizer = TfidfVectorizer()
title_embedding = vectorizer.fit_transform(titles_processed)
body_embedding = vectorizer.fit_transform(bodies_processed)

In [15]:
# Show the (rows, features) shape of the title matrix, then of the body matrix.
print(title_embedding.shape, body_embedding.shape, sep="\n")

(67767, 34724)
(67767, 1459964)


scipy.sparse._csr.csr_matrix

In [13]:
# Display the summary repr of the body TF-IDF matrix in CSR form.
body_embedding_csr = sp.csr_matrix(body_embedding)
body_embedding_csr

<67767x1459964 sparse matrix of type '<class 'numpy.float64'>'
	with 23973641 stored elements in Compressed Sparse Row format>

In [14]:
# Write both TF-IDF matrices to disk as .npz files, titles first, then bodies.
for npz_path, embedding in (
    ("../../../data/title_embedding.npz", title_embedding),
    ("../../../data/body_embedding.npz", body_embedding),
):
    sp.save_npz(npz_path, sp.csr_matrix(embedding))