In [1]:
import json
import os

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from tqdm.notebook import tqdm

In [2]:
from html.parser import HTMLParser


class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text content.

    Text chunks reported by the parser are accumulated in ``self.fed`` and
    can be retrieved as one string with :meth:`get_data`.
    """

    def __init__(self):
        # Run the base initializer (which itself calls reset()) instead of
        # calling reset() by hand, so every piece of state HTMLParser sets up
        # in __init__ exists on the instance.
        super().__init__(convert_charrefs=True)
        # Attribute retained from the original snippet; nothing in this
        # class reads it.
        self.strict = False
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text collected so far, concatenated in document order."""
        return "".join(self.fed)


def strip_tags(html):
    """Return the text content of ``html`` with all tags removed.

    Character references are converted to the characters they stand for
    (the parser runs with ``convert_charrefs=True``).

    Args:
        html: Markup to strip, as a string.

    Returns:
        The concatenated text found outside of tags.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text that ends in an unterminated "&..."
    # (it could be a character reference cut in half); close() forces the
    # parser to process everything that is still buffered.
    s.close()
    return s.get_data()

In [3]:
def read_json_line(line=None):
    """Parse one line of JSON, blanking out characters that break the parser.

    Whenever ``json.loads`` rejects the line, the character at the reported
    error position is replaced with a single space and parsing is retried.
    The loop continues until the line parses or no further repair is possible.

    Args:
        line: The JSON text to parse.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: If the error position lies past the end of the
            line, or the offending character is already a space, so that
            replacing it cannot make progress.
        TypeError: Propagated from ``json.loads`` when ``line`` is not a
            string-like object (e.g. ``None``).
    """
    # Iterate instead of recursing: a line with many bad characters needs one
    # attempt per character and would otherwise exhaust the recursion limit.
    while True:
        try:
            return json.loads(line)
        except json.JSONDecodeError as err:
            # err.pos is the index of the offending character; use it rather
            # than parsing the number out of the error message text.
            bad_idx = err.pos
            if bad_idx >= len(line) or line[bad_idx] == " ":
                # Nothing left to blank out: surface the real decode error.
                raise
            line = line[:bad_idx] + " " + line[bad_idx + 1:]

In [4]:
def extract_features_and_write(path_to_data, inp_filename, is_train=True):
    """Read a JSON-lines file and prepare one output text file per feature.

    For each of the features ``content``, ``published``, ``title`` and
    ``author`` a file named ``{prefix}_{feature}.txt`` is opened for writing
    (created or truncated) inside ``path_to_data``, where ``prefix`` is
    ``"train"`` or ``"test"``. The input file is then read line by line and
    each line is parsed with ``read_json_line``. Extracting the features from
    the parsed record and writing them out is still left to be implemented.

    Args:
        path_to_data: Directory holding the input file; the output files are
            written there too.
        inp_filename: Name of the JSON-lines input file inside
            ``path_to_data``.
        is_train: Selects the ``"train"`` (True) or ``"test"`` (False) prefix
            for the output file names.
    """
    features = ["content", "published", "title", "author"]
    prefix = "train" if is_train else "test"

    feature_files = []
    try:
        # Open the outputs one at a time so that, if a later open() fails,
        # the handles opened so far are still closed by the finally clause.
        for feat in features:
            feature_files.append(
                open(
                    os.path.join(path_to_data, "{}_{}.txt".format(prefix, feat)),
                    "w",
                    encoding="utf-8",
                )
            )

        with open(
            os.path.join(path_to_data, inp_filename), encoding="utf-8"
        ) as inp_json_file:

            for line in tqdm(inp_json_file):
                json_data = read_json_line(line)
                # TODO: extract each feature from json_data and write it to
                # the matching handle in feature_files.
    finally:
        # Always close (and thereby flush) the per-feature output files.
        for feature_file in feature_files:
            feature_file.close()


# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [5]:
# Base directory of the assignment's data files; the cells below join it with
# individual file names (targets, sample submission, generated submissions).
PATH_TO_DATA = "../../_static/data/assignment6/"  # modify this if you need to

In [6]:
# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [7]:
# Placeholder feature matrices: replace each one with the real features.
# They are built from a shape only, which gives an all-zero sparse matrix
# (float64, no stored entries). This keeps the placeholders deterministic and
# avoids allocating a dense array of uninitialised memory just to wrap it.
X_train_content_sparse = csr_matrix((10, 100000))  # change this
X_train_title_sparse = csr_matrix((10, 100000))  # change this
X_train_author_sparse = csr_matrix((10, 100000))  # change this
X_train_time_features_sparse = csr_matrix((10, 5))  # change this

X_test_content_sparse = csr_matrix((5, 100000))  # change this
X_test_title_sparse = csr_matrix((5, 100000))  # change this
X_test_author_sparse = csr_matrix((5, 100000))  # change this
X_test_time_features_sparse = csr_matrix((5, 5))  # change this

In [8]:
# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [9]:
# Join the four training feature blocks column-wise (content, title, author,
# time features) into one matrix and convert it to CSR; a later cell slices
# this matrix by rows for the train/validation split.
X_train_sparse = hstack(
    [
        X_train_content_sparse,
        X_train_title_sparse,
        X_train_author_sparse,
        X_train_time_features_sparse,
    ]
).tocsr()

In [10]:
# Join the four test feature blocks column-wise, in the same order as the
# training blocks (content, title, author, time features), and convert to CSR.
X_test_sparse = hstack(
    [
        X_test_content_sparse,
        X_test_title_sparse,
        X_test_author_sparse,
        X_test_time_features_sparse,
    ]
).tocsr()

In [11]:
# Load the training targets from PATH_TO_DATA, using the "id" column as the
# index, and take the "log_recommends" column as a plain array (y_train).
train_target = pd.read_csv(
    os.path.join(PATH_TO_DATA, "train_log1p_recommends.csv"), index_col="id"
)
y_train = train_target["log_recommends"].values

In [12]:
# Positional hold-out split (no shuffling): the first 70% of the rows become
# the training part and the remaining rows the validation part.
train_part_size = int(0.7 * len(train_target))
X_train_part_sparse, X_valid_sparse = (
    X_train_sparse[:train_part_size],
    X_train_sparse[train_part_size:],
)
y_train_part, y_valid = y_train[:train_part_size], y_train[train_part_size:]

In [13]:
# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [14]:
# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [15]:
# Placeholder predictions: zeros rather than uninitialised memory, so that
# whatever is written to a submission file from this is deterministic.
ridge_test_pred = np.zeros([34645, 1])  # change this

In [16]:
def write_submission_file(
    prediction,
    filename,
    path_to_sample=None,
):
    """Write predictions to a CSV file laid out like the sample submission.

    The sample submission is read with its ``id`` column as the index, its
    ``log_recommends`` column is overwritten with ``prediction``, and the
    result is saved to ``filename``.

    Args:
        prediction: Values to store in the ``log_recommends`` column.
        filename: Path of the CSV file to write.
        path_to_sample: Path of the sample submission CSV. When omitted,
            ``sample_submission.csv`` inside ``PATH_TO_DATA`` is used.
    """
    if path_to_sample is None:
        # Resolve the default at call time so that a change to PATH_TO_DATA
        # takes effect without having to re-run this definition.
        path_to_sample = os.path.join(PATH_TO_DATA, "sample_submission.csv")

    submission = pd.read_csv(path_to_sample, index_col="id")

    submission["log_recommends"] = prediction
    submission.to_csv(filename)

In [17]:
# Save the Ridge test predictions as the baseline submission file.
write_submission_file(
    prediction=ridge_test_pred,
    filename=os.path.join(PATH_TO_DATA, "assignment6_medium_submission.csv"),
)

In [18]:
# Save an all-zeros submission with the same shape as the Ridge predictions.
write_submission_file(
    prediction=np.zeros_like(ridge_test_pred),
    filename=os.path.join(PATH_TO_DATA, "medium_all_zeros_submission.csv"),
)

In [19]:
# Start from a copy rather than an alias, so that adjusting the modified
# predictions in place cannot also change ridge_test_pred.
ridge_test_pred_modif = ridge_test_pred.copy()
# Your code here (read-only in a JupyterBook; please run jupyter-notebook to edit)

In [20]:
# Save the adjusted predictions as a separate submission file.
write_submission_file(
    prediction=ridge_test_pred_modif,
    filename=os.path.join(
        PATH_TO_DATA, "assignment6_medium_submission_with_hack.csv"
    ),
)