In [1]:
import logging
import os

logging.basicConfig(level=logging.INFO)


if os.path.basename(os.getcwd()) == "snorkel-tutorials":
    os.chdir("recsys")


# %% [markdown]
# ## Loading Data

# %% [markdown]
# We start by running the `download_and_process_data` function.
# The function returns the `df_train`, `df_test`, `df_dev`, `df_valid` dataframes, which correspond to our training, test, development, and validation sets.
# Each of those dataframes has the following fields:
# * `user_idx`: A unique identifier for a user.
# * `book_idx`: A unique identifier for a book that is being rated by the user.
# * `book_idxs`: The set of books that the user has interacted with (read or planned to read).
# * `review_text`: Optional text review written by the user for the book.
# * `rating`: Either `0` (which means the user did not read or did not like the book) or `1` (which means the user read and liked the book). The `rating` field is missing for `df_train`.
# Our objective is to predict whether a given user (represented by the set of book_idxs the user has interacted with) will read and like any given book.
# That is, we want to train a model that takes a set of `book_idxs` (the user) and a single `book_idx` (the book to rate) and predicts the `rating`.
#
# In addition, `download_and_process_data` also returns the `df_books` dataframe, which contains one row per book, along with metadata for that book (such as `title` and `first_author`).

# %% {"tags": ["md-exclude-output"]}
from utils_rec import download_and_process_data

(df_train, df_test, df_dev, df_valid), df_books = download_and_process_data()

df_books.head()

# %% [markdown]
# We look at a sample of the labeled development set.
# As an example, we want our final recommendations model to be able to predict that a user who has interacted with `book_idxs` (25743, 22318, 7662, 6857, 83, 14495, 30664, ...) would either not read or not like the book with `book_idx` 22764 (first row), while a user who has interacted with `book_idxs` (3880, 18078, 9092, 29933, 1511, 8560, ...) would read and like the book with `book_idx` 3181 (second row).

# %%
df_dev.sample(frac=1, random_state=12).head()

# %% [markdown]
# ## Writing Labeling Functions

# %%
POSITIVE = 1
NEGATIVE = 0
ABSTAIN = -1

# %% [markdown]
# If a user has interacted with several books written by an author, there is a good chance that the user will read and like other books by the same author.
# We express this as a labeling function, using the `first_author` field in the `df_books` dataframe.
# We picked the threshold 15 by plotting histograms and running error analysis using the dev set.

# %%
from snorkel.labeling.lf import labeling_function

book_to_first_author = dict(zip(df_books.book_idx, df_books.first_author))
first_author_to_books_df = df_books.groupby("first_author")[["book_idx"]].agg(set)
first_author_to_books = dict(
    zip(first_author_to_books_df.index, first_author_to_books_df.book_idx)
)


@labeling_function(
    resources=dict(
        book_to_first_author=book_to_first_author,
        first_author_to_books=first_author_to_books,
    )
)
def shared_first_author(x, book_to_first_author, first_author_to_books):
    author = book_to_first_author[x.book_idx]
    same_author_books = first_author_to_books[author]
    num_read = len(set(x.book_idxs).intersection(same_author_books))
    return POSITIVE if num_read > 15 else ABSTAIN


# %% [markdown]
# We can also leverage the long text reviews written by users to guess whether they liked or disliked a book.
# For example, the third `df_dev` entry above has a review with the text `'4.5 STARS'`, which indicates that the user liked the book.
# We write a simple LF that looks for similar phrases to guess the user's rating of a book.
# We interpret >= 4 stars to indicate a positive rating, while < 4 stars is negative.

# %%
low_rating_strs = [
    "one star",
    "1 star",
    "two star",
    "2 star",
    "3 star",
    "three star",
    "3.5 star",
    "2.5 star",
    "1 out of 5",
    "2 out of 5",
    "3 out of 5",
]
high_rating_strs = ["5 stars", "five stars", "four stars", "4 stars", "4.5 stars"]

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
I1122 13:26:47.298351 139956583294784 utils_rec.py:213] Downloading raw data
I1122 13:26:47.299415 139956583294784 utils_rec.py:217] Processing book data
I1122 13:27:02.716694 139956583294784 utils_rec.py:219] Processing interaction data
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in t

In [2]:
@labeling_function(
    resources=dict(low_rating_strs=low_rating_strs)
)
def low_stars_in_review(x, low_rating_strs):
    if not isinstance(x.review_text, str):
        return ABSTAIN
    for low_rating_str in low_rating_strs:
        if low_rating_str in x.review_text.lower():
            return NEGATIVE
    return ABSTAIN

@labeling_function(
    resources=dict( high_rating_strs=high_rating_strs)
)
def high_stars_in_review(x, high_rating_strs):
    if not isinstance(x.review_text, str):
        return ABSTAIN
    for high_rating_str in high_rating_strs:
        if high_rating_str in x.review_text.lower():
            return POSITIVE
    return ABSTAIN


# %% [markdown]
# We can also run [TextBlob](https://textblob.readthedocs.io/en/dev/index.html), a tool that provides a pretrained sentiment analyzer, on the reviews, and use its polarity and subjectivity scores to estimate the user's rating for the book.
# As usual, these thresholds were picked by analyzing the score distributions and running error analysis.

# %%
from snorkel.preprocess import preprocessor
from textblob import TextBlob


@preprocessor(memoize=True)
def textblob_polarity(x):
    if isinstance(x.review_text, str):
        x.blob = TextBlob(x.review_text)
    else:
        x.blob = None
    return x


# Label high polarity reviews as positive.
@labeling_function(pre=[textblob_polarity])
def polarity_positive(x):
    if x.blob:
        if x.blob.polarity > 0.3:
            return POSITIVE
    return ABSTAIN


# Label high subjectivity reviews as positive.
@labeling_function(pre=[textblob_polarity])
def subjectivity_positive(x):
    if x.blob:
        if x.blob.subjectivity > 0.75:
            return POSITIVE
    return ABSTAIN


# Label low polarity reviews as negative.
@labeling_function(pre=[textblob_polarity])
def polarity_negative(x):
    if x.blob:
        if x.blob.polarity < 0.0:
            return NEGATIVE
    return ABSTAIN


# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling import PandasLFApplier, LFAnalysis

lfs = [
    low_stars_in_review,
    high_stars_in_review,
    shared_first_author,
    polarity_positive,
    subjectivity_positive,
    polarity_negative,
]

applier = PandasLFApplier(lfs)
#L_dev = applier.apply(df_dev)
L_train = applier.apply(df=df_train)
L_dev = applier.apply(df=df_dev)
L_valid = applier.apply(df=df_valid)
L_test = applier.apply(df=df_test)

100%|██████████| 795499/795499 [13:51<00:00, 957.10it/s]  
100%|██████████| 7849/7849 [00:07<00:00, 1020.67it/s]
100%|██████████| 7819/7819 [00:07<00:00, 1035.92it/s]
100%|██████████| 44560/44560 [00:52<00:00, 856.92it/s] 


In [4]:
L_train.shape, L_train.shape, L_dev.shape, L_valid.shape, L_test.shape

((795499, 6), (795499, 6), (7849, 6), (7819, 6), (44560, 6))

In [8]:
import numpy as np
np.where(L_dev==1)

(array([ 109,  111,  111, ..., 7846, 7847, 7848]),
 array([4, 1, 3, ..., 2, 2, 2]))

In [9]:
L_dev[109]

array([-1, -1, -1, -1,  1, -1])

In [23]:
# %% [markdown]
# ### Applying labeling functions to the training set
#
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel

In [24]:
# L_train = applier.apply(df_train)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01)
preds_train = label_model.predict(L_train)

from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    df_train, preds_train, L_train
)
df_train_filtered["rating"] = preds_train_filtered
L_train_filtered = applier.apply(df_train_filtered)

I0901 00:43:16.650498 139997562042176 label_model.py:852] Computing O...
I0901 00:43:16.731963 139997562042176 label_model.py:858] Estimating \mu...
I0901 00:43:16.735948 139997562042176 logger.py:79] [0 epochs]: TRAIN:[loss=0.002]
I0901 00:43:16.769658 139997562042176 logger.py:79] [20 epochs]: TRAIN:[loss=0.000]
I0901 00:43:16.810812 139997562042176 logger.py:79] [40 epochs]: TRAIN:[loss=0.000]
I0901 00:43:16.853079 139997562042176 logger.py:79] [60 epochs]: TRAIN:[loss=0.000]
I0901 00:43:16.895682 139997562042176 logger.py:79] [80 epochs]: TRAIN:[loss=0.000]
I0901 00:43:16.938658 139997562042176 logger.py:79] [100 epochs]: TRAIN:[loss=0.000]
I0901 00:43:16.981927 139997562042176 logger.py:79] [120 epochs]: TRAIN:[loss=0.000]
I0901 00:43:17.025036 139997562042176 logger.py:79] [140 epochs]: TRAIN:[loss=0.000]
I0901 00:43:17.071420 139997562042176 logger.py:79] [160 epochs]: TRAIN:[loss=0.000]
I0901 00:43:17.114715 139997562042176 logger.py:79] [180 epochs]: TRAIN:[loss=0.000]
I0901 0

I0901 00:43:20.742369 139997562042176 logger.py:79] [1900 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.782642 139997562042176 logger.py:79] [1920 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.823032 139997562042176 logger.py:79] [1940 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.863444 139997562042176 logger.py:79] [1960 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.903653 139997562042176 logger.py:79] [1980 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.947899 139997562042176 logger.py:79] [2000 epochs]: TRAIN:[loss=0.000]
I0901 00:43:20.988172 139997562042176 logger.py:79] [2020 epochs]: TRAIN:[loss=0.000]
I0901 00:43:21.027753 139997562042176 logger.py:79] [2040 epochs]: TRAIN:[loss=0.000]
I0901 00:43:21.066870 139997562042176 logger.py:79] [2060 epochs]: TRAIN:[loss=0.000]
I0901 00:43:21.108462 139997562042176 logger.py:79] [2080 epochs]: TRAIN:[loss=0.000]
I0901 00:43:21.154072 139997562042176 logger.py:79] [2100 epochs]: TRAIN:[loss=0.000]
I0901 00:43:21.195968 139997562042176 logger.py:79] [2

I0901 00:43:24.721550 139997562042176 logger.py:79] [3820 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.760781 139997562042176 logger.py:79] [3840 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.801198 139997562042176 logger.py:79] [3860 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.840533 139997562042176 logger.py:79] [3880 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.879601 139997562042176 logger.py:79] [3900 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.917551 139997562042176 logger.py:79] [3920 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.957903 139997562042176 logger.py:79] [3940 epochs]: TRAIN:[loss=0.000]
I0901 00:43:24.997486 139997562042176 logger.py:79] [3960 epochs]: TRAIN:[loss=0.000]
I0901 00:43:25.036818 139997562042176 logger.py:79] [3980 epochs]: TRAIN:[loss=0.000]
I0901 00:43:25.078430 139997562042176 logger.py:79] [4000 epochs]: TRAIN:[loss=0.000]
I0901 00:43:25.120639 139997562042176 logger.py:79] [4020 epochs]: TRAIN:[loss=0.000]
I0901 00:43:25.160566 139997562042176 logger.py:79] [4

 11%|█         | 11668/105151 [00:22<02:48, 555.77it/s][A
 11%|█         | 11728/105151 [00:22<02:55, 533.67it/s][A
 11%|█         | 11785/105151 [00:22<03:05, 503.24it/s][A
 11%|█▏        | 11838/105151 [00:22<03:07, 498.07it/s][A
 11%|█▏        | 11890/105151 [00:23<03:33, 437.70it/s][A
 11%|█▏        | 11946/105151 [00:23<03:19, 467.83it/s][A
 11%|█▏        | 12004/105151 [00:23<03:09, 491.54it/s][A
 11%|█▏        | 12056/105151 [00:23<03:15, 476.06it/s][A
 12%|█▏        | 12106/105151 [00:23<03:16, 473.99it/s][A
 12%|█▏        | 12156/105151 [00:23<03:13, 479.74it/s][A
 12%|█▏        | 12214/105151 [00:23<03:05, 502.13it/s][A
 12%|█▏        | 12294/105151 [00:23<02:44, 564.34it/s][A
 12%|█▏        | 12390/105151 [00:23<02:24, 642.39it/s][A
 12%|█▏        | 12477/105151 [00:24<02:13, 693.55it/s][A
 12%|█▏        | 12552/105151 [00:24<02:29, 617.36it/s][A
 12%|█▏        | 12624/105151 [00:24<02:24, 642.06it/s][A
 12%|█▏        | 12695/105151 [00:24<02:20, 657.17it/s]

 28%|██▊       | 28956/105151 [00:56<02:50, 445.63it/s][A
 28%|██▊       | 29003/105151 [00:56<03:04, 412.85it/s][A
 28%|██▊       | 29083/105151 [00:56<02:37, 482.60it/s][A
 28%|██▊       | 29138/105151 [00:56<03:02, 415.96it/s][A
 28%|██▊       | 29188/105151 [00:56<02:54, 434.44it/s][A
 28%|██▊       | 29236/105151 [00:56<03:03, 412.72it/s][A
 28%|██▊       | 29308/105151 [00:56<02:41, 469.81it/s][A
 28%|██▊       | 29360/105151 [00:57<02:55, 431.21it/s][A
 28%|██▊       | 29408/105151 [00:57<03:00, 419.53it/s][A
 28%|██▊       | 29457/105151 [00:57<02:53, 436.83it/s][A
 28%|██▊       | 29507/105151 [00:57<02:46, 453.90it/s][A
 28%|██▊       | 29555/105151 [00:57<02:58, 422.40it/s][A
 28%|██▊       | 29599/105151 [00:57<02:57, 425.23it/s][A
 28%|██▊       | 29645/105151 [00:57<02:54, 432.64it/s][A
 28%|██▊       | 29698/105151 [00:57<02:45, 456.60it/s][A
 28%|██▊       | 29813/105151 [00:57<02:15, 557.06it/s][A
 28%|██▊       | 29880/105151 [00:58<02:10, 575.31it/s]

 45%|████▍     | 47069/105151 [01:30<01:46, 544.02it/s][A
 45%|████▍     | 47129/105151 [01:30<01:55, 502.01it/s][A
 45%|████▍     | 47184/105151 [01:30<01:55, 503.96it/s][A
 45%|████▍     | 47238/105151 [01:30<01:58, 488.22it/s][A
 45%|████▍     | 47290/105151 [01:30<02:11, 438.84it/s][A
 45%|████▌     | 47337/105151 [01:30<02:11, 439.09it/s][A
 45%|████▌     | 47390/105151 [01:31<02:04, 462.73it/s][A
 45%|████▌     | 47443/105151 [01:31<02:01, 476.45it/s][A
 45%|████▌     | 47492/105151 [01:31<02:06, 456.55it/s][A
 45%|████▌     | 47543/105151 [01:31<02:02, 470.51it/s][A
 45%|████▌     | 47595/105151 [01:31<01:58, 484.17it/s][A
 45%|████▌     | 47645/105151 [01:31<02:03, 465.94it/s][A
 45%|████▌     | 47693/105151 [01:31<02:04, 461.43it/s][A
 45%|████▌     | 47757/105151 [01:31<01:54, 503.44it/s][A
 45%|████▌     | 47843/105151 [01:31<01:40, 572.15it/s][A
 46%|████▌     | 47905/105151 [01:31<01:38, 583.21it/s][A
 46%|████▌     | 47967/105151 [01:32<01:51, 511.79it/s]

 60%|██████    | 63394/105151 [02:04<01:33, 448.50it/s][A
 60%|██████    | 63448/105151 [02:04<01:28, 471.09it/s][A
 60%|██████    | 63517/105151 [02:04<01:20, 516.25it/s][A
 60%|██████    | 63574/105151 [02:04<01:21, 511.69it/s][A
 61%|██████    | 63630/105151 [02:04<01:19, 524.19it/s][A
 61%|██████    | 63685/105151 [02:04<01:32, 446.13it/s][A
 61%|██████    | 63739/105151 [02:05<01:28, 469.32it/s][A
 61%|██████    | 63789/105151 [02:05<01:38, 419.23it/s][A
 61%|██████    | 63848/105151 [02:05<01:30, 457.74it/s][A
 61%|██████    | 63918/105151 [02:05<01:21, 508.22it/s][A
 61%|██████    | 63973/105151 [02:05<01:26, 477.47it/s][A
 61%|██████    | 64024/105151 [02:05<01:25, 483.84it/s][A
 61%|██████    | 64075/105151 [02:05<01:36, 423.91it/s][A
 61%|██████    | 64141/105151 [02:05<01:26, 472.92it/s][A
 61%|██████    | 64211/105151 [02:06<01:18, 522.97it/s][A
 61%|██████    | 64268/105151 [02:06<01:22, 496.23it/s][A
 61%|██████    | 64321/105151 [02:06<01:24, 485.07it/s]

 78%|███████▊  | 81526/105151 [02:46<00:36, 642.90it/s][A
 78%|███████▊  | 81595/105151 [02:47<00:38, 611.83it/s][A
 78%|███████▊  | 81660/105151 [02:47<00:41, 565.18it/s][A
 78%|███████▊  | 81720/105151 [02:47<00:45, 514.26it/s][A
 78%|███████▊  | 81788/105151 [02:47<00:42, 554.26it/s][A
 78%|███████▊  | 81862/105151 [02:47<00:38, 598.52it/s][A
 78%|███████▊  | 81942/105151 [02:47<00:35, 646.65it/s][A
 78%|███████▊  | 82010/105151 [02:47<00:36, 626.71it/s][A
 78%|███████▊  | 82076/105151 [02:47<00:40, 574.99it/s][A
 78%|███████▊  | 82137/105151 [02:48<00:40, 572.89it/s][A
 78%|███████▊  | 82197/105151 [02:48<00:39, 575.90it/s][A
 78%|███████▊  | 82312/105151 [02:48<00:33, 677.29it/s][A
 78%|███████▊  | 82388/105151 [02:48<00:43, 524.09it/s][A
 78%|███████▊  | 82452/105151 [02:48<00:57, 393.77it/s][A
 78%|███████▊  | 82504/105151 [02:48<01:10, 322.46it/s][A
 79%|███████▊  | 82585/105151 [02:49<00:57, 393.46it/s][A
 79%|███████▊  | 82650/105151 [02:49<00:50, 445.32it/s]

 95%|█████████▍| 99499/105151 [03:21<00:10, 547.31it/s][A
 95%|█████████▍| 99574/105151 [03:21<00:09, 592.11it/s][A
 95%|█████████▍| 99638/105151 [03:21<00:09, 559.19it/s][A
 95%|█████████▍| 99704/105151 [03:21<00:09, 583.58it/s][A
 95%|█████████▍| 99766/105151 [03:21<00:10, 534.24it/s][A
 95%|█████████▍| 99838/105151 [03:21<00:09, 575.22it/s][A
 95%|█████████▌| 99899/105151 [03:21<00:10, 522.66it/s][A
 95%|█████████▌| 99976/105151 [03:21<00:09, 572.07it/s][A
 95%|█████████▌| 100037/105151 [03:22<00:13, 384.17it/s][A
 95%|█████████▌| 100090/105151 [03:22<00:12, 417.80it/s][A
 95%|█████████▌| 100176/105151 [03:22<00:10, 493.88it/s][A
 95%|█████████▌| 100239/105151 [03:22<00:09, 527.46it/s][A
 95%|█████████▌| 100301/105151 [03:22<00:08, 547.11it/s][A
 95%|█████████▌| 100366/105151 [03:22<00:08, 574.31it/s][A
 96%|█████████▌| 100429/105151 [03:22<00:08, 567.48it/s][A
 96%|█████████▌| 100490/105151 [03:22<00:08, 564.41it/s][A
 96%|█████████▌| 100549/105151 [03:23<00:08, 561

In [2]:
path="./Data/rec"
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [None]:
df_`

In [30]:
# df_train_filtered.review_text = df_train_filtered.review_text.fillna('no text')
# df_dev.review_text = df_dev.review_text.fillna('no text')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [31]:
# df_valid.review_text = df_valid.review_text.fillna('no text')
# df_test.review_text = df_test.review_text.fillna('no text')

In [33]:
vectorizer.transform(df_train_filtered[0:10].book_idxs)
vectorizer = CountVectorizer()#ngram_range=(1, 2),max_features=10000)
# vectorizer = DictVectorizer()
X_train = vectorizer.fit_transform(df_train_filtered.book_idxs)
X_dev = vectorizer.transform(df_dev.book_idxs).toarray()
X_valid = vectorizer.transform(df_valid.book_idxs.tolist())
X_test = vectorizer.transform(df_test.book_idxs.tolist())

In [12]:
#============ our changes ==================#
def lsnork_to_l_m(lsnork, num_classes):
	m = 1 - np.equal(lsnork,-1).astype(int)
	l = m*lsnork + (1-m)*num_classes
	return l,m

In [27]:
def get_features(df):
    t = df.book_idxs.values
    u = 200#[len(i) for i in t]
    v = [np.pad(i,(0,u-len(i)),'constant') for i in t]
    return np.asarray(v)

In [28]:
import pickle

In [34]:
L_train.shape

(795499, 6)

In [30]:
d_l1

array([[-1, -1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1, -1],
       [-1, -1, -1, -1, -1, -1],
       ...,
       [-1, -1,  1, -1, -1, -1],
       [-1, -1,  1, -1, -1, -1],
       [-1, -1,  1, -1, -1, -1]])

In [27]:
np.equal(L_dev[0],-1).astype(int)

array([1, 1, 1, 1, 1, 1])

In [29]:
#============ our changes ==================#
def lsnork_to_l_m(lsnork, num_classes):
	m = 1 - np.equal(lsnork,0).astype(int)
	l = m*lsnork + (1-m)*num_classes
	return l,m
d_l1, d_m1 = lsnork_to_l_m(L_dev,2)

In [13]:
d_l, d_m = lsnork_to_l_m(L_dev,2)

In [29]:
d_x = get_features(df_dev)
# d_x = df_dev.book_idxs.values#.toarray()
d_L = df_dev.rating.values
d_l = L_dev

d_l, d_m = lsnork_to_l_m(d_l,2)
d_d = np.array([1.0] * len(d_x))
d_r = np.zeros(d_l.shape) #rule exemplar coupling unavailable


    with open(path+"/"+"d_processed.p","wb") as f:
        pickle.dump(d_x,f)
        pickle.dump(d_l,f)
        pickle.dump(d_m,f)
        pickle.dump(d_L,f)
        pickle.dump(d_d,f)
        pickle.dump(d_r,f)

In [30]:
# U_x = X_train.toarray()
U_x = get_features(df_train_filtered)# toarray()
U_L = df_train_filtered.rating.values
U_l = L_train_filtered
U_l, U_m = lsnork_to_l_m(U_l,2)
U_d = np.array([0.0] * len(U_x))
U_r = np.zeros(U_l.shape)

with open(path+"/"+"U_processed.p","wb") as f:
    pickle.dump(U_x,f)
    pickle.dump(U_l,f)
    pickle.dump(U_m,f)
    pickle.dump(U_L,f)
    pickle.dump(U_d,f)
    pickle.dump(U_r,f)

In [2]:

valid_x = get_features(df_valid)
valid_L = df_valid.rating.values
valid_l = L_valid
valid_l, valid_m = lsnork_to_l_m(valid_l,2)
valid_d = np.array([0.0] * len(valid_x))
valid_r = np.zeros(valid_l.shape) #rule exemplar coupling unavailable
with open(path+"/"+"validation_processed.p","wb") as f:
	pickle.dump(valid_x,f)
	pickle.dump(valid_l,f)
	pickle.dump(valid_m,f)
	pickle.dump(valid_L,f)
	pickle.dump(valid_d,f)
	pickle.dump(valid_r,f)


test_x = get_features(df_test)
test_L = df_test.rating.values
test_l = L_test
test_l, test_m = lsnork_to_l_m(test_l,2)
test_d = np.array([0.0] * len(test_x))
test_r = np.zeros(test_l.shape) #rule exemplar coupling unavailable
with open(path+"/"+"test_processed.p","wb") as f:
	pickle.dump(test_x,f)
	pickle.dump(test_l,f)
	pickle.dump(test_m,f)
	pickle.dump(test_L,f)
	pickle.dump(test_d,f)
	pickle.dump(test_r,f)




exit()


NameError: name 'get_features' is not defined

In [1]:
LFAnalysis(L_dev, lfs).lf_summary(df_dev.rating.values)

NameError: name 'LFAnalysis' is not defined

In [None]:
# %% [markdown]
# ### Applying labeling functions to the training set
#np.where(d_m==1)
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"t

In [12]:
# %% [markdown]
# ### Applying labeling functions to the training set
#
# We apply the labeling functions to the training set, and then filter out data points unlabeled by any LF to form our final training set.

# %% {"tags": ["md-exclude-output"]}
from snorkel.labeling.model import LabelModel

L_train = applier.apply(df_train)
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train, n_epochs=5000, seed=123, log_freq=20, lr=0.01)
preds_train = label_model.predict(L_train)

from snorkel.labeling import filter_unlabeled_dataframe

df_train_filtered, preds_train_filtered = filter_unlabeled_dataframe(
    df_train, preds_train, L_train
)
df_train_filtered["rating"] = preds_train_filtered

In [1]:
df_tr

NameError: name 'df_train_filtered' is not defined