In [2]:
from transformers import pipeline
import pandas as pd
from tqdm.notebook import tqdm
tqdm.pandas()

In [4]:
# Load the processed electronics reviews CSV into a DataFrame.
electronics_df = pd.read_csv("../data/processed/electronics.csv")

  electronics_df = pd.read_csv("../data/processed/electronics.csv")


In [5]:
# Copy the DataFrame index into a regular "index" column so each row carries
# its own identifier; the custom pipeline reads it from `inputs["index"]`.
electronics_df["index"] = electronics_df.index
electronics_df.head(2)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,index
0,5.0,True,"07 17, 2002",A1N070NS9CJQ2I,60009810,{'Format:': ' Hardcover'},Teri Adams,this was the first time i read garcia-aguilera...,Hit The Spot!,1026864000,,,0
1,4.0,True,"08 5, 2007",A1X7HRXX3BJXQ8,60786817,{'Format:': ' Hardcover'},a reader,extreme warning: unless your computer has upda...,Suckers Always Pay For Beauty----One Way or An...,1186272000,,,1


## Load the transformers pipeline
- Download the model by `cd`-ing into the `models` folder and running:
- `git lfs clone https://huggingface.co/facebook/bart-large-mnli/`

In [4]:
from transformers.pipelines.zero_shot_classification import ZeroShotClassificationPipeline


class CustomZeroShotClassificationPipeline(ZeroShotClassificationPipeline):
    """Zero-shot classification pipeline that tags each result with its source row id.

    Every input must be a mapping (for example a pandas row) providing:

    * ``"reviewText"`` - the text to classify
    * ``"index"`` - an identifier for the row

    The identifier is returned under the ``"idx"`` key of the result dict.

    The identifier is carried inside the per-label chunks that flow through
    ``preprocess`` -> ``_forward`` -> ``postprocess`` instead of being stored
    on ``self``. When a dataset is consumed with batching, several inputs can
    be preprocessed before the first one is postprocessed, so a single
    instance attribute would hold the identifier of a later row and results
    would be labelled with the wrong ``idx``.
    """

    def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
        """Yield the parent's per-label chunks for the review text, each tagged with ``idx``."""
        idx = inputs["index"]
        text = inputs["reviewText"]
        for chunk in super().preprocess(text, candidate_labels, hypothesis_template):
            chunk["idx"] = idx
            yield chunk

    def _forward(self, inputs):
        """Run the parent's forward pass and re-attach ``idx``.

        The parent builds a fresh output dict from known keys only, so the
        identifier has to be copied over explicitly.
        """
        idx = inputs["idx"]
        model_outputs = super()._forward(inputs)
        # Appended last so the parent's own keys keep their original order.
        model_outputs["idx"] = idx
        return model_outputs

    def postprocess(self, model_outputs, multi_label=False):
        """Return the parent's result dict extended with the row identifier under ``"idx"``."""
        result = super().postprocess(model_outputs, multi_label)
        # All chunks of one input share the same identifier; read it from the first.
        result["idx"] = model_outputs[0]["idx"]
        return result


In [5]:
# Build the zero-shot classifier from the local BART-MNLI checkout, using the
# custom pipeline class so that every prediction carries its row identifier.
classifier = pipeline(
    task="zero-shot-classification",
    model="../models/bart-large-mnli",
    device="cuda:0",
    pipeline_class=CustomZeroShotClassificationPipeline,
)


In [6]:
# convert electronics_df to a pytorch dataset
from torch.utils.data import Dataset, DataLoader


class AmazonDataset(Dataset):
    """Map-style dataset that serves the rows of a DataFrame one at a time."""

    def __init__(self, df):
        # Keep a reference to the frame; rows are looked up lazily on access.
        self.df = df

    def __len__(self):
        """Number of rows in the wrapped frame."""
        n_rows = self.df.shape[0]
        return n_rows

    def __getitem__(self, idx):
        """Return the row at integer position ``idx``."""
        row = self.df.iloc[idx]
        return row


# Classify a random subset of the reviews rather than the whole frame.
# The sample is seeded so re-running the notebook selects the same rows, and
# capped at the frame length so a smaller input file does not raise.
SAMPLE_SIZE = 10000
SAMPLE_SEED = 42
electronics_dataset = AmazonDataset(
    electronics_df.sample(
        n=min(SAMPLE_SIZE, len(electronics_df)),
        random_state=SAMPLE_SEED,
    )
)


In [7]:
# Labels passed to the zero-shot classifier as candidate classes.
candidate_labels = [
    "positive_review",
    "negative_review",
    "neutral_review",
]

In [8]:
# Run the classifier over the sampled dataset and flatten each prediction into
# one record: {label: score, ..., "idx": row identifier}.
# `multi_label=False` is the recognised spelling of the option (scores are
# normalised across the candidate labels); the former `multiclass` keyword is
# not a pipeline argument and was silently ignored.
output = []
predictions = classifier(electronics_dataset, candidate_labels, multi_label=False, batch_size=10)
for out in tqdm(predictions):
    # "labels" and "scores" are parallel lists; pair them up into columns.
    op_dict = dict(zip(out["labels"], out["scores"]))
    op_dict["idx"] = out["idx"]
    output.append(op_dict)

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
# One row per classified review: a score column per label plus "idx".
op_df = pd.DataFrame(output)

In [None]:
# Display the per-review label scores.
op_df

Unnamed: 0,positive_review,neutral_review,negative_review,idx
0,0.728096,0.201501,0.070403,2


In [None]:
# Persist the zero-shot scores; the DataFrame index is not written.
op_df.to_csv("../data/processed/electronics_zero_shot.csv", index=False)

In [10]:
# Load the reviews joined with their zero-shot scores.
# NOTE(review): no cell visible in this notebook writes this file - presumably
# it is produced by a separate merge step; confirm before relying on Run All.
merged_df = pd.read_csv("../data/processed/electronics_zero_shot_merged.csv")

In [11]:
# Preview the first rows of the merged frame.
merged_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image,idx,positive_review,neutral_review,negative_review,id
0,5.0,True,"09 4, 2015",A7ZSEBCEXH9Y0,0972683275,,tooltimetim7,Awesome product! low profile and easy to insta...,Five Stars,1441324800,,,2362,0.9211,0.074023,0.004876,2362
1,4.0,True,"03 4, 2015",A23XV9P6VVHJUG,0972683275,,Stephanie,"We used this for our 40"" Sony Bravia. The moun...","We used this for our 40"" Sony Bravia. The ...",1425427200,,,2456,0.814065,0.162815,0.02312,2456
2,2.0,True,"12 23, 2014",A38GUMQBDV5HWL,106171327X,,KAG,SanDisk quality has really went down hill and ...,Buy something else....quality control issues,1419292800,3.0,,4521,0.117972,0.24413,0.637898,4521
3,5.0,True,"07 8, 2016",A3BBR7B3LUB0SL,1495443043,{'Format:': ' Kindle Edition'},Joanne Gibson,I thoroughly enjoyed reading the book.,Five Stars,1467936000,,,8873,0.87818,0.107296,0.014524,8873
4,5.0,True,"03 12, 2015",AWDKMTNCWW8IX,1495443043,{'Format:': ' Kindle Edition'},Tammy Rasche,"Loved it.,",Five Stars,1426118400,,,9986,0.951352,0.045861,0.002786,9986
