## Pipelines API

### Importing the Pipeline

### Select a Task

### Load the Pipeline

### Run the Pipeline

In [2]:
# Bring in the high-level `pipeline` factory from Hugging Face Transformers.
from transformers import pipeline

# Two ways to build a sentiment classifier:
#   * by task name only -- no checkpoint is named, so the library chooses one;
#   * by checkpoint name only -- no task argument is passed.
task_pipeline = pipeline(task="sentiment-analysis")
model_pipeline = pipeline(model="distilbert-base-uncased-finetuned-sst-2-english")

# Sample input scored by both pipelines.
text = "this a non sentence. I am not sure what to do with it."

# Run the same text through each pipeline; the label of the first result
# from each one is reported side by side below.
task_output = task_pipeline(text)
model_output = model_pipeline(text)

print(f"Sentiment from task_pipeline: {task_output[0]['label']}; Sentiment from model_pipeline: {model_output[0]['label']}")

  from .autonotebook import tqdm as notebook_tqdm
2024-03-20 15:06:49.879491: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  torch.utils._pytree._register_pytree_node(
No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
config.json: 100%|██████████| 629/629 [00:00<00:00, 88.5kB/s]
model.safetensors: 100%|██████████| 268M/268M [00:32<00:00, 8.20MB/s] 
tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 9.38kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.96MB/s]


Sentiment from task_pipeline: NEGATIVE; Sentiment from model_pipeline: NEGATIVE


### Multiple Sentences

In [3]:
import pandas as pd

# A batch of inputs: the pipelines are called with a list instead of a single string.
text1 = ["this a non sentence. I am not sure what to do with it.", "I am happy to be here"]

# Score the whole batch with both pipelines.
task_output = task_pipeline(text1)
model_output = model_pipeline(text1)

# Tabulate the task pipeline's results and attach the source text as an extra column.
df = pd.DataFrame(task_output).assign(text=text1)
print(df)



      label     score                                               text
0  NEGATIVE  0.999614  this a non sentence. I am not sure what to do ...
1  POSITIVE  0.999875                              I am happy to be here


### Using a model other than the default

In [4]:
# First alternative: the base DistilBERT checkpoint, requested for the
# sentiment-analysis task.
# NOTE(review): the captured output for this cell warns that this checkpoint's
# classifier weights are newly initialised, so its labels/scores should not be
# read as meaningful predictions -- confirm before relying on df_2.
distilbert_pipeline = pipeline(task="sentiment-analysis", model="distilbert-base-uncased")
distilbert_output = distilbert_pipeline(text1)
df_2 = pd.DataFrame(distilbert_output).assign(text=text1)

# Second alternative: a BERT checkpoint published under kwang123/bert-sentiment-analysis.
bert_pipeline = pipeline(task="sentiment-analysis", model="kwang123/bert-sentiment-analysis")
bert_output = bert_pipeline(text1)
df_3 = pd.DataFrame(bert_output).assign(text=text1)

# Show both result tables, DistilBERT first.
for frame in (df_2, df_3):
    print(frame)


config.json: 100%|██████████| 483/483 [00:00<00:00, 76.2kB/s]
model.safetensors: 100%|██████████| 268M/268M [00:19<00:00, 14.0MB/s] 
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 4.32kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 12.4MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 11.4MB/s]
config.json: 100%|██████████| 1.00k/1.00k [00:00<00:00, 604kB/s]
  torch.utils._pytree._register_pytree_node(
pytorch_model.bin: 100%|██████████| 438M/438M [00:34<00:00, 12.8MB/s] 
tokenizer_config.json: 100%|██████████| 1.19k/1.19k [00:00<00:00, 158kB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 25.

     label     score                                               text
0  LABEL_1  0.514635  this a non sentence. I am not sure what to do ...
1  LABEL_1  0.504306                              I am happy to be here
                label     score  \
0            Negative  0.972048   
1  Extremely Positive  0.995910   

                                                text  
0  this a non sentence. I am not sure what to do ...  
1                              I am happy to be here  





### Import a small random subset of the sentiments df (1000 reviews)

### Apply Sentiment analysis on 10, 100 and then 1000 reviews and extrapolate the time it would take for a large (1M) number of reviews

### Find another model on the portal and repeat the above step

In [1]:
import pandas as pd
import json


def _read_json_lines(path):
    """Parse a JSON Lines file into a list with one decoded object per line.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding, and blank lines are skipped instead
    of being handed to ``json.loads`` (which would raise on them).

    Args:
        path: Location of a file holding one JSON document per line.

    Returns:
        list: The decoded objects, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]


# Replace these with the paths to your actual JSON files
file_path1 = '../A3/Clothing_Shoes_and_Jewelry_5.json'
file_path2 = 'reviews_Tools_and_Home_Improvement_5.json'

# Load each JSON Lines file into its own DataFrame
data1 = _read_json_lines(file_path1)
df1 = pd.DataFrame(data1)
data2 = _read_json_lines(file_path2)
df2 = pd.DataFrame(data2)

# Tag every row with the file it came from
df1['category'] = 'Category1'  # Name this category according to what it represents
df2['category'] = 'Category2'  # Name this category accordingly

# Stack the two frames row-wise. Each frame keeps its own row labels, so a
# label may appear once per source file in the combined frame.
combined_df = pd.concat([df1, df2], axis=0)

# Draw a reproducible 1000-row sample; fall back to the whole frame when
# fewer than 1000 rows are available.
random_seed = 42
if len(combined_df) >= 1000:
    sample_df = combined_df.sample(n=1000, random_state=random_seed)
else:
    print(f"The combined file has only {len(combined_df)} rows. Sampling the entire DataFrame.")
    sample_df = combined_df

print(sample_df)

            reviewerID        asin                      reviewerName helpful  \
102773  A1M7VLUUTO1NRL  B004GSYAIA                   Amazon Customer  [0, 0]   
52396   A1MEXN4Q7ZPPM5  B000O7HHRS                    JON DOE, TEXAS  [0, 0]   
48382   A2OX2RGNCJDDQN  B000XRCMYW        Karen C. Hogan "Book Diva"  [0, 0]   
27286   A1CIMOIAZUR7CP  B000GX1TPQ                        ChristieL.  [0, 0]   
55577    A25TYZ1P8UWXR  B001618J50                       R. Folkerts  [2, 2]   
...                ...         ...                               ...     ...   
56616   A33OF5NLPRZ5YT  B0017U1MJU  M. Bishopski "Lost in the Woods"  [1, 1]   
236565   AQOFLJ2IOMOO8  B00ABS67T4                Sipeish "SD mamma"  [2, 2]   
99472   A2OECBTX7KM6FJ  B0043WVTNI                     NW Washington  [1, 1]   
74901   A1YETX467CJV00  B0020HRYYG                          P. Smith  [0, 0]   
35415   A29GWIJL72GXXZ  B000COYDU2                               Jon  [0, 0]   

                                       

In [26]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Base BERT checkpoint loaded directly through the Auto* classes.
# NOTE(review): `tokenizer` and `model` are only referenced by commented-out
# code in this cell -- confirm whether they are still needed.
model_name = "bert-base-uncased"  # Example model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Rebuild the DistilBERT sentiment pipeline, this time passing truncation
# settings (truncation=True, max_length=512) -- presumably so long reviews
# do not exceed the model's input limit; confirm against the pipeline docs.
distilbert_pipeline = pipeline(
    task="sentiment-analysis",
    model="distilbert-base-uncased",
    truncation=True,
    max_length=512,
)
def apply_tokenizer(text):
    """Run ``text`` through the module-level ``distilbert_pipeline``.

    Despite the name, no separate tokenisation step is performed here: the
    input is handed to the pipeline as-is and the pipeline's result is
    returned unchanged.

    Args:
        text: Input forwarded to ``distilbert_pipeline``.

    Returns:
        Whatever ``distilbert_pipeline(text)`` returns.
    """
    return distilbert_pipeline(text)
# text = "your very long text"  # Replace with your actual text

# # Tokenize and truncate the input text
# inputs = tokenizer(text, truncation=True, max_length=512, return_tensors="pt")

# # Run the model
# output = model(**inputs)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [27]:
import time

# Sentiment annotation helper used by the timing experiment
def analyze_sentiment(df):
    """Annotate ``df`` in place with per-review sentiment and return it.

    Every value in the ``reviewText`` column is passed to ``apply_tokenizer``
    and the first element of each result is kept. Three columns are written
    onto ``df``:

    * ``sentiment``       -- the first result for the review (indexed below
      by ``'label'`` and ``'score'``);
    * ``sentiment_label`` -- that result's ``'label'`` entry;
    * ``sentiment_score`` -- that result's ``'score'`` entry.

    Args:
        df: DataFrame with a ``reviewText`` column. It is modified in place.

    Returns:
        The same ``df`` object, so callers can write
        ``results = analyze_sentiment(frame)`` and get the annotated frame
        instead of ``None``.
    """
    # To benchmark another model, call task_pipeline / model_pipeline /
    # bert_pipeline here in place of apply_tokenizer.
    df['sentiment'] = df['reviewText'].apply(lambda x: apply_tokenizer(x)[0])

    # Split the per-review result into separate label and score columns.
    df['sentiment_label'] = df['sentiment'].apply(lambda x: x['label'])
    df['sentiment_score'] = df['sentiment'].apply(lambda x: x['score'])
    return df

# Generate dummy reviews for demonstration
# NOTE(review): `reviews` is not referenced by the timing loop below; it is
# kept only in case another cell relies on it.
reviews = ["This product is great!" for _ in range(1000000)]  # Assume 1 million reviews

# Number of reviews the measured per-review cost is extrapolated to.
TARGET_REVIEWS = 1_000_000

for n in [10, 100, 1000]:
    # sample_df can hold fewer rows than requested (the loading cell falls
    # back to the whole combined frame when it has under 1000 rows), and
    # DataFrame.sample raises if asked for more rows than exist.
    n_used = min(n, len(sample_df))
    if n_used == 0:
        print("sample_df is empty; nothing to time.")
        break

    # Draw the batch before starting the clock so only inference is timed.
    batch = sample_df.sample(n=n_used, random_state=7)

    # perf_counter is a monotonic, high-resolution clock meant for intervals.
    start_time = time.perf_counter()
    results = analyze_sentiment(batch)
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    print(f"Time taken for {n_used} reviews: {time_taken} seconds")

    # Extrapolate to 1 million reviews from the measured per-review cost
    extrapolated_time = (time_taken / n_used) * TARGET_REVIEWS
    print(f"Estimated time for 1 million reviews: {extrapolated_time / 60} minutes")


Time taken for 10 reviews: 0.5266578197479248 seconds
Estimated time for 1 million reviews: 877.763032913208 minutes
Time taken for 100 reviews: 3.5510311126708984 seconds
Estimated time for 1 million reviews: 591.838518778483 minutes
Time taken for 1000 reviews: 37.16049027442932 seconds
Estimated time for 1 million reviews: 619.341504573822 minutes
