In [None]:
!pip install evaluate
!pip install datasets
!pip install git+https://github.com/huggingface/accelerate
!pip install --upgrade transformers

In [None]:
import torch

In [None]:
# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Flipkart review CSV from the Kaggle input directory and preview its first two rows.
dataset=pd.read_csv("/kaggle/input/flipkart-product-customer-reviews-dataset/Dataset-SA.csv")
dataset.head(2)

In [None]:
"""dropping product name and price as it does not affect the sentiment of the product,
sentiment only depends on customer view on the product , so we will be using data from customers"""
# Keep only the customer-provided columns. The explicit .copy() makes `df`
# independent of `dataset`, so the in-place cleaning steps that follow do not
# raise SettingWithCopyWarning on pandas versions without copy-on-write.
df = dataset[["Rate","Review","Summary","Sentiment"]].copy()
df.head()

In [None]:
# Report, per column, whether at least one value is missing.
df.isna().any()

In [None]:
# Replace missing values with empty strings. Rebinding `df` instead of using
# inplace=True keeps the cell idempotent and avoids mutating a frame that was
# derived from `dataset` (which can raise SettingWithCopyWarning).
df = df.fillna("")
# Every column should now report False.
df.isna().any()

In [None]:
# Count how often each distinct value of Rate occurs.
df["Rate"].value_counts()

In [None]:
# Count how often each distinct value of Sentiment occurs.
df["Sentiment"].value_counts()

In [None]:
# Keep only rows whose rating is one of the five valid star values. Rate still
# holds strings at this point, hence the string literals. A boolean filter plus
# .copy() replaces the in-place drop-by-index: same rows survive (the index is
# unique), but `df` is a fresh, independent frame and the cell can be re-run.
valid_rates = ["1","2","3","4","5"]
df = df.loc[df["Rate"].isin(valid_rates)].copy()
df["Rate"].value_counts()

In [None]:
# Sentiment distribution among rows rated above "4".
# Rate still holds strings here, so `>` compares text; it acts like a numeric
# comparison only because every remaining rating is a single digit.
df[df["Rate"]>"4"].Sentiment.value_counts()

In [None]:
# Convert the rating strings to floats with a vectorised cast rather than a
# Python-level map over a list. assign() returns a new frame, so nothing is
# written into a frame that may be flagged as a copy of `dataset`.
df = df.assign(Rate=df["Rate"].astype(float))

In [None]:
# The training target is the Sentiment string, exposed under the column name
# `label` that the rest of the notebook uses. assign() returns a new frame, so
# nothing is written into a frame that may be flagged as a copy of `dataset`.
df = df.assign(label=df["Sentiment"])


In [None]:
# Class counts within the first 50,000 rows, i.e. what a plain head-slice would contain.
df[:50000].label.value_counts()

In [None]:
from copy import deepcopy
temp_df = deepcopy(df)

def _balanced_sample(frame, n_total, label_col="label", seed=545):
    """Draw up to ``n_total`` rows, spread as evenly as possible over the classes.

    Sampling proceeds in rounds: each round takes an equal share from every
    class that still has rows, so classes that run dry are compensated for by
    the remaining classes in later rounds.

    Args:
        frame: DataFrame to sample from (not modified). Its index is assumed
            to be unique, because picked rows are removed by index label.
        n_total: number of rows wanted; capped at the number of rows available.
        label_col: column holding the class label.
        seed: base seed; round ``k`` samples with ``seed + k`` so the result
            is reproducible.

    Returns:
        DataFrame with at most ``n_total`` rows, ordered by round and class
        (not shuffled).
    """
    pool = frame
    rounds = []
    # Never ask for more rows than exist; the original loop divided by zero
    # once the pool was exhausted.
    remaining = min(n_total, len(pool))
    while remaining > 0:
        groups = [group for _, group in pool.groupby(label_col)]
        if not groups:
            # Only rows with a missing label are left; nothing more to draw.
            break
        round_seed = seed + len(rounds)
        per_class = max(remaining // len(groups), 1)
        picked = pd.concat(
            [group.sample(n=min(len(group), per_class), random_state=round_seed) for group in groups]
        )
        if len(picked) > remaining:
            # Fewer rows are needed than there are classes: trim so the total
            # is exact instead of overshooting by up to (n_classes - 1) rows.
            picked = picked.sample(n=remaining, random_state=round_seed)
        rounds.append(picked)
        remaining -= len(picked)
        pool = pool.drop(picked.index)
    return pd.concat(rounds) if rounds else frame.iloc[0:0]


n_data_points = 50000

sampled_df = _balanced_sample(temp_df, n_data_points)

In [None]:
# Class counts in the sample.
sampled_df.label.value_counts()

In [None]:
# Shuffle the rows (frac=1 keeps all of them); the fixed random_state makes the order repeatable.
sampled_df = sampled_df.sample(frac=1, random_state=545)

In [None]:
# Number of rows in the sample.
len(sampled_df)

In [None]:
from datasets import Dataset
# Convert the sampled frame into a Hugging Face Dataset.
# NOTE: this rebinds `dataset`, which earlier in the notebook held the raw pandas DataFrame.
dataset = Dataset.from_pandas(sampled_df)
# Show the first record as a sanity check.
print(dataset[0])


In [None]:
from transformers import AutoTokenizer
# Load the tokenizer for the same DistilBERT checkpoint that the classifier is loaded from later.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples as (Review, Summary) text pairs.

    The "Review" entries are passed as the first sequence and the "Summary"
    entries as the second; truncation is enabled and token type ids are
    requested explicitly.

    Args:
        examples: batch mapping that provides "Review" and "Summary" entries.

    Returns:
        The tokenizer's output for the batch, unchanged.
    """
    reviews, summaries = examples["Review"], examples["Summary"]
    return tokenizer(reviews, summaries, return_token_type_ids=True, truncation=True)


In [None]:
# Inspect the id of the tokenizer's separator token.
tokenizer.sep_token_id

In [None]:
# Tokenize the whole dataset; batched=True hands preprocess_function batches of rows rather than single rows.
data_emb = dataset.map(preprocess_function, batched=True)

In [None]:
# Number of tokenized examples.
len(data_emb)

In [None]:
# Label of the first example, before labels are mapped to integer ids.
data_emb[0]["label"]

In [None]:
# Class id -> sentiment name; the reverse mapping is derived from it so the two cannot drift apart.
id2label = {
    0: "positive",
    1: "negative",
    2: "neutral",
}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
# Replace each string label with its integer id from label2id.
# A label that is not a key of label2id raises KeyError here.
data_emb = data_emb.map(lambda x : {"label":label2id[x["label"]]})

In [None]:
# Label of the first example after mapping to an integer id.
data_emb[0]["label"]

In [None]:
import evaluate

# Load the accuracy, precision and recall metrics from the evaluate library;
# compute_metrics below reads these module-level objects.
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: pair ``(predictions, labels)``. ``predictions`` holds one
            row of per-class scores per example; ``labels`` holds the
            reference class ids.

    Returns:
        dict with ``accuracy`` plus macro-averaged ``precision`` and
        ``recall``. Previously an early return reported accuracy only and left
        the precision/recall results unused.
    """
    predictions, labels = eval_pred
    # Collapse the per-class scores to the id of the highest-scoring class.
    predictions = np.argmax(predictions, axis=1)
    accuracy_result = accuracy.compute(predictions=predictions, references=labels)
    # Macro averaging weights every class equally, regardless of its frequency.
    precision_result = precision.compute(predictions=predictions, references=labels, average="macro")
    recall_result = recall.compute(predictions=predictions, references=labels, average="macro")

    return {
        "accuracy": accuracy_result["accuracy"],
        "precision": precision_result["precision"],
        "recall": recall_result["recall"],
    }


from transformers import DataCollatorWithPadding

# Collator that pads each batch with the tokenizer when the batch is assembled
# (preprocess_function truncates but does not pad).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# Load DistilBERT with a 3-class sequence-classification head, store the label
# mappings in its config, and place it on `device` via device_map.
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased",num_labels=3, id2label=id2label, label2id=label2id, device_map=device)

In [None]:
# Hold out 30% of the examples for evaluation. The split is seeded (same seed
# as the shuffle above) so train/test membership is identical on every run.
tokenized_data = data_emb.train_test_split(test_size=0.3, seed=545)

In [None]:
# Fine-tuning configuration.
training_args = TrainingArguments(
    output_dir="./sentiment_classification_model",  # checkpoints are written here
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,  # upper bound; the early-stopping callback on the Trainer may end training sooner
    weight_decay=0.01,
    # Evaluate and save once per epoch; the two strategies are kept identical
    # because load_best_model_at_end is enabled below.
    eval_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is given — NOTE(review): the library default is
    # presumably the evaluation loss; confirm against the TrainingArguments docs.
    load_best_model_at_end=True,
    push_to_hub=False,
    overwrite_output_dir=True
)

In [None]:
from transformers import EarlyStoppingCallback
# Wire model, data, metrics and early stopping into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    # `tokenizer=` is deprecated on Trainer and scheduled for removal in
    # transformers v5; `processing_class` is its replacement. The notebook
    # installs the latest transformers, so the new name is used.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    # Stop when the tracked metric has not improved for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)


In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

Test inference

In [None]:
import torch
# Switch to evaluation mode so dropout is disabled; otherwise the prediction
# can vary from call to call depending on the mode training left the model in.
model.eval()
with torch.no_grad():
    # Encode the (review, summary) pair the same way as the training data and
    # move the tensors to the model's device.
    encoded = tokenizer("good","worth for money",padding="max_length",truncation=True,return_tensors="pt").to(device)
    logits = model(**encoded).logits
logits

In [None]:
# argmax() without a dim flattens the tensor; the result is a valid class id
# only because `logits` holds a single example here.
predicted_class_id = logits.argmax().item()
# Translate the class id back to its sentiment name.
model.config.id2label[predicted_class_id]