In [28]:
import pandas as pd

# Load the raw reviews, then keep only rows that have BOTH a comment and a
# rating, restricted to the two columns used downstream.
# (A second `notnull() | notnull()` filter used to follow; it could never drop
# a row after the `&` filter, so it has been removed.)
df = pd.read_csv('../artifacts/allReviews.csv')
has_comment_and_rating = df["Comment"].notnull() & df["Rating"].notnull()
df = df.loc[has_comment_and_rating, ["Comment", "Rating"]]

# Ratings 1-3 are kept in full; ratings 4 and 5 are down-sampled to 16,000
# rows each with a fixed seed so the selection is reproducible.
# NOTE(review): rating 3 is not capped, so the classes are not strictly
# balanced - confirm this is intended.
df_1 = df[df["Rating"] == 1]
df_2 = df[df["Rating"] == 2]
df_3 = df[df["Rating"] == 3]
df_4 = df[df["Rating"] == 4].sample(n=16000, random_state=42)
df_5 = df[df["Rating"] == 5].sample(n=16000, random_state=42)

# Recombine the per-rating subsets and shuffle the rows (seeded), discarding
# the original index.
df_balanced = pd.concat([df_1, df_2, df_3, df_4, df_5])
df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [29]:
# Display the shuffled frame (Comment and Rating columns).
df

Unnamed: 0,Comment,Rating
0,good,4.0
1,Interesting introduction to the subject matter...,3.0
2,I have learned the basic concepts of different...,5.0
3,the course is really good but there are issues...,3.0
4,"Not much as I was expected, not very structure...",2.0
...,...,...
95948,"Overly-remedial information, but that is not w...",1.0
95949,Content is quite interesting. Timing of the le...,3.0
95950,This subject should be taught as a course in t...,4.0
95951,I cannot get my certificate as ID could not be...,1.0


In [30]:
# Print how many rows each rating value has after sampling.
print(df["Rating"].value_counts())


Rating
3.0    33718
1.0    16337
4.0    16000
5.0    16000
2.0    13898
Name: count, dtype: int64


In [31]:
# Collapse the star rating into three class ids:
# ratings 1-2 -> 0, rating 3 -> 1, ratings 4-5 -> 2.
# NOTE(review): the saved outputs of the following cells show rating 3 as
# class 2 and no class 1 at all, so they appear to come from an earlier
# version of this mapping - re-run to refresh them.
rating_to_sentiment = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
df["Sentiment"] = df["Rating"].map(rating_to_sentiment)

In [32]:
# Display the frame with the new Sentiment column.
df

Unnamed: 0,Comment,Rating,Sentiment
0,good,4.0,2
1,Interesting introduction to the subject matter...,3.0,2
2,I have learned the basic concepts of different...,5.0,2
3,the course is really good but there are issues...,3.0,2
4,"Not much as I was expected, not very structure...",2.0,0
...,...,...,...
95948,"Overly-remedial information, but that is not w...",1.0,0
95949,Content is quite interesting. Timing of the le...,3.0,2
95950,This subject should be taught as a course in t...,4.0,2
95951,I cannot get my certificate as ID could not be...,1.0,0


In [33]:
# Print how many rows fall into each sentiment class.
print(df["Sentiment"].value_counts())

Sentiment
2    65718
0    30235
Name: count, dtype: int64


### Remove URLs

In [34]:
import re
# Matches any whitespace-free run that starts with "http" or "www".
# (The trailing `https\S+` alternative is already covered by `http\S+`.)
url_pattern = r'http\S+|www\S+|https\S+'

def remove_urls_batch(text_series):
    """Strip URLs from every string in a pandas Series.

    Matching is case-insensitive, so links such as ``HTTP://...`` or
    ``WWW.example.com`` are removed too. This keeps the helper consistent
    with the per-row cleaning applied to ``df['Comment']``, which passes
    ``re.IGNORECASE``.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings to clean.

    Returns
    -------
    pandas.Series
        A new Series with every match of ``url_pattern`` replaced by ``''``.
    """
    return text_series.str.replace(
        url_pattern, '', regex=True, flags=re.IGNORECASE
    )

In [35]:
# Build `clean_Comment` by deleting every URL match (case-insensitively)
# from each comment string.
df['clean_Comment'] = df['Comment'].map(
    lambda comment: re.sub(url_pattern, '', comment, flags=re.IGNORECASE)
)

In [36]:
# df = df.drop(columns=["contains_url"])
# df = df.drop(columns=["clean_text"])

In [37]:
# Display the frame with the new clean_Comment column.
df

Unnamed: 0,Comment,Rating,Sentiment,clean_Comment
0,good,4.0,2,good
1,Interesting introduction to the subject matter...,3.0,2,Interesting introduction to the subject matter...
2,I have learned the basic concepts of different...,5.0,2,I have learned the basic concepts of different...
3,the course is really good but there are issues...,3.0,2,the course is really good but there are issues...
4,"Not much as I was expected, not very structure...",2.0,0,"Not much as I was expected, not very structure..."
...,...,...,...,...
95948,"Overly-remedial information, but that is not w...",1.0,0,"Overly-remedial information, but that is not w..."
95949,Content is quite interesting. Timing of the le...,3.0,2,Content is quite interesting. Timing of the le...
95950,This subject should be taught as a course in t...,4.0,2,This subject should be taught as a course in t...
95951,I cannot get my certificate as ID could not be...,1.0,0,I cannot get my certificate as ID could not be...


In [38]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch

In [39]:
from transformers import BertTokenizer, BertForSequenceClassification
# Load the tokenizer and a BERT sequence classifier from the same pretrained
# checkpoint; the classifier is created with 3 output labels.
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)
model = BertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=3,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:
# Show the model's module structure.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [43]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Comment,Rating,Sentiment,clean_Comment
0,good,4.0,2,good
1,Interesting introduction to the subject matter...,3.0,2,Interesting introduction to the subject matter...
2,I have learned the basic concepts of different...,5.0,2,I have learned the basic concepts of different...
3,the course is really good but there are issues...,3.0,2,the course is really good but there are issues...
4,"Not much as I was expected, not very structure...",2.0,0,"Not much as I was expected, not very structure..."


In [44]:
# Persist the cleaned frame. index=False keeps the row index out of the file,
# so reading it back does not produce an extra "Unnamed: 0" column.
df.to_csv("./trim_and_clean_data.csv", index=False)

In [48]:
# Persist the cleaned frame with a header row. index=False keeps the row index
# out of the file, so reading it back does not produce an extra "Unnamed: 0"
# column.
df.to_csv("./trim_and_clean_data.csv", header=True, index=False)

In [49]:
# NOTE(review): the saved output below shows the full 95,952-row frame with an
# extra index column, not the Rating == 1 subset assigned to df_1 earlier.
# Presumably df_1 was reassigned (e.g. re-read from the saved CSV) in a cell
# that is no longer in the notebook - confirm and restore or remove.
df_1

Unnamed: 0.1,Unnamed: 0,Comment,Rating,Sentiment,clean_Comment
0,0,good,4.0,2,good
1,1,Interesting introduction to the subject matter...,3.0,2,Interesting introduction to the subject matter...
2,2,I have learned the basic concepts of different...,5.0,2,I have learned the basic concepts of different...
3,3,the course is really good but there are issues...,3.0,2,the course is really good but there are issues...
4,4,"Not much as I was expected, not very structure...",2.0,0,"Not much as I was expected, not very structure..."
...,...,...,...,...,...
95948,95948,"Overly-remedial information, but that is not w...",1.0,0,"Overly-remedial information, but that is not w..."
95949,95949,Content is quite interesting. Timing of the le...,3.0,2,Content is quite interesting. Timing of the le...
95950,95950,This subject should be taught as a course in t...,4.0,2,This subject should be taught as a course in t...
95951,95951,I cannot get my certificate as ID could not be...,1.0,0,I cannot get my certificate as ID could not be...


In [42]:
# Features are the URL-stripped comments; targets are the sentiment class ids.
X = list(df["clean_Comment"])
y = list(df["Sentiment"])

# Hold out 20% of the rows for validation, stratified by class.
# random_state pins the split so re-runs give the same partition, matching
# the fixed seed (42) used for the sampling and shuffling steps.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Tokenize both splits up front: sequences longer than 512 tokens are
# truncated, and each split is padded to its own longest sequence.
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

KeyboardInterrupt: 

In [None]:
# List the fields returned by the tokenizer for the training split.
X_train_tokenized.keys()

In [None]:
# Number of examples in the training and validation splits.
len(X_train),len(X_val)

In [None]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    """Map-style torch dataset over tokenizer encodings with optional labels.

    Parameters
    ----------
    encodings : mapping
        Maps each field name (must include ``"input_ids"``) to a sequence
        holding one entry per example.
    labels : sequence, optional
        One label per example. When omitted (``None``), items carry no
        ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test against None instead of truthiness: `if self.labels:` raises for
        # NumPy arrays / tensors with more than one element.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the length of ``input_ids``."""
        return len(self.encodings["input_ids"])

In [None]:
# Wrap the tokenized splits and their labels as torch datasets.
train_dataset = Dataset(encodings=X_train_tokenized, labels=y_train)
val_dataset = Dataset(encodings=X_val_tokenized, labels=y_val)

### Evaluation metrics (accuracy, precision, recall, F1)

In [None]:
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 for an evaluation pass.

    Parameters
    ----------
    p : pair ``(predictions, labels)``
        ``predictions`` holds per-class scores with classes along axis 1;
        ``labels`` holds the true class ids.

    Returns
    -------
    dict
        Keys ``"accuracy"``, ``"precision"``, ``"recall"`` and ``"f1"``.
        Precision, recall and F1 are macro-averaged over the classes.
    """
    pred, labels = p
    # Predicted class = index of the highest score for each example.
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    # The target has three classes, so an explicit multi-class average is
    # required: scikit-learn's default average="binary" raises ValueError
    # on multiclass targets.
    recall = recall_score(y_true=labels, y_pred=pred, average="macro")
    precision = precision_score(y_true=labels, y_pred=pred, average="macro")
    f1 = f1_score(y_true=labels, y_pred=pred, average="macro")

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [None]:

from transformers import TrainingArguments, Trainer
# Define Trainer
# Training configuration: 3 epochs with a per-device train batch size of 8,
# writing checkpoints under "output".
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Log, evaluate and checkpoint once per epoch, and reload the best
    # checkpoint when training finishes.
    logging_strategy="epoch",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Disable external experiment-tracking integrations.
    report_to="none",
)

# Bind the model, datasets and metric function to a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Persist the cleaned frame before training. index=False keeps the row index
# out of the file, so reading it back does not produce an extra "Unnamed: 0"
# column.
df.to_csv("./trim_and_clean_data.csv", index=False)

In [None]:
# Run fine-tuning with the configured Trainer.
trainer.train()