In [1]:
import warnings
warnings.filterwarnings('ignore')

#General Data/Plotting
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from tqdm.auto import tqdm 
import random

# Language
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
import re 
from collections import Counter
from string import punctuation

# Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_score, recall_score , f1_score, accuracy_score, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.datasets import load_iris
from sklearn.preprocessing import LabelEncoder

# DistilBERT/Tensorflow/Torch
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertForSequenceClassification, DistilBertTokenizer

from torch.utils.data import DataLoader, TensorDataset, random_split
from torch.nn import CrossEntropyLoss
from torch.optim import Adam
import torch


lemma = WordNetLemmatizer()




In [2]:
# Pandas Progress bar
tqdm.pandas()

In [3]:
df = pd.read_csv('../Datasets/Books_rating.csv')

In [4]:
df[['helpful_0', 'helpful_1']] = df['review/helpfulness'].str.split('/', expand=True)

# Konvertieren der 'helpfulness'-Spalte in numerische Werte (falls notwendig)
#df['review/helpfulness'] = pd.to_numeric(df['review/helpfulness'], errors='coerce')

df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,helpful_0,helpful_1
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,7,7
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,10,10
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t...",10,11
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",7,7
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,3,3


In [5]:
# Daten vor dem Plotten sortieren/konvertieren
df['helpful_0'] = df['helpful_0'].progress_apply(pd.to_numeric)
df['helpful_1'] = df['helpful_1'].progress_apply(pd.to_numeric)

100%|██████████| 3000000/3000000 [00:47<00:00, 62789.62it/s]
100%|██████████| 3000000/3000000 [00:47<00:00, 62563.44it/s]


In [6]:
df['helpful_0'] = df['helpful_0'].astype(int)
df['helpful_1'] = df['helpful_1'].astype(int)

In [9]:
num_rows = len(df[df['helpful_1'] >= 20])
print("Number of rows with helpful_1 >= 20: ", num_rows)

Number of rows with helpful_1 >= 20:  239553


In [10]:
# Remove reviews with not enough helpful votes
df = df[df['helpful_1'] >= 20]

In [11]:
df['helpful_ratio'] = df['helpful_0'] / df['helpful_1']

In [12]:
df['ratio_percent'] = (df['helpful_ratio'] * 100).astype(int)
df.head()

Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text,helpful_0,helpful_1,helpful_ratio,ratio_percent
47,0802841899,The Church of Christ: A Biblical Ecclesiology ...,25.97,ARI272XF8TOL4,Christopher J. Bray,74/81,5.0,955411200,Ecclesiological Milestone,With the publication of Everett Ferguson's boo...,74,81,0.91358,91
81,0974289108,The Ultimate Guide to Law School Admission: In...,14.95,A1KZ0RDJZQSY4O,sayock,27/29,3.0,1090368000,No &quot;Insider&quot; Secrets,If you are someone who is fairly new to the la...,27,29,0.931034,93
88,0809080699,The Repeal of Reticence: A History of America'...,,A18YY5TBNSDW3O,Bartleby,29/29,5.0,899164800,Great treatment of the defeat of reticence by ...,"Using a quiet, restrained writing style that i...",29,29,1.0,100
91,B000NKGYMK,Alaska Sourdough,,,,36/37,5.0,949104000,Real Alaskan Sourdough,Ruth Allman has written an excellent book abou...,36,37,0.972973,97
92,B000NKGYMK,Alaska Sourdough,,AC58Z72OB2DDX,Gary W. Marian,29/30,5.0,945734400,True Alaskan cooking,"I have been using this book since 1988, the ei...",29,30,0.966667,96


In [13]:
# Drop unused columns
df.drop(['Id', 'Title', 'Price', 'User_id', 'profileName', 'review/time', 'review/summary'], axis='columns', inplace=True)

In [14]:
df = df.rename(columns={"review/text":"Review","review/score": "Rating"})

df['review_len'] = [len(text.split()) for text in df.Review]

In [16]:
df.head()

Unnamed: 0,review/helpfulness,Rating,Review,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len
47,74/81,5.0,With the publication of Everett Ferguson's boo...,74,81,0.91358,91,999
81,27/29,3.0,If you are someone who is fairly new to the la...,27,29,0.931034,93,367
88,29/29,5.0,"Using a quiet, restrained writing style that i...",29,29,1.0,100,532
91,36/37,5.0,Ruth Allman has written an excellent book abou...,36,37,0.972973,97,153
92,29/30,5.0,"I have been using this book since 1988, the ei...",29,30,0.966667,96,63


In [17]:
len(df)

239553

In [18]:
df = df[df['review_len'] <= 300]

In [19]:
len(df)

181104

In [20]:
def convert_label(df) : 
    if df['ratio_percent'] <= 70 : 
        rate = 0 # for Negative 
    else : 
        rate = 1 # for Positive
        
    return rate

In [21]:
df['IsHelpful'] = df.apply(convert_label, axis = 1)
df.head()

Unnamed: 0,review/helpfulness,Rating,Review,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len,IsHelpful
91,36/37,5.0,Ruth Allman has written an excellent book abou...,36,37,0.972973,97,153,1
92,29/30,5.0,"I have been using this book since 1988, the ei...",29,30,0.966667,96,63,1
93,25/28,5.0,"My poor dogeared, stained copy of this book ca...",25,28,0.892857,89,206,1
127,3/20,1.0,This book in my opinion is biased and takes an...,3,20,0.15,15,92,0
139,20/20,5.0,If you're already a fan of the Eyewitness Trav...,20,20,1.0,100,234,1


Ich hab mir das hier mal als neues .csv exportiert, damit ich das nich immer wieder machen muss

In [22]:
df.to_csv('../Datasets/Modified_Books_rating.csv', index=False)

### Data Prepossessing :ghost:

ChatGPT hat bei Preprocessing nichts anderes außer Tokenizing erwähnt, mal schauen wie das läuft

In [18]:
# Tokenize the reviews
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encoded_data = tokenizer(df['Review'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='pt')


In [25]:
# Encode labels
label_encoder = LabelEncoder()
encoded_labels = torch.tensor(label_encoder.fit_transform(df['IsHelpful'].tolist()))

In [26]:
# Create DataLoader
dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], encoded_labels)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])


In [27]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [30]:
# Load pre-trained DistilBERT model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
optimizer = Adam(model.parameters(), lr=5e-5)
loss_fn = CrossEntropyLoss()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
# Training loop (example - adjust as needed)
epochs = 3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

Das läuft jetzt auch schon seit über 5 std, nice

Muss mal Tensorflow GPU aufsetzen maybe, aber hatte da noch keinen Nerv zu, da das über WSL etc. läuft

In [32]:
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{epochs}, Accuracy: {accuracy}')