In [54]:
import warnings
warnings.filterwarnings('ignore')

#General Data/Plotting
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from tqdm.auto import tqdm 
import random

# Language
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import word_tokenize
import re 
from collections import Counter
from string import punctuation
import textstat # pip install textstat

# Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_score, recall_score , f1_score, accuracy_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential 
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense , Embedding , Bidirectional , LSTM

# Shared WordNet lemmatizer instance; DataPrep() below uses it to lemmatize tokens.
lemma = WordNetLemmatizer()

In [55]:
# Load the review dump; lines=True reads the file as one JSON object per line.
reviews_path = '../Datasets/Cell_Phones_and_Accessories_5.json'
df = pd.read_json(reviews_path, lines=True)

In [56]:
# Discard the identifier, summary and timestamp columns; the remaining columns are kept.
dropped_columns = ['reviewerID', 'asin', 'reviewerName', 'summary', 'unixReviewTime', 'reviewTime']
df = df.drop(columns=dropped_columns)

In [57]:
# Rename the review text and star-rating columns to the names used in the rest of the notebook.
column_names = {"reviewText": "Review", "overall": "Rating"}
df = df.rename(columns=column_names)
df.columns

Index(['helpful', 'Review', 'Rating'], dtype='object')

In [58]:
# Number of whitespace-separated tokens in each review.
df['review_len'] = df['Review'].map(lambda text: len(text.split()))
df.head()

Unnamed: 0,helpful,Review,Rating,review_len
0,"[0, 0]",They look good and stick good! I just don't li...,4,37
1,"[0, 0]",These stickers work like the review says they ...,5,32
2,"[0, 0]",These are awesome and make my phone look so st...,5,34
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23


##### ARI (Automated Readability Index)

In [59]:
def calculate_ari(text):
    """Return the Automated Readability Index of ``text`` as computed by textstat."""
    return textstat.automated_readability_index(text)

In [60]:
# Score every review with calculate_ari and store the result in the 'ARI' column.
df['ARI'] = df['Review'].map(calculate_ari)

In [61]:
# Preview the first rows, now including the 'ARI' column.
df.head()

Unnamed: 0,helpful,Review,Rating,review_len,ARI
0,"[0, 0]",They look good and stick good! I just don't li...,4,37,4.2
1,"[0, 0]",These stickers work like the review says they ...,5,32,3.3
2,"[0, 0]",These are awesome and make my phone look so st...,5,34,2.7
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51,5.7
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23,5.3


In [62]:
# Assumption: df is the DataFrame holding the readability score in the 'ARI' column.

# Count the reviews whose ARI value is strictly below 8.
low_ari_reviews = df.loc[df['ARI'] < 8]
num_low_ari_reviews = low_ari_reviews.shape[0]

print(f"Anzahl der Reviews mit ARI < 8: {num_low_ari_reviews}")


Anzahl der Reviews mit ARI < 8: 140418


In [63]:
# Count the reviews whose ARI value is strictly above 8.
# The result is stored under "high" names and labelled "> 8": the filter selects the
# high-ARI reviews, so reusing the low_ari_* names and a "< 8" label would both mislabel
# the count and overwrite the low-ARI selection.
# Reviews with an ARI of exactly 8 are matched by neither the "< 8" nor the "> 8" filter.
high_ari_reviews = df[df['ARI'] > 8]
num_high_ari_reviews = len(high_ari_reviews)

print(f"Anzahl der Reviews mit ARI > 8: {num_high_ari_reviews}")

Anzahl der Reviews mit ARI < 8: 52378


In [64]:
def convert_ari(df):
    """Map a row's ARI score to a binary readability flag.

    ``df`` is a single row (anything indexable by ``'ARI'``). Returns 1 (readable)
    when the score is at most 8.0 and 0 (not readable) otherwise.
    """
    return 1 if df['ARI'] <= 8.0 else 0

In [65]:
# Apply convert_ari to every row (axis=1) to get the binary 'IsReadable' flag:
# 1 when the row's ARI is <= 8.0, otherwise 0.
df['IsReadable'] = df.apply(convert_ari, axis=1)
df.head()

Unnamed: 0,helpful,Review,Rating,review_len,ARI,IsReadable
0,"[0, 0]",They look good and stick good! I just don't li...,4,37,4.2,1
1,"[0, 0]",These stickers work like the review says they ...,5,32,3.3,1
2,"[0, 0]",These are awesome and make my phone look so st...,5,34,2.7,1
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51,5.7,1
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23,5.3,1


In [66]:
# Split the two-element 'helpful' entries into one column per position:
# helpful_0 takes element 0 and helpful_1 takes element 1.
for position, column in enumerate(('helpful_0', 'helpful_1')):
    df[column] = df['helpful'].map(lambda votes, i=position: votes[i])

In [67]:
# Keep only the rows where helpful_1 is at least 2 (presumably the total number of
# helpfulness votes — confirm against the dataset description). helpful_1 is the
# denominator of the ratio computed further down, so this also rules out division by zero.
# .copy() makes the filtered result an independent frame, so the column assignments that
# follow write to it directly instead of to a slice of the unfiltered frame (the
# SettingWithCopyWarning this would otherwise raise is hidden by the global warnings filter).
df = df[df['helpful_1'] >= 2].copy()
df.head()

Unnamed: 0,helpful,Review,Rating,review_len,ARI,IsReadable,helpful_0,helpful_1
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51,5.7,1,4,4
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23,5.3,1,2,3
5,"[1, 2]",These make using the home button easy. My daug...,3,23,2.1,1,1,2
7,"[1, 2]",it worked for the first week then it only char...,1,20,0.1,1,1,2
8,"[2, 3]","Good case, solid build. Protects phone all aro...",5,44,3.5,1,2,3


In [68]:
def convert_label(df):
    """Map a row's helpfulness percentage to a binary label.

    ``df`` is a single row (anything indexable by ``'ratio_percent'``). Returns 0
    when the percentage is at most 70 and 1 otherwise.
    """
    return 0 if df['ratio_percent'] <= 70 else 1

In [69]:
# Fraction helpful_0 / helpful_1 for each review.
df['helpful_ratio'] = df['helpful_0'].div(df['helpful_1'])

In [70]:
# The ratio as a whole-number percentage; astype(int) truncates the fractional part.
df['ratio_percent'] = df['helpful_ratio'].mul(100).astype(int)

In [71]:
# Apply convert_label to every row (axis=1) to get the binary 'IsHelpful' target:
# 0 when ratio_percent <= 70, otherwise 1.
df['IsHelpful'] = df.apply(convert_label, axis = 1)
df.head()

Unnamed: 0,helpful,Review,Rating,review_len,ARI,IsReadable,helpful_0,helpful_1,helpful_ratio,ratio_percent,IsHelpful
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51,5.7,1,4,4,1.0,100,1
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23,5.3,1,2,3,0.666667,66,0
5,"[1, 2]",These make using the home button easy. My daug...,3,23,2.1,1,1,2,0.5,50,0
7,"[1, 2]",it worked for the first week then it only char...,1,20,0.1,1,1,2,0.5,50,0
8,"[2, 3]","Good case, solid build. Protects phone all aro...",5,44,3.5,1,2,3,0.666667,66,0


### Data Preprocessing :ghost:

In [72]:
def DataPrep(text):
    """Normalise a raw review into a space-joined string of lemmatized tokens.

    Steps: strip digits and every character that is neither a word character nor
    whitespace, tokenize with NLTK, drop tokens that are a single punctuation
    character, lowercase, drop English stop words, lemmatize with the notebook-level
    ``lemma`` instance.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The remaining lowercase lemmas joined by single spaces.
    """
    text = re.sub(r'\d+', '', text) # numbers
    text = re.sub(r'[^\w\s]', '', text) # special characters

    # tokenization
    tokens = nltk.word_tokenize(text)

    punc = set(punctuation)
    stop_words = set(stopwords.words('english'))

    words = []
    for token in tokens:
        # remove puncs ('_' survives the regex above because it counts as a word character)
        if token in punc:
            continue
        # Lowercase BEFORE the stop-word test: the stop-word list is lowercase, so
        # testing the original-case token would let "The", "And", "I", ... through.
        token = token.lower()
        if token in stop_words:
            continue
        # lemmatization
        words.append(lemma.lemmatize(token))

    return ' '.join(words)

In [73]:
# More Data Prep

def clean_text(text):
    """Strip HTML tags and every character that is not an ASCII letter or whitespace, then lowercase."""
    # First pattern removes HTML tags, second removes special characters and digits.
    for pattern in (r'<.*?>', r'[^a-zA-Z\s]'):
        text = re.sub(pattern, '', text)
    return text.lower()

def tokenize_text(text):
    """Split ``text`` into tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not an NLTK English stop word.

    Surviving tokens keep their original casing and order.
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token.lower() in english_stopwords:
            continue
        kept.append(token)
    return kept

def stem_text(tokens):
    """Return the Porter stem of every token, in order."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def lemmatize_text(tokens):
    """Return the WordNet lemma of every token, in order."""
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))


# Full Function
def preprocess_text(text):
    """Run the full cleaning pipeline on one review and return a space-joined string.

    Pipeline: clean_text -> tokenize_text -> remove_stopwords -> lemmatize_text.
    """
    tokens = remove_stopwords(tokenize_text(clean_text(text)))
    # Use either stemming (stem_text) or lemmatization, not both; lemmatization is used here.
    return ' '.join(lemmatize_text(tokens))

In [74]:
#df['cleaned_reviews'] = df['Review'].apply(DataPrep)

# Run every review through the preprocess_text pipeline.
df['cleaned_reviews'] = df['Review'].map(preprocess_text)

In [75]:
# Preview the first rows, now including the 'cleaned_reviews' column.
df.head()

Unnamed: 0,helpful,Review,Rating,review_len,ARI,IsReadable,helpful_0,helpful_1,helpful_ratio,ratio_percent,IsHelpful,cleaned_reviews
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,51,5.7,1,4,4,1.0,100,1,item arrived great time perfect condition howe...
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,23,5.3,1,2,3,0.666667,66,0,awesome stay look great used multiple apple pr...
5,"[1, 2]",These make using the home button easy. My daug...,3,23,2.1,1,1,2,0.5,50,0,make using home button easy daughter like woul...
7,"[1, 2]",it worked for the first week then it only char...,1,20,0.1,1,1,2,0.5,50,0,worked first week charge phone waste money
8,"[2, 3]","Good case, solid build. Protects phone all aro...",5,44,3.5,1,2,3,0.666667,66,0,good case solid build protects phone around go...


In [76]:
# Rows whose cleaned text repeats an earlier row's cleaned text.
n_duplicates = int(df["cleaned_reviews"].duplicated().sum())
print(f'There are around {n_duplicates} duplicated reviews, we will remove them.')

There are around 90 duplicated reviews, we will remove them.


In [77]:
# Keep only the first occurrence of each distinct cleaned review.
df = df.drop_duplicates(subset="cleaned_reviews")

### Split Data

In [78]:
#x_train, x_val, y_train, y_val, read_train, read_val = train_test_split(df['cleaned_reviews'], df['IsHelpful'], df['IsReadable'], train_size=0.80, random_state=42)
# 80/20 train/test split with a fixed seed; the features are the cleaned text plus the
# readability flag, the target is the helpfulness label.
features = df[['cleaned_reviews', 'IsReadable']]
target = df['IsHelpful']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

In [79]:
#len(x_train), len(x_val)
# Number of rows in the training and test splits.
len(X_train), len(X_test)

(22428, 5608)

### Feature Extraction

In [80]:
#vec = TfidfVectorizer()
#vec.fit(x_train)
#print("No. of feature words: ",len(vec.get_feature_names_out()))

# TF-IDF vectorisation of the 'cleaned_reviews' column: the vectorizer is fitted on the
# training reviews only and then reused unchanged to encode the test reviews.
train_corpus = X_train['cleaned_reviews']
test_corpus = X_test['cleaned_reviews']

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(train_corpus)
X_test_tfidf = vectorizer.transform(test_corpus)

In [81]:
#x_train = vec.transform(x_train).toarray()
#x_val = vec.transform(x_val).toarray()

# Append the 'IsReadable' column to the TF-IDF matrices.
# NOTE: .toarray() materialises the whole TF-IDF matrix densely, which is memory-hungry
# for a large vocabulary.
def _with_readability(tfidf_matrix, split):
    """Return a dense frame of the TF-IDF columns followed by the split's 'IsReadable' column."""
    tfidf_frame = pd.DataFrame(tfidf_matrix.toarray())
    # Reset the index so the flag lines up with the 0..n-1 index of the TF-IDF frame.
    readable = split['IsReadable'].reset_index(drop=True)
    return pd.concat([tfidf_frame, readable], axis=1)

X_train_final = _with_readability(X_train_tfidf, X_train)
X_test_final = _with_readability(X_test_tfidf, X_test)


In [82]:
#x_train.shape , x_val.shape

# (rows, columns) of the combined training and test feature frames.
X_train_final.shape, X_test_final.shape

((22428, 78844), (5608, 78844))

In [83]:
# Add 'ARI' feature to the TF-IDF transformed data
#x_train_combined = np.column_stack((x_train, read_train))
#x_val_combined = np.column_stack((x_val, read_val))

In [84]:
#x_train_combined.shape, x_val_combined.shape

In [85]:
#Feature Extraction on combined array
#vec = TfidfVectorizer()
#vec.fit(x_train_combined)
#print("No. of feature words: ",len(vec.get_feature_names_out()))

### Logistic Regression

In [86]:
# Convert the column names to strings: the frame mixes the integer labels of the
# TF-IDF columns with the string label 'IsReadable'.
X_train_final.columns = X_train_final.columns.astype(str)

In [87]:
# Model mit ARI
#lr = LogisticRegression(random_state=42)
#lr.fit(x_train_combined, y_train)

# Logistic regression on the TF-IDF columns plus the 'IsReadable' flag, with a fixed seed.
# NOTE(review): max_iter is left at the library default and warnings are silenced at the
# top of the notebook, so a non-convergence warning would not be visible — confirm.
model = LogisticRegression(random_state=42)
model.fit(X_train_final, y_train)

In [88]:
#train_acc1 = lr.score(x_train_combined , y_train)

# Convert the column names to strings, as was done for the training frame
X_test_final.columns = X_test_final.columns.astype(str)

# Predictions on the test set
y_pred = model.predict(X_test_final)

In [89]:
# Evaluate the predictions currently held in y_pred against the test labels.
classification_report_result = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(f'Genauigkeit (Accuracy): {accuracy}')
print('Classification Report:', classification_report_result, sep='\n')

Genauigkeit (Accuracy): 0.6207203994293866
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.40      0.48      2393
           1       0.64      0.78      0.70      3215

    accuracy                           0.62      5608
   macro avg       0.61      0.59      0.59      5608
weighted avg       0.61      0.62      0.61      5608



In [90]:
#lr_pred = lr.predict(x_val_combined)

#val_acc1 = accuracy_score(y_val , lr_pred) 

#val_precision1 = precision_score(y_val , lr_pred , average='weighted')
#val_recall1 = recall_score(y_val , lr_pred , average='weighted')
#val_f1score1 = f1_score(y_val , lr_pred , average='weighted')

In [91]:
#print(f"The training accuracy for logistic regression : {(train_acc1*100):0.2f}%\n")
#print(f"The validation accuracy for logistic regression : {(val_acc1*100):0.2f}%\n")
#print(f"The precision for logistic regression : {val_precision1:0.2f}\n")
#print(f"The recall for logistic regression : {val_recall1:0.2f}\n")
#print(f"The f1 score for logistic regression : {val_f1score1:0.2f}\n")

### Random Forest

In [96]:
# Fit a random forest on the same feature frame as the logistic regression.
# random_state is fixed to 42, matching the train/test split and the logistic regression,
# so the forest's random sampling — and therefore the reported metrics — are reproducible
# across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_final, y_train)

In [97]:
#train_acc2 = rf.score(x_train_combined , y_train)

# Predictions on the test set (reuses the name y_pred, replacing the logistic-regression predictions)
y_pred = rf.predict(X_test_final)

In [98]:
# Evaluation metrics for the predictions currently held in y_pred.
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

report_sections = (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", classification_rep),
)
for heading, value in report_sections:
    print(heading, value)

Accuracy: 0.6137660485021398
Confusion Matrix:
 [[ 719 1674]
 [ 492 2723]]
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.30      0.40      2393
           1       0.62      0.85      0.72      3215

    accuracy                           0.61      5608
   macro avg       0.61      0.57      0.56      5608
weighted avg       0.61      0.61      0.58      5608



In [94]:
#rf_pred = rf.predict(x_val_combined)

#val_acc2 = accuracy_score(y_val , rf_pred) 

#val_precision2 = precision_score(y_val , rf_pred , average='weighted')
#val_recall2 = recall_score(y_val , rf_pred , average='weighted')
#val_f1score2 = f1_score(y_val , rf_pred , average='weighted')

In [95]:
#print(f"The training accuracy for Random Forest : {(train_acc2*100):0.2f}%\n")
#print(f"The validation accuracy for Random Forest : {(val_acc2*100):0.2f}%\n")
#print(f"The precision for Random Forest : {val_precision2:0.2f}\n")
#print(f"The recall for Random Forest : {val_recall2:0.2f}\n")
#print(f"The f1 score for Random Forest : {val_f1score2:0.2f}\n")