In [1]:
import warnings
warnings.filterwarnings('ignore')

#General Data/Plotting
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from tqdm.auto import tqdm 
import random

# Language
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from nltk import pos_tag
from nltk.tokenize import word_tokenize

import re 
from collections import Counter
from string import punctuation

# Modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.metrics import precision_score, recall_score , f1_score, accuracy_score,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential 
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense , Embedding , Bidirectional , LSTM

# Shared WordNet lemmatizer instance; DataPrep calls lemma.lemmatize() on each token.
lemma = WordNetLemmatizer()

In [2]:
# Load the review dump (JSON Lines: one JSON record per line) into `df`.
reviews_path = '../Datasets/Cell_Phones_and_Accessories_5.json'
df = pd.read_json(reviews_path, lines=True)

In [3]:
# Rename the raw columns to more readable names.
df = df.rename(columns={"reviewText":"Review","overall": "Rating","summary":"Title"})

# Drop columns that are not needed for the analysis.
# Reassigning instead of using inplace=True keeps the statement free of
# hidden in-place mutation.
df = df.drop(columns=['reviewerID', 'asin', 'reviewerName', 'unixReviewTime', 'reviewTime'])

# Split the two-element `helpful` value into two new columns.
# NOTE(review): presumably helpful = [helpful votes, total votes] -- confirm
# against the dataset documentation.
df['helpful_0'] = df['helpful'].apply(lambda votes: votes[0])
df['helpful_1'] = df['helpful'].apply(lambda votes: votes[1])

# Keep only reviews whose second `helpful` entry is at least 2.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
df = df[df['helpful_1'] >= 2].copy()

# Helpfulness ratio and its integer percentage.
df['helpful_ratio'] = df['helpful_0'] / df['helpful_1']
# Integer floor arithmetic avoids float truncation artefacts: with floats,
# 29/100 -> 0.29 * 100 == 28.999999999999996 -> int 28 instead of 29.
# helpful_1 >= 2 is guaranteed by the filter above, so there is no division by zero.
df['ratio_percent'] = (df['helpful_0'] * 100 // df['helpful_1']).astype(int)

# Review length in whitespace-separated tokens.
df['review_len'] = [len(text.split()) for text in df['Review']]

In [4]:
def convert_label(df, threshold=70) :
    """Map one review row to a binary helpfulness label.

    Parameters
    ----------
    df : mapping-like row
        A single row (e.g. the Series passed by ``DataFrame.apply(axis=1)``)
        that provides a ``'ratio_percent'`` entry.
    threshold : number, default 70
        Largest ``ratio_percent`` that is still labelled as not helpful.

    Returns
    -------
    int
        0 (negative) if ``ratio_percent <= threshold``, otherwise 1 (positive).
    """
    return 0 if df['ratio_percent'] <= threshold else 1

In [5]:
# Derive the binary target column by running convert_label on every row.
df['IsHelpful'] = df.apply(func=convert_label, axis='columns')
df.head()

Unnamed: 0,helpful,Review,Rating,Title,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len,IsHelpful
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,4,4,1.0,100,51,1
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,2,3,0.666667,66,23,0
5,"[1, 2]",These make using the home button easy. My daug...,3,Cute,1,2,0.5,50,23,0
7,"[1, 2]",it worked for the first week then it only char...,1,not a good Idea,1,2,0.5,50,20,0
8,"[2, 3]","Good case, solid build. Protects phone all aro...",5,Solid Case,2,3,0.666667,66,44,0


In [6]:
def DataPrep(text) : 
    """Normalise a review text for modelling.

    Steps: strip digits, strip every character that is neither a word
    character nor whitespace, tokenize with ``nltk.word_tokenize``, drop
    single-character punctuation tokens, lower-case, drop English stopwords
    and lemmatize each remaining token with the module-level ``lemma``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces.
    """
    text = re.sub(r'\d+', '', text) # numbers
    text = re.sub(r'[^\w\s]', '', text) # special characters

    tokens = nltk.word_tokenize(text)

    # Sets give O(1) membership tests. After the regex above, the only
    # punctuation character that can still appear is '_' (it matches \w).
    punc = set(punctuation)
    stop_words = set(stopwords.words('english'))

    # NOTE(review): apostrophes are stripped before the stopword check, so
    # contractions such as "don't" arrive here as "dont" -- confirm whether
    # those should be filtered as well.
    words = []
    for token in tokens:
        if token in punc:
            continue
        # Lower-case BEFORE the stopword test: the stopword list is
        # lower-case, so capitalised stopwords ("The", "It") were previously
        # kept by mistake.
        lowered = token.lower()
        if lowered in stop_words:
            continue
        words.append(lemma.lemmatize(lowered))

    return ' '.join(words)

In [7]:
## Caution: the original author noted that this cell takes about 10 minutes
# Calculate the number of adjectives per review.
# Fetch the NLTK resources first: tokenizer, POS tagger and stopword list.
for resource in ('punkt', 'averaged_perceptron_tagger', 'stopwords'):
    nltk.download(resource)

def count_adjectives(text):
    """Count the adjectives in ``text``.

    The text is tokenized with ``word_tokenize``; purely alphabetic tokens
    are lower-cased, English stopwords are removed, and the remaining words
    are tagged with ``pos_tag``. Every word whose tag starts with ``'JJ'`` is
    counted.

    Parameters
    ----------
    text : str
        Review text to analyse.

    Returns
    -------
    int
        Number of tokens tagged as adjectives.
    """
    # Build the stopword set once per call. Evaluating
    # stopwords.words('english') inside the comprehension rebuilt the list for
    # every token and searched it linearly, which dominated the runtime.
    stop_words = set(stopwords.words('english'))

    words = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered not in stop_words:
            words.append(lowered)

    return sum(1 for _, tag in pos_tag(words) if tag.startswith('JJ'))

# Adjective count per review, then its share of the review length.
adjective_counts = df['Review'].apply(count_adjectives)
df['Adjective_Count'] = adjective_counts
df['Adjective_Ratio'] = adjective_counts / df['review_len']



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/laurareimann/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/laurareimann/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/laurareimann/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Turn the adjective ratio into an integer percentage.
# Missing ratios (NaN) are replaced with 0 before the integer cast.
adjective_ratio = df['Adjective_Ratio'].fillna(0)
df['Adjective_Ratio'] = adjective_ratio
df['adjective_percent'] = (adjective_ratio * 100).astype(int)

df.head()

Unnamed: 0,helpful,Review,Rating,Title,helpful_0,helpful_1,helpful_ratio,ratio_percent,review_len,IsHelpful,Adjective_Count,Adjective_Ratio,adjective_percent
3,"[4, 4]",Item arrived in great time and was in perfect ...,4,Cute!,4,4,1.0,100,51,1,5,0.098039,9
4,"[2, 3]","awesome! stays on, and looks great. can be use...",5,leopard home button sticker for iphone 4s,2,3,0.666667,66,23,0,3,0.130435,13
5,"[1, 2]",These make using the home button easy. My daug...,3,Cute,1,2,0.5,50,23,0,2,0.086957,8
7,"[1, 2]",it worked for the first week then it only char...,1,not a good Idea,1,2,0.5,50,20,0,1,0.05,5
8,"[2, 3]","Good case, solid build. Protects phone all aro...",5,Solid Case,2,3,0.666667,66,44,0,5,0.113636,11
