# Social Media Computing: Assignment

## Title: Apple iPhone Reviews Sentiment Analysis

| Members                         |Student ID  |
|---------------------------------|------------|
| Aqra Alisa binti Rashidi        | 1211103093 |
| Nurul Aqilah binti Mohd Shariff | 1211103097 |


### 1. Data Cleaning

In [None]:
import sys

# Resolve where the dataset lives: the working directory for a local run,
# or the mounted Google Drive when the notebook executes inside Colab.
if 'google.colab' not in sys.modules:
    csv_path = 'iphone.csv'
else:
    from google.colab import drive
    drive.mount('/content/drive')

    csv_path = '/content/drive/MyDrive/iphone.csv'

In [None]:
import pandas as pd
# load the reviews from csv_path and preview the first rows
df = pd.read_csv(csv_path)
df.head()

In [None]:
df.shape

In [None]:
# check for any missing values

df.isna().sum()

In [None]:
# remove missing values
# note: dropna() is called without `subset`, so a row is discarded when ANY
# column is missing; the second line re-checks that no nulls remain

df = df.dropna()
df.isna().sum()

In [None]:
# check for any duplicate observations
# (this only reports the count; duplicated rows are not removed here)

df.duplicated().sum()

In [None]:
# inspect candidate columns before removing the unnecessary ones

print(df['reviewUrl'].unique())

In [None]:
print(df['reviewedIn'].unique())

In [None]:
print(df['variantAsin'].unique())

In [None]:
print(df['productAsin'].unique())

In [None]:
# discard the URL / locale / ASIN columns; they are not used by the analysis below
df = df.drop(columns=['reviewUrl', 'reviewedIn', 'variantAsin', 'productAsin'])

In [None]:
# combine columns (reviewTitle & reviewDescription) into one 'review' text column

text_columns = ['reviewTitle', 'reviewDescription']
df['review'] = df[text_columns[0]] + ' ' + df[text_columns[1]]
df = df.drop(columns=text_columns)

print(df['review'])

In [None]:
print(df['variant'].unique())

In [None]:
# extract color and size from variant into 2 new columns

import re

def extract_color(variant):
    """Return the colour named in a variant string, title-cased.

    Looks for "Color:" / "Colour:" (case-insensitive) and captures the run of
    letters, whitespace and parentheses that follows it. Because the capture
    stops only at a character outside that set, text glued to the colour
    (e.g. "BlueSize") is returned as part of the value.

    Parameters:
        variant: the raw variant text; anything that is not a string
            (None, NaN, numbers) is treated as "no colour".

    Returns:
        The stripped, title-cased capture, or None when no colour is found.
    """
    # missing values arrive as None/float NaN; re.search would raise on them
    if not isinstance(variant, str):
        return None
    match = re.search(r'(?:Color|Colour):\s*([A-Za-z\s()]+)', variant, re.IGNORECASE)
    return match.group(1).strip().title() if match else None

def extract_size(variant):
    """Return the storage size named in a variant string, without spaces.

    Looks for "Size:" (case-insensitive) followed by digits and a GB or TB
    unit, e.g. "Size: 128 GB" -> "128GB" and "Size: 1 TB" -> "1TB".

    Parameters:
        variant: the raw variant text; anything that is not a string
            (None, NaN, numbers) is treated as "no size".

    Returns:
        The matched size with all whitespace removed (original letter case is
        kept), or None when no size is found.
    """
    # missing values arrive as None/float NaN; re.search would raise on them
    if not isinstance(variant, str):
        return None
    # [GT]B: accept terabyte capacities as well as gigabyte ones
    match = re.search(r'Size:\s*(\d+\s*[GT]B)', variant, re.IGNORECASE)
    return re.sub(r'\s+', '', match.group(1)) if match else None

# derive both new columns from the same source column, normalising as we go
variant_column = df['variant']

df['color'] = variant_column.apply(extract_color).str.strip().str.title()
df['size'] = variant_column.apply(extract_size).str.replace(' ', '').str.upper()

In [None]:
# clean color col

import re

def clean_color(color):
    """Tidy an extracted colour string.

    Removes the "(Product)" marker and any leftover "size" text
    (case-insensitive), collapses runs of whitespace to a single space, then
    strips and title-cases the result. Missing values yield None.
    """
    if pd.isna(color):
        return None
    # (pattern, replacement) pairs, applied in order
    substitutions = (
        (r'\(Product\)', ''),
        (r'size', ''),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        color = re.sub(pattern, replacement, color, flags=re.IGNORECASE)
    return color.strip().title()

# run the cleaner over every extracted colour value
df['color'] = df['color'].apply(clean_color)

In [None]:
# keep a copy of the combined review text under a separate column name
df["raw_text"] = df["review"]

In [None]:
# assign sentiment label

def label_sentiment(rating):
    """Map a star rating to a sentiment label.

    4 and above -> "positive", exactly 3 -> "neutral", anything else -> "negative".
    """
    if rating == 3:
        return "neutral"
    return "positive" if rating >= 4 else "negative"

# label every review from its star rating
df["sentiment"] = df["ratingScore"].apply(label_sentiment)

In [None]:
df.head()

### 2. Data Pre-processing

- Lowercase text
- Remove punctuation & special characters
- Tokenization
- Stopword Removal
- Stemming
- Rejoin Tokens

In [None]:
pip install nltk

In [None]:
# lowercase
# the result goes into a new 'cleaned_review' column, so 'review' keeps the original text

df['cleaned_review'] = df['review'].str.lower()
print(df['cleaned_review'])

In [None]:
# remove punctuation and special characters
# (everything that is neither a word character nor whitespace is deleted)

import re
non_word_chars = re.compile(r'[^\w\s]')
df['cleaned_review'] = df['cleaned_review'].apply(lambda text: non_word_chars.sub('', text))
print(df['cleaned_review'])

In [None]:
# tokenization

from nltk.tokenize import word_tokenize
import nltk
# fetch the 'punkt_tab' resource before tokenizing
nltk.download('punkt_tab')

# each review becomes a list of word tokens in a new 'tokens' column
df['tokens'] = df['cleaned_review'].apply(word_tokenize)
print(df['tokens'])

In [None]:
# stopword removal
# NOTE(review): the stock English list presumably contains negations such as
# "not"/"no" — confirm, since removing them can blur sentiment cues

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # a set, for fast membership tests

# keep only the tokens that are not in the stop-word set
df['tokens'] = df['tokens'].apply(lambda tokens: [w for w in tokens if w not in stop_words])
print(df['tokens'])

In [None]:
# stemming

from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

# replace every token of every review with its Porter stem
df['tokens'] = df['tokens'].apply(lambda words: list(map(stemmer.stem, words)))
print(df['tokens'])

In [None]:
# rejoin tokens into 'cleaned_review' and drop 'tokens'

df['cleaned_review'] = df['tokens'].apply(' '.join)
df = df.drop(columns=['tokens'])

In [None]:
df.head()

### 3. Data Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# turn the string labels into integer codes for the models
df["sentiment_encoded"] = le.fit_transform(df['sentiment'])  # will be 0=negative, 1=neutral, 2=positive
# print the learned class order to confirm the mapping stated above
print(le.classes_)

### 4. Feature Extraction & Data Splitting

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 3000 terms (max_features=3000)
vectorizer = TfidfVectorizer(max_features=3000)
# note: fit_transform runs on every review in df, so the vocabulary and IDF
# weights behind X are learned from all rows
X = vectorizer.fit_transform(df['cleaned_review'])
# target: the integer-encoded sentiment label
y = df['sentiment_encoded']

In [None]:
from sklearn.model_selection import train_test_split

# Split the review text first, then learn the TF-IDF vocabulary and IDF weights
# from the training reviews only, so no statistics from the test reviews leak
# into the features. `stratify=y` keeps the sentiment class proportions the
# same in both parts. The 80/20 ratio and the seed are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_review'], y, test_size=0.2, random_state=42, stratify=y
)
X_train = vectorizer.fit_transform(text_train)  # fit on train only
X_test = vectorizer.transform(text_test)        # reuse the train vocabulary

### 5. Traditional ML Models

#### 5.1 Logistic Regression

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression


# logistic Regression
# baseline model: max_iter set to 1000, every other setting left at its default

lr = LogisticRegression(max_iter=1000)
lr1= lr.fit(X_train, y_train)
y_pred_lr = lr1.predict(X_test)
print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr))

# confusion matrix of test labels vs. predictions; trailing ';' hides the repr
conf_mat_lr = confusion_matrix(y_test, y_pred_lr)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_lr).plot();

#### 5.2 Logistic Regression Tuning

In [None]:
# !! run once and restart once !!
!pip install numpy==1.26.4

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV

# logistic regression with hyperparameter tuning
# search space: 10 log-spaced C values between 1 and 1e4, both penalties, two solvers
C = np.logspace(0, 4, num=10)
penalty = ['l1', 'l2']
solver = ['liblinear', 'saga']
hyperparameters = dict(C=C, penalty=penalty, solver=solver)

# random_state fixes which candidates are sampled, so the selected model is the
# same on every fresh run of the notebook
randomizedsearch = RandomizedSearchCV(lr, hyperparameters, random_state=42)
lr2 = randomizedsearch.fit(X_train, y_train)
best_lr = lr2.best_estimator_
# predicting through the fitted search object uses its refitted best estimator
y_pred_lr2 = lr2.predict(X_test)
print("Logistic Regression After Tuning:")
print(classification_report(y_test, y_pred_lr2))

conf_mat_lr2 = confusion_matrix(y_test, y_pred_lr2)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_lr2).plot();

#### 5.3 Support Vector Machine

In [None]:
from sklearn.svm import SVC

# support vector machine
# baseline model: SVC with all settings left at their defaults

svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("SVM:")
print(classification_report(y_test, y_pred_svm))

# confusion matrix of test labels vs. predictions; trailing ';' hides the repr
conf_mat_svm = confusion_matrix(y_test, y_pred_svm)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_svm).plot();

#### 5.4 Support Vector Machine Tuning

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# svm with hyperparameter tuning
# exhaustive grid: 3 C values x 3 kernels x 2 gamma settings = 18 candidates

param_grid = {'C': [0.1, 1, 10],'kernel': ['linear', 'rbf', 'poly'],'gamma': ['scale', 'auto']
}

# every candidate is scored by macro F1 over 5 folds, on all available cores (n_jobs=-1)
grid_svm= GridSearchCV(estimator=svm, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=2, n_jobs=-1)
grid_svm.fit(X_train, y_train)

# evaluate the best configuration found on the held-out test set
best_svm = grid_svm.best_estimator_
y_pred_svm2 = best_svm.predict(X_test)

print("SVM After Tuning:")
print(classification_report(y_test, y_pred_svm2))

conf_mat_svm2 = confusion_matrix(y_test, y_pred_svm2)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_svm2).plot();

#### 5.5 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# random forest
# random_state pins the bootstrap samples and feature subsets, so the baseline
# scores are identical on every fresh run of the notebook

rf = RandomForestClassifier(random_state=42)
rf1 = rf.fit(X_train, y_train)
y_pred_rf1 = rf1.predict(X_test)
print("Random Forest:")
print(classification_report(y_test, y_pred_rf1))

# confusion matrix of test labels vs. predictions; trailing ';' hides the repr
conf_mat_rf1 = confusion_matrix(y_test, y_pred_rf1)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_rf1).plot();

#### 5.6 Random Forest Tuning

In [None]:
from scipy.stats import randint

# random forest with hyperparameter tuning
# 30 random draws: n_estimators from [50, 500), max_depth from [1, 20), 5-fold CV each

hype_range_rf1 = {'n_estimators' : randint(50,500), 'max_depth' : randint(1,20)}
# random_state fixes the sampled hyperparameter combinations so the search
# explores the same 30 candidates on every fresh run of the notebook
srch_hype_forest1 = RandomizedSearchCV(rf, param_distributions = hype_range_rf1, n_iter = 30, cv = 5,
                                       random_state = 42)
srch_hype_forest1.fit(X_train, y_train)
best_RForest1 = srch_hype_forest1.best_estimator_
y_predforest = best_RForest1.predict(X_test)

print("Random Forest After Tuning:")
print(classification_report(y_test, y_predforest))

conf_mat_rf2 = confusion_matrix(y_test, y_predforest)
ConfusionMatrixDisplay(confusion_matrix = conf_mat_rf2).plot();

In [None]:
from sklearn.metrics import classification_report

# save results
# output_dict=True returns each report as a dict, so single metrics can be
# looked up by key when the comparison table is built

report_lr = classification_report(y_test, y_pred_lr, output_dict=True)
report_lr_aft_tuning = classification_report(y_test, y_pred_lr2, output_dict=True)

report_svm = classification_report(y_test, y_pred_svm, output_dict=True)
report_svm_aft_tuning = classification_report(y_test, y_pred_svm2, output_dict=True)

report_rf = classification_report(y_test, y_pred_rf1, output_dict=True)
report_rf_aft_tuning = classification_report(y_test, y_predforest, output_dict=True)

In [None]:
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, accuracy_score

# k-fold cross validation
# 5 shuffled, stratified folds with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# accuracy plus macro-averaged precision / recall / F1 for every fold
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision_macro': make_scorer(precision_score, average='macro'),
    'recall_macro': make_scorer(recall_score, average='macro'),
    'f1_macro': make_scorer(f1_score, average='macro')
}

# cross-validate the three tuned models on the training split only
# note: the results are stored in cv_results_* but not displayed in this cell
cv_results_lr = cross_validate(best_lr, X_train, y_train, cv=cv, scoring=scoring)
cv_results_svm = cross_validate(best_svm, X_train, y_train, cv=cv, scoring=scoring)
cv_results_rf = cross_validate(best_RForest1, X_train, y_train, cv=cv, scoring=scoring)

### Transformers: DistilBERT

In [None]:
pip install transformers datasets scikit-learn torch

In [None]:
print(df.columns)

In [None]:
# convert df to hugging face dataset

from datasets import Dataset

# keep only the text and the integer label, renamed to 'text' and 'label'
dataset = Dataset.from_pandas(df[['cleaned_review', 'sentiment_encoded']])
dataset = dataset.rename_columns({'cleaned_review': 'text', 'sentiment_encoded': 'label'})

# seed the shuffle so the 80/20 train/test partition is the same on every run
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(dataset)

In [None]:
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch with the DistilBERT tokenizer.

    Called with padding='max_length' and truncation=True; returns whatever the
    tokenizer returns for that batch.
    """
    return tokenizer(examples['text'], padding='max_length', truncation=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# "__index_level_0__" only exists when Dataset.from_pandas carried the DataFrame
# index over, so remove just the helper columns that are actually present
# instead of failing on a missing name.
present_columns = tokenized_dataset["train"].column_names
unused_columns = [name for name in ("text", "__index_level_0__") if name in present_columns]
tokenized_dataset = tokenized_dataset.remove_columns(unused_columns)
# the column is renamed to "labels" and only the three tensors listed below are
# exposed, in torch format
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
tokenized_dataset.set_format("torch", columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# The data was already split 80/20 before tokenization. Reuse that partition
# instead of splitting the training part a second time, which left the held-out
# 20% unused and trained the model on only 64% of the reviews.
dataset_split = tokenized_dataset

train_dataset = dataset_split['train']
test_dataset = dataset_split['test']

In [None]:
# !! only run once and restart session once !!
# NOTE(review): `datasets` is already imported in an earlier cell, so on a fresh
# top-to-bottom run this upgrade only takes effect after the restart above.

!pip install -U datasets

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments

# transformer: DistilBERT
# load the pretrained checkpoint with a 3-way classification head (num_labels=3)

model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=3
)

def compute_metrics(eval_pred):
    """Return accuracy and macro F1 for one evaluation pass.

    eval_pred is unpacked into (logits, labels); the predicted class is the
    argmax of the logits along axis 1 (assumes logits is 2-D, one row per
    example — TODO confirm).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="macro")
    }

# 2 epochs, batch size 16 for training and evaluation, loss logged every step,
# external experiment reporting disabled
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=1,
    report_to='none'
)

# trainer
# compute_metrics is passed in so evaluation reports the accuracy and macro F1
# it calculates; without it the function above is never called
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# train the model
trainer.train()

In [None]:
# evaluate the transformer

trainer.evaluate()

In [None]:
predictions = trainer.predict(test_dataset)

In [None]:
import numpy as np
# predicted class = highest-scoring column of the prediction array
y_pred_bert = np.argmax(predictions.predictions, axis=1)
y_true_bert = predictions.label_ids
# dict form, so individual metrics can be looked up by key for the comparison table
report_bert = classification_report(y_true_bert, y_pred_bert, output_dict=True)

In [None]:
# show results in a table (ml models & transformer)

import pandas as pd

# (row label, classification_report dict) pairs, in the order the rows appear
model_reports = [
    ("Logistic Regression", report_lr),
    ("Logistic Regression Tuned", report_lr_aft_tuning),
    ("SVM", report_svm),
    ("SVM Tuned", report_svm_aft_tuning),
    ("Random Forest", report_rf),
    ("Random Forest Tuned", report_rf_aft_tuning),
    ("DistilBERT", report_bert),
]

# table column -> key inside each report's "macro avg" entry
macro_metrics = [
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1-Score", "f1-score"),
]

data = {
    "Model": [label for label, _ in model_reports],
    "Accuracy": [report["accuracy"] for _, report in model_reports],
}
for column, report_key in macro_metrics:
    data[column] = [report["macro avg"][report_key] for _, report in model_reports]

df_performance = pd.DataFrame(data)
df_performance

### Aspect-Based Sentiment Analysis (ABSA)

In [None]:
df.info()

In [None]:
# !! only run once and restart session once !!

!pip install spacy

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import pandas as pd
import spacy
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# `nltk` itself is imported in the tokenization cell earlier in the notebook
nltk.download('vader_lexicon')

# spaCy pipeline (used for sentences and noun chunks) and the VADER scorer
nlp = spacy.load("en_core_web_sm")
vader = SentimentIntensityAnalyzer()

# function to extract noun phrases (aspects)
def extract_aspects_and_sentiment(text):
    """Return (aspect, sentiment) pairs found in `text`.

    The text is parsed with the spaCy pipeline `nlp`. Every sentence gets one
    label from its VADER compound score (above 0.2 positive, below -0.2
    negative, otherwise neutral), and each lower-cased noun chunk of at most
    two words in that sentence is paired with the sentence's label.
    """
    def polarity_label(score):
        if score > 0.2:
            return 'positive'
        if score < -0.2:
            return 'negative'
        return 'neutral'

    pairs = []
    for sentence in nlp(text).sents:
        label = polarity_label(vader.polarity_scores(sentence.text)['compound'])
        lowered_chunks = (chunk.text.lower() for chunk in sentence.noun_chunks)
        pairs.extend((noun, label) for noun in lowered_chunks if len(noun.split()) <= 2)
    return pairs

# Extract aspects from the original review text, not 'cleaned_review'. The
# cleaned column has been lower-cased, stripped of punctuation and stop words
# and stemmed, so it has no sentence boundaries and no intact words for the
# spaCy parser and the VADER scorer to work with.
df['aspects'] = df['review'].apply(extract_aspects_and_sentiment)
df[['review', 'aspects','sentiment', 'sentiment_encoded']].head(20)

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt

# flatten aspects
# every new aspect starts with a zero count for each of the three labels
aspect_sentiment_counter = defaultdict(lambda: {'positive': 0, 'negative': 0, 'neutral': 0})

# tally how often each aspect was seen with each sentiment label
for aspects in df['aspects']:
    for noun, sentiment in aspects:
        aspect_sentiment_counter[noun][sentiment] += 1

# transpose so that aspects become rows and the three labels become columns,
# then rank the aspects by their positive count
aspect_summary = pd.DataFrame(aspect_sentiment_counter).T
aspect_summary = aspect_summary.sort_values(by='positive', ascending=False)
aspect_summary.head(20)

In [None]:
aspect_summary.head(10).plot(kind='bar', stacked=True, figsize=(10,6))
plt.title("Top Aspects and Their Sentiment Distribution")
plt.ylabel("Count")
plt.xlabel("Aspect")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# expand list of tuples into rows: one row per (aspect, sentiment) pair
aspect_df = df.explode('aspects')
# A review with an empty aspect list explodes to a single NaN row. Drop those
# rows, otherwise the NaN sits among the tuples and the two-column expansion
# below fails.
aspect_df = aspect_df.dropna(subset=['aspects'])
aspect_df[['aspect', 'predicted_sentiment']] = pd.DataFrame(
    aspect_df['aspects'].tolist(),
    index=aspect_df.index,
    columns=['aspect', 'predicted_sentiment'],
)

In [None]:
from collections import Counter

# count total sentiments

def majority_sentiment(aspects):
    """Return the most frequent sentiment label among (aspect, sentiment) pairs.

    An empty list of pairs falls back to 'neutral'.
    """
    tally = Counter(label for _, label in aspects)
    if not tally:
        return 'neutral'  # fallback
    (top_label, _), = tally.most_common(1)
    return top_label

df['aspect_majority_sentiment'] = df['aspects'].apply(majority_sentiment)

comparison = pd.crosstab(df['sentiment'], df['aspect_majority_sentiment'])
print(comparison)

In [None]:
df.info()

### Result Visualizations

In [None]:
pip install wordcloud

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from datetime import datetime

In [None]:
# bar chart: model comparison

df_melted = df_performance.melt(id_vars='Model', var_name='Metric', value_name='Score')

plt.figure(figsize=(12, 6))
sns.barplot(data=df_melted, x='Model', y='Score', hue='Metric')
plt.title("Model Performance Comparison")
plt.ylim(0, 1)
plt.xticks(rotation=45)
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

In [None]:
# bar chart: sentiment distribution

plt.figure(figsize=(6,4))
sns.countplot(data=df, x='sentiment', palette={
    'positive': 'green',
    'neutral': 'gray',
    'negative': 'red'
})
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

In [None]:
# word clouds of the cleaned review text, one per sentiment class: positive, neutral & negative
# (built from whole reviews in 'cleaned_review', not from the extracted aspects)

for sentiment in ['positive', 'neutral', 'negative']:
    # concatenate every cleaned review carrying this label into one string
    text = " ".join(df[df['sentiment'] == sentiment]['cleaned_review'])
    wordcloud = WordCloud(width=1000, height=500, background_color='white').generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud - {sentiment.capitalize()} Sentiment')
    plt.show()

In [None]:
# bar chart: sentiment by verified and unverified users

plt.figure(figsize=(8,5))
sns.countplot(data=df, x='sentiment', hue='isVerified', palette='coolwarm')
plt.title('Sentiment by Verified/Unverified Reviewers')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.legend(title='Is Verified')
plt.show()

In [None]:
# bar chart: top 5 countries

top_countries = df['country'].value_counts().nlargest(5).index
df_top_countries = df[df['country'].isin(top_countries)]

plt.figure(figsize=(12,6))
sns.countplot(data=df_top_countries, x='country', hue='sentiment', palette='Set1')
plt.title('Sentiment Distribution by Top 5 Countries')
plt.xlabel('Country')
plt.ylabel('Count')
plt.legend(title='Sentiment')
plt.show()

In [None]:
print(df['color'].unique())

In [None]:
# bar chart: sentiment by color

df_color = df.dropna(subset=['color'])

plt.figure(figsize=(12,6))
sns.countplot(data=df_color, x='color', hue='sentiment',
              palette={'positive': 'green', 'neutral': 'gray', 'negative': 'red'},
              order=df_color['color'].value_counts().index)
plt.title('Sentiment Distribution by Product Color')
plt.xlabel('Color')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')
plt.show()

In [None]:
print(df['size'].unique())

In [None]:
# bar chart: sentiment by size

plt.figure(figsize=(10, 6))
sns.countplot(data=df, x='size', hue='sentiment',
              palette={'positive': 'green', 'neutral': 'gray', 'negative': 'red'})
plt.title('Sentiment Distribution by Size')
plt.xlabel('Product Size')
plt.ylabel('Number of Reviews')
plt.xticks(rotation=45)
plt.legend(title='Sentiment')
plt.tight_layout()
plt.show()