In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the reviews CSV and discard rows whose 'review_content' is missing,
# since those rows have no text to score.
reviews_df = (
    pd.read_csv('../dataset/air_nz_cleaned_data.csv')
    .dropna(subset=['review_content'])
)

reviews_df

In [None]:
# Text cleaning function
def clean_text(text):
    """Strip characters that carry no sentiment signal from a review string.

    Letters, digits, whitespace and the punctuation marks ``!``, ``?`` and
    ``.`` are kept; every other character is removed. Casing is left
    untouched. Runs of whitespace are then collapsed to a single space and
    the result is trimmed at both ends.

    Args:
        text: The raw text to clean. Non-string values (for example ``None``
            or the float ``NaN`` pandas uses for missing cells) are treated
            as empty text.

    Returns:
        The cleaned string, or ``''`` when ``text`` is not a string.
    """
    # Missing cells arrive as None/NaN when this is applied to a pandas
    # column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ''
    # Remove unwanted characters but keep punctuation marks important for sentiment
    text = re.sub(r'[^a-zA-Z0-9\s!?.]', '', text)
    # Remove extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

# Build the text that gets scored: review header followed by the review body.
# A missing header is replaced with empty text first, because concatenating
# NaN with a string yields NaN and would leave a non-string value in
# 'combined_content'. Cleaning is applied to the combined string so that the
# header and the body are normalised the same way (previously only the body
# was cleaned, due to operator precedence).
combined_raw = reviews_df['header'].fillna('').astype(str) + ' ' + reviews_df['review_content'].astype(str)
reviews_df['combined_content'] = combined_raw.apply(clean_text)
reviews_df

In [None]:
# Define sentiment analysis functions
def get_sentiment_textblob(text):
    """Label a piece of text from its TextBlob polarity score.

    Returns:
        'Positive' when the polarity is above 0, 'Neutral' when it is exactly
        0, and 'Negative' in every other case.
    """
    score = TextBlob(text).sentiment.polarity
    if score == 0:
        return 'Neutral'
    return 'Positive' if score > 0 else 'Negative'

# Label every review with TextBlob, scoring the combined header + body text
reviews_df['textblob_sentiment'] = reviews_df['combined_content'].map(get_sentiment_textblob)
reviews_df

In [None]:
# Number of reviews per TextBlob label (value_counts lists the most frequent label first)
sentiment_counts = reviews_df.loc[:, 'textblob_sentiment'].value_counts()
sentiment_counts

In [None]:
# Plot sentiment distribution
# value_counts() orders labels by frequency, so a positional colour list would
# attach the wrong colour to a label whenever the frequency order is not
# Positive, Neutral, Negative. Fix the bar order explicitly and look each
# colour up by its label; a label with no reviews is drawn as a zero bar.
sentiment_colors = {'Positive': '#4CAF50', 'Neutral': '#FFC107', 'Negative': '#F44336'}
ordered_counts = sentiment_counts.reindex(list(sentiment_colors), fill_value=0)
colors = [sentiment_colors[label] for label in ordered_counts.index]

plt.figure(figsize=(10, 7))
plt.bar(ordered_counts.index, ordered_counts.values, color=colors, edgecolor='black', alpha=0.8)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xlabel('Sentiment', fontsize=14, fontweight='bold')
plt.ylabel('Number of Reviews', fontsize=14, fontweight='bold')
plt.title('Distribution of Sentiment Categories (TextBlob)', fontsize=16, fontweight='bold')
# Write each count just above its bar
for i, count in enumerate(ordered_counts.values):
    plt.text(i, count + 5, str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Shared analyzer instance. It is created once on first use instead of once
# per review, so the analyzer set-up cost is not paid for every row.
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return the shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        analyzer = SentimentIntensityAnalyzer()
        # NOTE(review): VADER appears to look the lexicon up one lowercased
        # token at a time, so these multi-word (and mixed-case) keys probably
        # never match. They are kept unchanged to preserve existing results;
        # confirm, and add a phrase-level rule if the boost is really wanted.
        analyzer.lexicon['worst airline'] = -5.0
        analyzer.lexicon['never fly with Air New Zealand'] = -5.0
        _VADER_ANALYZER = analyzer
    return _VADER_ANALYZER


def get_sentiment_vader(text):
    """Label a piece of text from its VADER compound score.

    Args:
        text: The text to score.

    Returns:
        'Positive' when the compound score is above 0.1, 'Negative' when it
        is below -0.1, and 'Neutral' for anything in between (bounds
        included).
    """
    score = _get_vader_analyzer().polarity_scores(text)['compound']
    if score > 0.1:
        return 'Positive'
    elif score < -0.1:
        return 'Negative'
    else:
        return 'Neutral'


In [None]:
# Label every review with VADER, scoring the combined header + body text
reviews_df['vader_sentiment'] = reviews_df['combined_content'].map(get_sentiment_vader)

reviews_df

In [None]:
# Number of reviews per VADER label (value_counts lists the most frequent label first)
vader_sentiment_counts = reviews_df.loc[:, 'vader_sentiment'].value_counts()
vader_sentiment_counts

In [None]:
# Plot sentiment distribution
# value_counts() orders labels by frequency, so a positional colour list would
# attach the wrong colour to a label whenever the frequency order is not
# Positive, Neutral, Negative. Fix the bar order explicitly and look each
# colour up by its label; a label with no reviews is drawn as a zero bar.
sentiment_colors = {'Positive': '#4CAF50', 'Neutral': '#FFC107', 'Negative': '#F44336'}
ordered_vader_counts = vader_sentiment_counts.reindex(list(sentiment_colors), fill_value=0)
colors = [sentiment_colors[label] for label in ordered_vader_counts.index]

plt.figure(figsize=(10, 7))
plt.bar(ordered_vader_counts.index, ordered_vader_counts.values, color=colors, edgecolor='black', alpha=0.8)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xlabel('Sentiment', fontsize=14, fontweight='bold')
plt.ylabel('Number of Reviews', fontsize=14, fontweight='bold')
plt.title('Distribution of Sentiment Categories (VADER)', fontsize=16, fontweight='bold')
# Write each count just above its bar
for i, count in enumerate(ordered_vader_counts.values):
    plt.text(i, count + 5, str(count), ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Define function to map ratings to sentiment
def map_rating_to_sentiment(rating):
    """Convert a numeric rating into a sentiment label.

    Ratings of 8 and above are 'Positive'; ratings from 4 up to, but not
    including, 8 are 'Neutral'; everything else is 'Negative'. A value that
    fails both comparisons, such as NaN, therefore also ends up 'Negative'.
    """
    # Thresholds are checked from the highest floor down; first match wins.
    for floor, label in ((8, 'Positive'), (4, 'Neutral')):
        if rating >= floor:
            return label
    return 'Negative'
    
# Derive the reference label for each review from its numeric rating
reviews_df['rating_sentiment'] = reviews_df['rating'].map(map_rating_to_sentiment)
reviews_df

In [None]:
# Sample reviews for each sentiment type
def sample_reviews(df, sentiment, n=2, sentiment_col='textblob_sentiment', random_state=None):
    """Draw up to ``n`` random reviews carrying a given sentiment label.

    Args:
        df: DataFrame holding the reviews and their sentiment labels.
        sentiment: Label to select (compared against ``sentiment_col``).
        n: Maximum number of rows to return.
        sentiment_col: Column holding the labels to filter on.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced; ``None`` keeps the draw random.

    Returns:
        A DataFrame with ``min(n, number of matching rows)`` rows. It is
        empty when no row carries the requested label.
    """
    matching = df[df[sentiment_col] == sentiment]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # which happens as soon as a label is rarer than ``n``; cap the request.
    take = min(n, len(matching))
    if take <= 0:
        return matching.iloc[:0]
    return matching.sample(n=take, random_state=random_state)

# Draw two example reviews per TextBlob label, in the order Positive, Neutral, Negative
positive_samples, neutral_samples, negative_samples = (
    sample_reviews(reviews_df, label) for label in ('Positive', 'Neutral', 'Negative')
)

In [None]:
# Stack the per-label samples so they can be inspected together
sample_frames = [positive_samples, neutral_samples, negative_samples]
sampled_reviews = pd.concat(sample_frames)

# Show both tools' labels next to the sampled text
inspection_columns = ['header', 'review_content', 'textblob_sentiment', 'vader_sentiment']
print("Sampled Reviews for Sentiment Analysis:")
print(sampled_reviews[inspection_columns])

# Write the full comparison table to disk for inspection
export_columns = ['header', 'review_content', 'rating', 'rating_sentiment', 'vader_sentiment', 'textblob_sentiment']
reviews_df[export_columns].to_csv('../dataset/sentiment_comparison.csv', index=False)
print("Results saved to 'sentiment_comparison.csv'.")

In [None]:
# Evaluate VADER's labels against the rating-derived labels
sentiment_labels = ['Positive', 'Neutral', 'Negative']
y_true = reviews_df['rating_sentiment']
y_pred_vader = reviews_df['vader_sentiment']

# Passing the label list fixes the row/column order of the matrix and report
conf_matrix_vader = confusion_matrix(y_true, y_pred_vader, labels=sentiment_labels)
class_report_vader = classification_report(y_true, y_pred_vader, labels=sentiment_labels)

print("Confusion Matrix for VADER:", conf_matrix_vader, sep="\n")
print("\nClassification Report for VADER:", class_report_vader, sep="\n")

In [None]:
# Evaluate TextBlob's labels against the rating-derived labels
sentiment_labels = ['Positive', 'Neutral', 'Negative']
y_true = reviews_df['rating_sentiment']
y_pred_textblob = reviews_df['textblob_sentiment']

# Passing the label list fixes the row/column order of the matrix and report
conf_matrix_textblob = confusion_matrix(y_true, y_pred_textblob, labels=sentiment_labels)
class_report_textblob = classification_report(y_true, y_pred_textblob, labels=sentiment_labels)

print("Confusion Matrix for TextBlob:", conf_matrix_textblob, sep="\n")
print("\nClassification Report for TextBlob:", class_report_textblob, sep="\n")

In [None]:
# Save results to CSV for inspection.
# This writes to the same path as the earlier export in this notebook, so the
# column list includes 'header' as that export does; otherwise this later
# write would replace the file with one that has lost the header column.
reviews_df[['header', 'review_content', 'rating', 'rating_sentiment', 'vader_sentiment', 'textblob_sentiment']].to_csv('../dataset/sentiment_comparison.csv', index=False)
print("Results saved to 'sentiment_comparison.csv'.")

# Plot confusion matrices
def plot_confusion_matrix(cm, labels, title):
    """Draw a confusion matrix as an annotated heatmap and show the figure.

    Args:
        cm: Matrix of integer counts to draw (cells are annotated with the
            'd' integer format).
        labels: Tick labels used on both the x and the y axis.
        title: Text placed above the heatmap.
    """
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_xlabel('Predicted Sentiment', fontsize=14, fontweight='bold')
    ax.set_ylabel('True Sentiment', fontsize=14, fontweight='bold')
    ax.set_title(title, fontsize=16, fontweight='bold')
    plt.show()

# One heatmap per tool, VADER first
for matrix_title, tool_matrix in (
    ('Confusion Matrix for VADER', conf_matrix_vader),
    ('Confusion Matrix for TextBlob', conf_matrix_textblob),
):
    plot_confusion_matrix(tool_matrix, labels=['Positive', 'Neutral', 'Negative'], title=matrix_title)

In [None]:
# Calculate accuracies
def calculate_accuracy(df, sentiment_col):
    """Return the fraction of rows where ``sentiment_col`` equals 'rating_sentiment'.

    Args:
        df: DataFrame containing a 'rating_sentiment' column and the column
            named by ``sentiment_col``.
        sentiment_col: Name of the column holding the labels to evaluate.
    """
    reference = df['rating_sentiment']
    predicted = df[sentiment_col]
    agreement = reference == predicted
    # Mean of a boolean Series is the share of True values
    return agreement.mean()

# Score each tool against the rating-derived labels and report to two decimals
vader_accuracy, textblob_accuracy = (
    calculate_accuracy(reviews_df, column) for column in ('vader_sentiment', 'textblob_sentiment')
)

print(f"VADER Accuracy: {vader_accuracy:.2f}")
print(f"TextBlob Accuracy: {textblob_accuracy:.2f}")


In [None]:
# Two-row table of tool name and accuracy, used as the bar chart's data
accuracy_df = pd.DataFrame(
    [('VADER', vader_accuracy), ('TextBlob', textblob_accuracy)],
    columns=['Tool', 'Accuracy'],
)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x='Tool', y='Accuracy', data=accuracy_df, palette='viridis', ax=ax)
ax.set_ylim(0, 1)  # accuracy is a proportion, so show the full 0-1 range
ax.set_xlabel('Sentiment Analysis Tool', fontsize=14, fontweight='bold')
ax.set_ylabel('Accuracy', fontsize=14, fontweight='bold')
ax.set_title('Accuracy of Sentiment Analysis Tools', fontsize=16, fontweight='bold')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Print each accuracy slightly above its bar
for position, accuracy in enumerate(accuracy_df['Accuracy']):
    ax.text(position, accuracy + 0.02, f'{accuracy:.2f}', ha='center', va='bottom', fontsize=12, fontweight='bold')

fig.tight_layout()
plt.show()

In [None]:
# Parse 'date' (expected as YYYY-MM-DD) and keep only the year
review_dates = pd.to_datetime(reviews_df['date'], format='%Y-%m-%d')
reviews_df['year'] = review_dates.dt.year

# Count reviews per (year, VADER label), then pivot the labels into columns;
# year/label combinations with no reviews are filled with 0
yearly_label_counts = reviews_df.groupby(['year', 'vader_sentiment']).size()
sentiment_by_year = yearly_label_counts.unstack(fill_value=0)

sentiment_by_year