In [1]:
#Import libraries

#basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
plt.style.use('ggplot')

#nlp
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# !pip3 install emoji
import emoji
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('punkt')
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
# nltk.download('averaged_perceptron_tagger')
from nltk.corpus import treebank_chunk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

#ml
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import classification_report

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
import optuna

# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('airlines_reviews.csv')
display(df.head(1), df.shape)


Unnamed: 0,Title,Name,Review Date,Airline,Verified,Reviews,Type of Traveller,Month Flown,Route,Class,Seat Comfort,Staff Service,Food & Beverages,Inflight Entertainment,Value For Money,Overall Rating,Recommended
0,Flight was amazing,Alison Soetantyo,2024-03-01,Singapore Airlines,True,Flight was amazing. The crew onboard this fl...,Solo Leisure,December 2023,Jakarta to Singapore,Business Class,4,4,4,4,4,9,yes


(8100, 17)

### 1. Preprocessing function

In [3]:

# Define the preprocessing function
def preprocess_reviews(text):
    # Convert text to lowercase
    text = text.astype(str).str.lower()
    
    # Remove emojis
    text = text.apply(lambda x: emoji.demojize(x))
    text = text.str.replace(r':[a-z_]+:', ' ', regex=True)
    
    # Remove special characters and numbers
    text = text.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    
    # Tokenization (split the text into sentences)
    sentences = text.apply(lambda x: sent_tokenize(x))
    
    # Flatten list of sentences
    sentences = sentences.explode()
    
    # Tokenize sentences into words
    words = sentences.apply(lambda x: word_tokenize(x))
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = words.apply(lambda x: [word for word in x if word not in stop_words])
    
    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = words.apply(lambda x: [stemmer.stem(word) for word in x])
    
    # Join the words back into a single string
    preprocessed_text = stemmed_words.apply(lambda x: ' '.join(x))
    
    return preprocessed_text

### 2. Random forest hyperparamter optimisation with peanlty on Neutral class misclassification

In [4]:
# defining train and test sets
X = df['Reviews']
y = df['Overall Rating'].apply(lambda x: 0 if x <= 4 else 1 if x <= 6 else 2)

print(y.value_counts())

# Preprocess text data
X_preprocessed = preprocess_reviews(X)

# import preprocessing
# X_preprocessed = preprocessing.preprocess_reviews(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Vectorize preprocessed text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


# Hyperparameter optimization using Optuna
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight={0: 1, 1: 5, 2: 1},
        random_state=42
    )
    
    val_scores = cross_val_score(rf_model, X_train_tfidf, y_train, cv=10, scoring='f1_weighted')
    return val_scores.mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=25)

# Train Random Forest model with best parameters
best_params = study.best_params
rf_model = RandomForestClassifier(**best_params, class_weight={0: 1, 1: 5, 2: 1}, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

# Perform 10-fold cross-validation
val_scores = cross_val_score(rf_model, X_train_tfidf, y_train, cv=10, scoring='f1_weighted')

# Print the mean accuracy and standard deviation
print("Mean val f1 score:", val_scores.mean())
print("Val Standard Deviation:", val_scores.std())

# Predict ratings
y_pred = rf_model.predict(X_test_tfidf)

# Print classification report
print(classification_report(y_test, y_pred))

Overall Rating
2    3915
0    3403
1     782
Name: count, dtype: int64


[I 2024-05-17 18:54:07,286] A new study created in memory with name: no-name-6ca63dfc-8350-41fb-ac3f-75eb73b503e7
[I 2024-05-17 18:54:14,807] Trial 0 finished with value: 0.7495291787347294 and parameters: {'n_estimators': 287, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.7495291787347294.
[I 2024-05-17 18:54:28,271] Trial 1 finished with value: 0.7747697319125008 and parameters: {'n_estimators': 154, 'max_depth': 18, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.7747697319125008.
[I 2024-05-17 18:54:36,189] Trial 2 finished with value: 0.7631674665071061 and parameters: {'n_estimators': 232, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 8}. Best is trial 1 with value: 0.7747697319125008.
[I 2024-05-17 18:54:43,217] Trial 3 finished with value: 0.7469284197764003 and parameters: {'n_estimators': 278, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 1 with value:

Mean val f1 score: 0.7818920549641175
Val Standard Deviation: 0.016002173986005133
              precision    recall  f1-score   support

           0       0.86      0.77      0.81       666
           1       0.28      0.21      0.24       162
           2       0.81      0.92      0.87       792

    accuracy                           0.79      1620
   macro avg       0.65      0.63      0.64      1620
weighted avg       0.78      0.79      0.78      1620



### 3.0 Logistic Regression  hyperparamter optimisation 

In [5]:
# defining train and test sets
X = df['Reviews']
y = df['Overall Rating'].apply(lambda x: 0 if x <= 4 else 1 if x <= 6 else 2)

print(y.value_counts())

# Preprocess text data
X_preprocessed = preprocess_reviews(X)

# import preprocessing
# X_preprocessed = preprocessing.preprocess_reviews(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Vectorize preprocessed text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define objective function for Optuna
def objective(trial):
    # Define hyperparameters to optimize
    penalty = trial.suggest_categorical('penalty', ['l2'])
    C = trial.suggest_loguniform('C', 0.001, 10)  
    max_iter = trial.suggest_int('max_iter', 100, 1000)  
    
    # Define class weights with a higher penalty on class 1
    class_weight = {0: 1, 1: 10, 2: 0.5}
    
    # Train Logistic Regression model with hyperparameters
    lr_model = LogisticRegression(multi_class='ovr', solver='lbfgs', penalty=penalty, C=C, max_iter=max_iter,
                                  class_weight=class_weight, random_state=42)
    val_scores = cross_val_score(lr_model, X_train_tfidf, y_train, cv=5, scoring='f1_weighted')
    
    # Return mean validation accuracy as the objective value to maximize
    return val_scores.mean()

# Optimize hyperparameters using Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params
print(best_params)

# Train Logistic Regression model with best hyperparameters
class_weight = {0: 1, 1: 10, 2: 0.5}  # Penalty on class 1
lr_model = LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight=class_weight, **best_params, random_state=42)
lr_model.fit(X_train_tfidf, y_train)

# Predict ratings
y_pred = lr_model.predict(X_test_tfidf)

# Print classification report
print(classification_report(y_test, y_pred))

Overall Rating
2    3915
0    3403
1     782
Name: count, dtype: int64


[I 2024-05-17 18:59:16,515] A new study created in memory with name: no-name-422801a8-b7e4-4ed7-93c7-f72827e53c63
[I 2024-05-17 18:59:16,639] Trial 0 finished with value: 0.03574985724119638 and parameters: {'penalty': 'l2', 'C': 0.016264392803264763, 'max_iter': 147}. Best is trial 0 with value: 0.03574985724119638.
[I 2024-05-17 18:59:16,876] Trial 1 finished with value: 0.4244202585465282 and parameters: {'penalty': 'l2', 'C': 0.14269539978474027, 'max_iter': 119}. Best is trial 1 with value: 0.4244202585465282.
[I 2024-05-17 18:59:17,001] Trial 2 finished with value: 0.07718483976528194 and parameters: {'penalty': 'l2', 'C': 0.019989051581241108, 'max_iter': 885}. Best is trial 1 with value: 0.4244202585465282.
[I 2024-05-17 18:59:17,461] Trial 3 finished with value: 0.7023213767572968 and parameters: {'penalty': 'l2', 'C': 1.1816642975456038, 'max_iter': 212}. Best is trial 3 with value: 0.7023213767572968.
[I 2024-05-17 18:59:17,762] Trial 4 finished with value: 0.625484112581894

{'penalty': 'l2', 'C': 9.694707804018677, 'max_iter': 734}
              precision    recall  f1-score   support

           0       0.85      0.74      0.79       666
           1       0.20      0.54      0.29       162
           2       0.92      0.69      0.79       792

    accuracy                           0.70      1620
   macro avg       0.66      0.66      0.62      1620
weighted avg       0.82      0.70      0.74      1620

