### Supervised Machine Learning Models

# select tfidf settings

In [2]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import RandomOverSampler

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

delimiter = ','

# Read the data file
try:
    yelp_data = pd.read_csv('stratified_sample_4.csv', delimiter=delimiter)
except pd.errors.ParserError as e:
    print(f"Error reading the data file: {e}")

X = yelp_data['text']
aspects = ['Food Quality', 'Customer Service', 'Place', 'Menu_and_Pricing', 'Drinks', 'Time']
yelp_data[aspects] = yelp_data[aspects].fillna(0)

# Define different TF-IDF settings to try
tfidf_settings_list = [
    {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False},
    {'max_df': 0.75, 'min_df': 3, 'norm': 'l1', 'sublinear_tf': False},
    {'max_df': 0.9, 'min_df': 1, 'norm': 'l2', 'sublinear_tf': True},
    {'max_df': 0.2, 'norm': 'l2', 'sublinear_tf': True}
]

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier()
}

# Create an empty list to store results
results_list = []

# Number of runs
num_runs = 5

# Perform the process multiple times
for run in range(num_runs):
    print(f"Run {run + 1} of {num_runs}")

    # Iterate over each aspect
    for aspect in aspects:
        y = yelp_data[aspect]

        for tfidf_params in tfidf_settings_list:
            # Create a TF-IDF vectorizer with the specified parameters
            tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

            # Transform the text data
            X_tfidf = tfidf_vectorizer.fit_transform(X)

            for clf_name, clf in classifiers.items():
                # Oversample the minority class using RandomOverSampler
                oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
                X_resampled, y_resampled = oversampler.fit_resample(X_tfidf, y)

                # Split the resampled data into training and testing sets
                X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

                # Combine classes 1 and 0 as the positive class
                y_train_combined = (y_train == 1) | (y_train == 0)
                y_test_combined = (y_test == 1) | (y_test == 0)

                X_train_combined = X_train[y_train_combined]
                y_train_combined = y_train[y_train_combined]
                X_test_combined = X_test[y_test_combined]
                y_test_combined = y_test[y_test_combined]

                # Train the classifier
                clf.fit(X_train_combined, y_train_combined)

                # Calculate AUC-ROC for the aspect by combining classes 1 and 0 as the positive class
                try:
                    roc_auc = roc_auc_score(y_test_combined, clf.predict_proba(X_test_combined)[:, 1])  # Assuming positive class is 1
                except ValueError:
                    roc_auc = 0.0  # Set to 0 if there is only one class in y_test for the current aspect

                # Store the results in the list
                results_list.append({
                    'Aspect': aspect,
                    'TF-IDF Settings': tfidf_params,
                    'Classifier': clf_name,
                    'AUC-ROC': roc_auc
                })

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results_list)

results_df['TF-IDF Settings String'] = results_df['TF-IDF Settings'].apply(str)
mean_auc_roc_by_settings = results_df.groupby(['TF-IDF Settings String'])['AUC-ROC'].mean().reset_index()
mean_auc_roc_by_settings

Run 1 of 5
Run 2 of 5
Run 3 of 5
Run 4 of 5
Run 5 of 5


Unnamed: 0,TF-IDF Settings String,AUC-ROC
0,"{'max_df': 0.2, 'norm': 'l2', 'sublinear_tf': True}",0.785641
1,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",0.802064
2,"{'max_df': 0.75, 'min_df': 3, 'norm': 'l1', 'sublinear_tf': False}",0.781374
3,"{'max_df': 0.9, 'min_df': 1, 'norm': 'l2', 'sublinear_tf': True}",0.789161


In [3]:
from scipy import stats

# Define the AUC-ROC scores for each setting
auc_roc_scores_1 = results_df[results_df['TF-IDF Settings'] == {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}]['AUC-ROC']
auc_roc_scores_2 = results_df[results_df['TF-IDF Settings'] == {'max_df': 0.75, 'min_df': 3, 'norm': 'l1', 'sublinear_tf': False}]['AUC-ROC']
auc_roc_scores_3 = results_df[results_df['TF-IDF Settings'] == {'max_df': 0.9, 'min_df': 1, 'norm': 'l2', 'sublinear_tf': True}]['AUC-ROC']

# Perform paired t-test between auc_roc_scores_1 and auc_roc_scores_2
t_stat_1_2, p_value_1_2 = stats.ttest_rel(auc_roc_scores_1, auc_roc_scores_2, alternative='greater')

# Perform paired t-test between auc_roc_scores_1 and auc_roc_scores_3
t_stat_1_3, p_value_1_3 = stats.ttest_rel(auc_roc_scores_1, auc_roc_scores_3, alternative='greater')

# Perform paired t-test between auc_roc_scores_2 and auc_roc_scores_3
t_stat_2_3, p_value_2_3 = stats.ttest_rel(auc_roc_scores_2, auc_roc_scores_3, alternative='greater')

# Print the t-statistics and p-values
print(f'T-statistic (Setting 1 vs. Setting 2): {t_stat_1_2}, p-value: {p_value_1_2}')
print(f'T-statistic (Setting 1 vs. Setting 3): {t_stat_1_3}, p-value: {p_value_1_3}')
print(f'T-statistic (Setting 2 vs. Setting 3): {t_stat_2_3}, p-value: {p_value_2_3}')

T-statistic (Setting 1 vs. Setting 2): 5.092012885346961, p-value: 9.818216434457722e-07
T-statistic (Setting 1 vs. Setting 3): 3.7550997031833333, p-value: 0.00015435030388943587
T-statistic (Setting 2 vs. Setting 3): -1.4871829923069535, p-value: 0.9297495769358587


Based on t-test, we choose,

{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}

as the tfidf setting.

# select best model for each aspect

In [110]:
import warnings
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import RandomOverSampler

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

delimiter = ','

# Read the data file
try:
    yelp_data = pd.read_csv('stratified_sample_4.csv', delimiter=delimiter)
except pd.errors.ParserError as e:
    print(f"Error reading the data file: {e}")

X = yelp_data['text']
aspects = ['Food Quality', 'Customer Service', 'Place', 'Menu_and_Pricing', 'Drinks', 'Time']
yelp_data[aspects] = yelp_data[aspects].fillna(0)

# Define different TF-IDF settings to try
tfidf_settings_list = [
    {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}
]

# Define classifiers
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Bagging': BaggingClassifier(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier()
}

# Create an empty list to store results
results_list = []

# Number of runs
num_runs = 5

# Perform the process multiple times
for run in range(num_runs):
    print(f"Run {run + 1} of {num_runs}")

    # Iterate over each aspect
    for aspect in aspects:
        y = yelp_data[aspect]

        for tfidf_params in tfidf_settings_list:
            # Create a TF-IDF vectorizer with the specified parameters
            tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

            # Transform the text data
            X_tfidf = tfidf_vectorizer.fit_transform(X)

            for clf_name, clf in classifiers.items():
                # Oversample the minority class using RandomOverSampler
                oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
                X_resampled, y_resampled = oversampler.fit_resample(X_tfidf, y)

                # Split the resampled data into training and testing sets
                X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

                # Combine classes 1 and 0 as the positive class
                y_train_combined = (y_train == 1) | (y_train == 0)
                y_test_combined = (y_test == 1) | (y_test == 0)

                X_train_combined = X_train[y_train_combined]
                y_train_combined = y_train[y_train_combined]
                X_test_combined = X_test[y_test_combined]
                y_test_combined = y_test[y_test_combined]

                # Train the classifier
                clf.fit(X_train_combined, y_train_combined)

                # Calculate AUC-ROC for the aspect by combining classes 1 and 0 as the positive class
                try:
                    roc_auc = roc_auc_score(y_test_combined, clf.predict_proba(X_test_combined)[:, 1])  # Assuming positive class is 1
                except ValueError:
                    roc_auc = 0.0  # Set to 0 if there is only one class in y_test for the current aspect

                # Store the results in the list
                results_list.append({
                    'Aspect': aspect,
                    'TF-IDF Settings': tfidf_params,
                    'Classifier': clf_name,
                    'AUC-ROC': roc_auc
                })

# Create a DataFrame from the list of results
results_df = pd.DataFrame(results_list)


Run 1 of 5
Run 2 of 5
Run 3 of 5
Run 4 of 5
Run 5 of 5


In [111]:
best_auc_roc_per_aspect = results_df.groupby('Aspect')['AUC-ROC'].idxmax()
best_auc_roc_rows = results_df.loc[best_auc_roc_per_aspect]
best_auc_roc_rows

Unnamed: 0,Aspect,TF-IDF Settings,Classifier,AUC-ROC
10,Customer Service,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Random Forest,0.925172
178,Drinks,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Random Forest,0.886782
0,Food Quality,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Logistic Regression,0.885238
28,Menu_and_Pricing,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Gradient Boosting,0.847403
18,Place,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Random Forest,0.796332
42,Time,"{'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}",Random Forest,1.0


In [127]:
from sklearn.metrics import classification_report

# Define a function to display F1 score report
def display_f1_score_report(X, y, tfidf_params, clf_name, clf):
    # Create a TF-IDF vectorizer with the specified parameters
    tfidf_vectorizer = TfidfVectorizer(**tfidf_params)

    # Transform the text data
    X_tfidf = tfidf_vectorizer.fit_transform(X)

    # Oversample the minority class using RandomOverSampler
    oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_tfidf, y)

    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

    # Train the classifier
    clf.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = clf.predict(X_test)

    # Display the classification report
    report = classification_report(y_test, y_pred)
    print(f"Aspect: {aspect}")
    print(f"TF-IDF Settings: {tfidf_params}")
    print(f"Classifier: {clf_name}")
    print(report)
    print("=" * 50)

# Iterate over the specified best models and aspects
best_models = [
    {'Aspect': 'Customer Service', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Random Forest'},
    {'Aspect': 'Drinks', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Random Forest'},
    {'Aspect': 'Food Quality', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Logistic Regression'},
    {'Aspect': 'Menu_and_Pricing', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Gradient Boosting'},
    {'Aspect': 'Place', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Random Forest'},
    {'Aspect': 'Time', 'TF-IDF Settings': {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}, 'Classifier': 'Random Forest'}
]

for best_model in best_models:
    aspect = best_model['Aspect']
    tfidf_params = best_model['TF-IDF Settings']
    clf_name = best_model['Classifier']
    clf = classifiers[clf_name]  # Get the classifier object from the dictionary

    # Call the display_f1_score_report function to display the report
    display_f1_score_report(X, yelp_data[aspect], tfidf_params, clf_name, clf)


Aspect: Customer Service
TF-IDF Settings: {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}
Classifier: Random Forest
              precision    recall  f1-score   support

          -1       0.95      0.98      0.97        61
           0       0.63      0.91      0.75        44
           1       0.91      0.55      0.68        53

    accuracy                           0.82       158
   macro avg       0.83      0.81      0.80       158
weighted avg       0.85      0.82      0.81       158

Aspect: Drinks
TF-IDF Settings: {'max_df': 0.5, 'min_df': 2, 'norm': 'l2', 'sublinear_tf': False}
Classifier: Random Forest
              precision    recall  f1-score   support

          -1       1.00      1.00      1.00        88
           0       0.89      1.00      0.94       116
           1       0.00      0.00      0.00        15

    accuracy                           0.93       219
   macro avg       0.63      0.67      0.65       219
weighted avg       0.87      0.93  

tfidf model

In [130]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler

delimiter = ','

# Read the data file
try:
    yelp_data = pd.read_csv('stratified_sample_4.csv', delimiter=delimiter)
except pd.errors.ParserError as e:
    print(f"Error reading the data file: {e}")

X = yelp_data['text']
aspects = ['Food Quality', 'Customer Service', 'Place', 'Menu_and_Pricing', 'Drinks', 'Time']
yelp_data[aspects] = yelp_data[aspects].fillna(0)

# List of classifiers for ensemble learning
classifiers = [
    ('nb', MultinomialNB()),  # You can add more classifiers here
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('lr', LogisticRegression()),
    ('svm', SVC(probability=True)),
    ('dt', DecisionTreeClassifier())
]

# Iterate over each aspect and build a model
for aspect in aspects:
    y = yelp_data[aspect]

    # Oversample the minority class using RandomOverSampler
    oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X.values.reshape(-1, 1), y)

    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled.flatten(), y_resampled, test_size=0.2, random_state=42)

    # Create a pipeline with TF-IDF vectorizer and the ensemble of classifiers
    text_clf = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', VotingClassifier(estimators=classifiers, voting='soft'))
    ])

    # Train the model
    text_clf.fit(X_train, y_train)

    # Make predictions
    predictions = text_clf.predict(X_test)

    # Evaluate the model
    print(f"\nClassification Report for {aspect}:")
    print(classification_report(y_test, predictions))
    print(f"Accuracy for {aspect}: {accuracy_score(y_test, predictions)}")


Classification Report for Food Quality:
              precision    recall  f1-score   support

          -1       0.89      0.99      0.94        83
           0       1.00      0.12      0.21        25
           1       0.82      0.95      0.88        84

    accuracy                           0.86       192
   macro avg       0.91      0.69      0.68       192
weighted avg       0.88      0.86      0.82       192

Accuracy for Food Quality: 0.859375

Classification Report for Customer Service:
              precision    recall  f1-score   support

          -1       0.90      0.98      0.94        61
           0       0.76      0.89      0.82        44
           1       0.90      0.68      0.77        53

    accuracy                           0.85       158
   macro avg       0.85      0.85      0.84       158
weighted avg       0.86      0.85      0.85       158

Accuracy for Customer Service: 0.8544303797468354

Classification Report for Place:
              precision    recal

incidence matrix

In [131]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import CountVectorizer

delimiter = ','

# Read the data file
try:
    yelp_data = pd.read_csv('stratified_sample_4.csv', delimiter=delimiter)
except pd.errors.ParserError as e:
    print(f"Error reading the data file: {e}")

X = yelp_data['text']
aspects = ['Food Quality', 'Customer Service', 'Place', 'Menu_and_Pricing', 'Drinks', 'Time']
yelp_data[aspects] = yelp_data[aspects].fillna(0)

# List of classifiers for ensemble learning
classifiers = [
    ('nb', MultinomialNB()),  # You can add more classifiers here
    ('rf', RandomForestClassifier(random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)),
    ('lr', LogisticRegression()),
    ('svm', SVC(probability=True)),
    ('dt', DecisionTreeClassifier())
]

# Iterate over each aspect and build a model
for aspect in aspects:
    y = yelp_data[aspect]

    # Oversample the minority class using RandomOverSampler
    oversampler = RandomOverSampler(sampling_strategy='minority', random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X.values.reshape(-1, 1), y)

    # Split the resampled data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X_resampled.flatten(), y_resampled, test_size=0.2, random_state=42)

    # Create a pipeline with CountVectorizer (incidence matrix) and the ensemble of classifiers
    text_clf = Pipeline([
        ('count_vectorizer', CountVectorizer(binary=True)),
        ('clf', VotingClassifier(estimators=classifiers, voting='soft'))
    ])

    # Train the model
    text_clf.fit(X_train, y_train)

    # Make predictions
    predictions = text_clf.predict(X_test)

    # Evaluate the model
    print(f"\nClassification Report for {aspect}:")
    print(classification_report(y_test, predictions))
    print(f"Accuracy for {aspect}: {accuracy_score(y_test, predictions)}")



Classification Report for Food Quality:
              precision    recall  f1-score   support

          -1       0.92      0.99      0.95        83
           0       1.00      0.12      0.21        25
           1       0.84      1.00      0.91        84

    accuracy                           0.88       192
   macro avg       0.92      0.70      0.69       192
weighted avg       0.90      0.88      0.84       192

Accuracy for Food Quality: 0.8802083333333334

Classification Report for Customer Service:
              precision    recall  f1-score   support

          -1       0.95      0.97      0.96        61
           0       0.72      0.95      0.82        44
           1       0.92      0.66      0.77        53

    accuracy                           0.86       158
   macro avg       0.87      0.86      0.85       158
weighted avg       0.88      0.86      0.86       158

Accuracy for Customer Service: 0.8607594936708861

Classification Report for Place:
              precisio

In [133]:
yelp_data = pd.read_csv('final_df_3040_labels.csv', delimiter=delimiter)

In [134]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Assuming yelp_data is your DataFrame
yelp_data = yelp_data.dropna(subset=aspects)  # Drop rows with NaN in any of the aspect columns

# Split your data into training and testing sets
train_data, test_data = train_test_split(yelp_data, test_size=0.2, random_state=42)

# Dictionary to store random classifier accuracies for each aspect
random_classifier_accuracies = {}

for aspect in aspects:
    # Calculate probabilities based on training data
    sentiment_counts = train_data[aspect].value_counts(normalize=True)
    
    # Generating random predictions based on these probabilities for the test set
    random_predictions = np.random.choice(sentiment_counts.index, 
                                          size=len(test_data), 
                                          p=sentiment_counts.values)
    
    # Filter out NaN values from test_data for accurate comparison
    test_aspect_data = test_data[aspect].dropna()

    # Calculating accuracy
    accuracy_random = accuracy_score(test_aspect_data, random_predictions[:len(test_aspect_data)])
    random_classifier_accuracies[aspect] = accuracy_random

# Displaying the accuracies
for aspect, accuracy in random_classifier_accuracies.items():
    print(f"Accuracy of Random Classifier for {aspect}: {accuracy * 100:.2f}%")

Accuracy of Random Classifier for Food Quality: 53.87%
Accuracy of Random Classifier for Customer Service: 39.21%
Accuracy of Random Classifier for Place: 53.05%
Accuracy of Random Classifier for Menu_and_Pricing: 53.87%
Accuracy of Random Classifier for Drinks: 63.10%
Accuracy of Random Classifier for Time: 67.87%
