In [5]:
import pandas as pd

# Loading the dataset
my_df = pd.read_csv("Reviews.csv")

# Displaying the first 5 rows of the DataFrame
print(my_df.head())


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [6]:
my_df.shape

(568454, 10)

In [7]:
from sklearn.model_selection import train_test_split

# mapping the ratings to labels
my_df['reviews'] = my_df['Score'].apply(lambda x: 'Negative' if x in [1, 2] else 'Neutral' if x == 3 else 'Positive')

# splitting the dataset into training, validation, and test sets
train_val_my_df, test_my_df = train_test_split(my_df, test_size=0.2, random_state=42)
train_my_df, val_my_df = train_test_split(train_val_my_df, test_size=0.25, random_state=42)

# calculate label counts for each split
train_label_counts = train_my_df['reviews'].value_counts()
val_label_counts = val_my_df['reviews'].value_counts()
test_label_counts = test_my_df['reviews'].value_counts()

# creating table of label counts for each split
counts_table = pd.concat([train_label_counts, val_label_counts, test_label_counts], axis=1, sort=False)
counts_table.columns = ['Train', 'Validation', 'Test']
counts_table.index.name = 'Label'
print(counts_table) 

# Checking label distribution across splits
print('Training Set Label Distribution:', train_label_counts/len(train_my_df))
print('Validation Set Label Distribution:', val_label_counts/len(val_my_df))
print('Test Set Label Distribution:', test_label_counts/len(test_my_df))


           Train  Validation   Test
Label                              
Positive  266240       88512  89025
Negative   49278       16578  16181
Neutral    25554        8601   8485
Training Set Label Distribution: Positive    0.780598
Negative    0.144480
Neutral     0.074923
Name: reviews, dtype: float64
Validation Set Label Distribution: Positive    0.778531
Negative    0.145816
Neutral     0.075652
Name: reviews, dtype: float64
Test Set Label Distribution: Positive    0.783044
Negative    0.142324
Neutral     0.074632
Name: reviews, dtype: float64


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Step 0: Vectorise text
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(my_df['Text'])

# Step 1: Pick k random "centroids"
k = 5
my_kmeans = KMeans(n_clusters=k, init='random', max_iter=100, n_init=1, random_state=42)

# Step 2 and 3: Assign each vector to its closest centroid and recalculate the centroids based on the closest vectors
my_kmeans.fit(X)



def get_top_terms(cluster_centroid, feature_names_out, n=10):
    # Get the top n terms for a given centroid
    sort_terms = cluster_centroid.argsort()[::-1]
    return [feature_names_out[i] for i in sort_terms[:n]]



# Print the cluster assignments and the top terms in each cluster
cluster_labels = my_kmeans.labels_
cluster_centroids = my_kmeans.cluster_centers_



for i in range(k):
    print("Cluster %d:" % i)
    cluster_documents = [doc for j, doc in enumerate(my_df) if cluster_labels[j] == i]
    print("Number of documents:", len(cluster_documents))
    top_term_cluster = get_top_terms(cluster_centroids[i], vectorizer.get_feature_names_out(), n=10)
    print("Top terms:", top_term_cluster)



Cluster 0:
Number of documents: 0
Top terms: ['these', 'they', 'are', 'the', 'them', 'and', 'to', 'of', 'for', 'in']
Cluster 1:
Number of documents: 2
Top terms: ['food', 'she', 'the', 'dog', 'he', 'and', 'to', 'it', 'her', 'my']
Cluster 2:
Number of documents: 0
Top terms: ['coffee', 'the', 'it', 'this', 'is', 'and', 'cup', 'of', 'to', 'for']
Cluster 3:
Number of documents: 1
Top terms: ['br', 'the', 'it', 'and', 'to', 'of', 'is', 'you', 'this', 'in']
Cluster 4:
Number of documents: 8
Top terms: ['the', 'it', 'and', 'this', 'is', 'to', 'of', 'for', 'in', 'tea']


In [9]:
for i in range(k):
    print("Clusters %d:" % i)
    cluster_documents = [doc for j, doc in enumerate(my_df) if cluster_labels[j] == i]
    print("Number of docs:", len(cluster_documents))
    print("Sample docs:")
    for doc in cluster_documents[:5]:
        print(" -", doc)
    top_term_cluster = get_top_terms(cluster_centroids[i], vectorizer.get_feature_names_out(), n=5)
    print("Top terms:", top_term_cluster)


Clusters 0:
Number of docs: 0
Sample docs:
Top terms: ['these', 'they', 'are', 'the', 'them']
Clusters 1:
Number of docs: 2
Sample docs:
 - Id
 - Text
Top terms: ['food', 'she', 'the', 'dog', 'he']
Clusters 2:
Number of docs: 0
Sample docs:
Top terms: ['coffee', 'the', 'it', 'this', 'is']
Clusters 3:
Number of docs: 1
Sample docs:
 - reviews
Top terms: ['br', 'the', 'it', 'and', 'to']
Clusters 4:
Number of docs: 8
Sample docs:
 - ProductId
 - UserId
 - ProfileName
 - HelpfulnessNumerator
 - HelpfulnessDenominator
Top terms: ['the', 'it', 'and', 'this', 'is']


In [10]:
import pandas as pd

# Creating a new DataFrame with cluster assignments and corresponding true labels
my_cluster_df = pd.DataFrame({'Cluster': cluster_labels, 'True_Label': my_df['reviews']})

# Constructing the confusion matrix using pd.crosstab()
my_confusion_matrix = pd.crosstab(my_cluster_df['Cluster'], my_cluster_df['True_Label'])

# Printing the confusion matrix
print(my_confusion_matrix)

True_Label  Negative  Neutral  Positive
Cluster                                
0              13091     7093     91863
1               6642     3125     43623
2               6304     4459     38576
3              11798     7588     53238
4              44202    20375    216477


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Baseline 1: Dummy Classifier with strategy="most_frequent"
dummy_most_freq = DummyClassifier(strategy="most_frequent")
dummy_most_freq.fit(X_train_set, y_train_set)
y_pred_dummy_most_freq = dummy_most_freq.predict(X_test_set)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
print("Baseline 1: Dummy Classifier with strategy=\"most_frequent\"")
print("Accuracy:", accuracy_score(y_test_set, y_pred_dummy_most_freq))
print("Precision:", precision_score(y_test_set, y_pred_dummy_most_freq, average='macro', zero_division=0))
print("Recall:", recall_score(y_test_set, y_pred_dummy_most_freq, average='macro', zero_division=0))
print("F1-score:", f1_score(y_test_set, y_pred_dummy_most_freq, average='macro', zero_division=0))


Baseline 1: Dummy Classifier with strategy="most_frequent"
Accuracy: 0.7830435126791039
Precision: 0.26101450422636796
Recall: 0.3333333333333333
F1-score: 0.2927741273505791


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Baseline 2: Dummy Classifier with strategy="stratified"
dummy_strat = DummyClassifier(strategy="stratified")
dummy_strat.fit(X_train_set, y_train_set)
y_pred_dummy_strat = dummy_strat.predict(X_test_set)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
print("Baseline 2: Dummy Classifier with strategy=\"stratified\"")
print("Accuracy:", accuracy_score(y_test_set, y_pred_dummy_strat))
print("Precision:", precision_score(y_test_set, y_pred_dummy_strat, average='macro'))
print("Recall:", recall_score(y_test_set, y_pred_dummy_strat, average='macro'))
print("F1-score:", f1_score(y_test_set, y_pred_dummy_strat, average='macro'))


Baseline 2: Dummy Classifier with strategy="stratified"
Accuracy: 0.6354768627244021
Precision: 0.3330945732715712
Recall: 0.3331060757768482
F1-score: 0.3330901035831019


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert the 'Text' column to strings
my_df['Text'] = my_df['Text'].astype(str)

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Transform the text data into a one-hot encoded matrix
vectorizer = CountVectorizer(binary=True)
X_train_onehot = vectorizer.fit_transform(X_train_set)
X_test_onehot = vectorizer.transform(X_test_set)

# Baseline 3: Logistic Regression with One-hot vectorization
logreg_onehot = LogisticRegression(max_iter=2000)
logreg_onehot.fit(X_train_onehot, y_train_set)
y_pred_logreg_onehot = logreg_onehot.predict(X_test_onehot)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
print("Baseline 3: Logistic Regression with One-hot vectorization")
print("Accuracy:", accuracy_score(y_test_set, y_pred_logreg_onehot))
print("Precision:", precision_score(y_test_set, y_pred_logreg_onehot, average='macro'))
print("Recall:", recall_score(y_test_set, y_pred_logreg_onehot, average='macro'))
print("F1-score:", f1_score(y_test_set, y_pred_logreg_onehot, average='macro'))


Baseline 3: Logistic Regression with One-hot vectorization
Accuracy: 0.8874932932246176
Precision: 0.7609740991228365
Recall: 0.6880308626013472
F1-score: 0.7156055112479064


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert the 'Text' column to strings
my_df['Text'] = my_df['Text'].astype(str)

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Transform the text data into a TF-IDF matrix
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_set)
X_test_tfidf = vectorizer.transform(X_test_set)

# Baseline 4: Logistic Regression with TF-IDF vectorization
logreg_tfidf = LogisticRegression(max_iter=1000)
logreg_tfidf.fit(X_train_tfidf, y_train_set)
y_pred_logreg_tfidf = logreg_tfidf.predict(X_test_tfidf)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
print("Baseline 4: Logistic Regression with TF-IDF vectorization")
print("Accuracy:", accuracy_score(y_test_set, y_pred_logreg_tfidf))
print("Precision:", precision_score(y_test_set, y_pred_logreg_tfidf, average='macro'))
print("Recall:", recall_score(y_test_set, y_pred_logreg_tfidf, average='macro'))
print("F1-score:", f1_score(y_test_set, y_pred_logreg_tfidf, average='macro'))


Baseline 4: Logistic Regression with TF-IDF vectorization
Accuracy: 0.8820399152087677
Precision: 0.7572256001312428
Recall: 0.6496027982372178
F1-score: 0.6816118772721121


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert the 'Text' column to strings
my_df['Text'] = my_df['Text'].astype(str)

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Transform the text data into a one-hot encoded matrix
vectorizer = CountVectorizer(binary=True)
X_train_onehot = vectorizer.fit_transform(X_train_set)
X_test_onehot = vectorizer.transform(X_test_set)

# Baseline 5: SVC Classifier with One-hot vectorization (SVM with RBF kernel, default settings)
SVC_onehot = SVC()
SVC_onehot.fit(X_train_onehot, y_train_set)
y_pred_SVC_onehot = SVC_onehot.predict(X_test_onehot)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
print("Baseline 5: SVC Classifier with One-hot vectorization (SVM with RBF kernel, default settings)")
print("Accuracy:", accuracy_score(y_test_set, y_pred_SVC_onehot))
print("Precision:", precision_score(y_test_set, y_pred_SVC_onehot, average='macro'))
print("Recall:", recall_score(y_test_set, y_pred_SVC_onehot, average='macro'))
print("F1-score:", f1_score(y_test_set, y_pred_SVC_onehot, average='macro'))


In [9]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# Store the evaluation metrics for each classifier
results = [
    ('Dummy (Most Frequent)', y_test_set, y_pred_dummy_most_freq),
    ('Dummy (Stratified)', y_test_set, y_pred_dummy_strat),
    ('LogReg (One-hot)', y_test_set, y_pred_logreg_onehot),
    ('LogReg (TF-IDF)', y_test_set, y_pred_logreg_tfidf),
    ('SVC (One-hot)', y_test_set, y_pred_SVC_onehot),
]

# Calculate and store the accuracy, precision, recall, and F1-score for each classifier
metrics = []
for classifier_name, y_true, y_pred in results:
    metrics.append(
        (
            classifier_name,
            accuracy_score(y_true, y_pred),
            precision_score(y_true, y_pred, average='macro'),
            recall_score(y_true, y_pred, average='macro'),
            f1_score(y_true, y_pred, average='macro'),
        )
    )

# Create a DataFrame with the evaluation metrics
metrics_df = pd.DataFrame(
    metrics,
    columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1-score'],
)

# Display the evaluation metrics table
print(metrics_df)

# Find the best classifier based on macro F1-score
best_classifier = max(results, key=lambda x: f1_score(x[1], x[2], average='macro'))

# Plot the F1-score for each class of the best classifier
f1_scores = f1_score(best_classifier[1], best_classifier[2], average=None)
class_labels = sorted(list(set(y_test_set)))
plt.bar(class_labels, f1_scores)
plt.xlabel('Class')
plt.ylabel('F1-score')
plt.title(f'F1-score for each class ({best_classifier[0]})')
plt.show()


NameError: name 'y_pred_logreg_onehot' is not defined

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and test sets
X_train_set, X_test_set, y_train_set, y_test_set = train_test_split(my_df['Text'], my_df['reviews'], test_size=0.2, random_state=42)

# Transform the text data into a one-hot encoded matrix
vectorizer = CountVectorizer(binary=True)
X_train_onehot = vectorizer.fit_transform(X_train_set)
X_test_onehot = vectorizer.transform(X_test_set)

# Train a Multinomial Naive Bayes classifier
mnb = MultinomialNB()
mnb.fit(X_train_onehot, y_train_set)

# Evaluate the classifier using accuracy, precision, recall, and F1-score
y_pred_mnb = mnb.predict(X_test_onehot)
print("Multinomial Naive Bayes Classifier with One-hot vectorization")
print("Accuracy:", accuracy_score(y_test_set, y_pred_mnb))
print("Precision:", precision_score(y_test_set, y_pred_mnb, average='macro'))
print("Recall:", recall_score(y_test_set, y_pred_mnb, average='macro'))
print("F1-score:", f1_score(y_test_set, y_pred_mnb, average='macro'))


Multinomial Naive Bayes Classifier with One-hot vectorization
Accuracy: 0.847059133968388
Precision: 0.6609357405634726
Recall: 0.6363340728519842
F1-score: 0.646771994900286


In [21]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define a function to train and evaluate a model with the given parameters
def train_and_evaluate(c, sublinear_tf, max_features, min_df):
    vectorizer = TfidfVectorizer(sublinear_tf=sublinear_tf, max_features=max_features, min_df=min_df)
    X_train_tfidf = vectorizer.fit_transform(X_train_set)
    X_test_tfidf = vectorizer.transform(X_test_set)
    
    logreg_tfidf = LogisticRegression(C=c, max_iter=1000)
    logreg_tfidf.fit(X_train_tfidf, y_train_set)
    y_pred_logreg_tfidf = logreg_tfidf.predict(X_test_tfidf)
    
    accuracy = accuracy_score(y_test_set, y_pred_logreg_tfidf)
    precision = precision_score(y_test_set, y_pred_logreg_tfidf, average='macro',zero_division=0)
    recall = recall_score(y_test_set, y_pred_logreg_tfidf, average='macro')
    f1 = f1_score(y_test_set, y_pred_logreg_tfidf, average='macro')
    
    return accuracy, precision, recall, f1

# Parameter values to try
C_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000]
sublinear_tf_values = [True, False]
max_features_values = [None, 5000, 10000, 20000, 50000]
min_df_values = [1, 5, 10, 20]

# Initialize variables to store the best parameters and scores
best_param = None
best_f1 = 0

# Loop through all parameter combinations
for c in C_values:
    for sublinear_tf in sublinear_tf_values:
        for max_features in max_features_values:
            for min_df in min_df_values:
                accuracy, precision, recall, f1 = train_and_evaluate(c, sublinear_tf, max_features, min_df)
                
                if f1 > best_f1:
                    best_param = (c, sublinear_tf, max_features, min_df)
                    best_f1 = f1

# Print the best parameters and scores
print("Best parameters:", best_param)
print("Best F1-score:", best_f1)


KeyboardInterrupt: 

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Define the vectorizer and fit it to the combined training and validation sets
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_set)

# Define the logistic regression model and fit it to the combined training and validation sets
logreg_tfidf = LogisticRegression()
logreg_tfidf.fit(X_train_tfidf, y_train_set)

# Transform the test set using the vectorizer and evaluate the model
X_test_tfidf = vectorizer.transform(X_test_set)
y_pred_logreg_tfidf = logreg_tfidf.predict(X_test_tfidf)

accuracy = accuracy_score(y_test_set, y_pred_logreg_tfidf)
precision = precision_score(y_test_set, y_pred_logreg_tfidf, average='macro', zero_division=0)
recall = recall_score(y_test_set, y_pred_logreg_tfidf, average='macro')
f1 = f1_score(y_test_set, y_pred_logreg_tfidf, average='macro')
conf_mat = confusion_matrix(y_test_set, y_pred_logreg_tfidf)

# Print the evaluation metrics and confusion matrix
print("Accuracy:", accuracy)
print("Macro-averaged precision:", precision)
print("Macro-averaged recall:", recall)
print("Macro-averaged F1-score:", f1)
print("Confusion matrix:\n", conf_mat)


NameError: name 'X_train_set' is not defined