In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load data
file_path = 'reviews.csv'  
data = pd.read_csv(file_path, delimiter='\t')

In [3]:
data.head()

Unnamed: 0,Name,RatingValue,DatePublished,Review
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...


In [4]:
# Step 1: Binning the Ratings into Sentiment Categories
def bin_ratings(df):
    df['Sentiment'] = df['RatingValue'].apply(lambda x: 0 if x in [1, 2] else (1 if x == 3 else 2))
    return df

In [5]:
data = bin_ratings(data)

In [6]:
data.head()

Unnamed: 0,Name,RatingValue,DatePublished,Review,Sentiment
0,Playa Cabana,4,2020-02-26,I was tasked with finding a spot for a group d...,2
1,Playa Cabana,3,2019-08-04,Went here with my friends and family. I liked ...,1
2,Playa Cabana,3,2019-08-24,Surprisingly good Flautas! They came as 3 roll...,1
3,Playa Cabana,4,2019-06-06,As a Mexican I always crave authentic Mexican ...,2
4,Playa Cabana,5,2020-05-25,Best tacos I've ever had. Both locations are g...,2


In [7]:
data['Number'] = range(1, len(data) + 1)

In [8]:
# Select only the relevant columns: Number, Sentiment, and Review
final_data = data[['Number', 'Sentiment', 'Review']]

In [9]:
print(final_data.head())

   Number  Sentiment                                             Review
0       1          2  I was tasked with finding a spot for a group d...
1       2          1  Went here with my friends and family. I liked ...
2       3          1  Surprisingly good Flautas! They came as 3 roll...
3       4          2  As a Mexican I always crave authentic Mexican ...
4       5          2  Best tacos I've ever had. Both locations are g...


In [10]:
# Step 2: Balancing the Data
# Count the occurrences of each sentiment category
count_pos = len(final_data[final_data['Sentiment'] == 2])
count_neg = len(final_data[final_data['Sentiment'] == 0])
count_neutral = len(final_data[final_data['Sentiment'] == 1])

In [11]:
count_pos,count_neutral,count_neg

(1465, 297, 158)

In [12]:
# Drop excess positive reviews to balance the dataset
data_positive = final_data[final_data['Sentiment'] == 2].sample(n=min(count_neg, count_neutral), random_state=42)
data_negative = final_data[final_data['Sentiment'] == 0]
data_neutral = final_data[final_data['Sentiment'] == 1]

In [13]:
print(data_positive.shape[0])

158


In [14]:
data_positive,data_negative,data_neutral

(      Number  Sentiment                                             Review
 1265    1266          2  Solid. I always get the grilled pork banh mi, ...
 211      212          2  Delicious tacos. I appreciated the 3 for $13 s...
 359      360          2  I live in Chicago and I'm spoiled with delicio...
 701      702          2  Dropped by Sugo on the west end for some late ...
 1137    1138          2  We sat at the patio which has great street vie...
 ...      ...        ...                                                ...
 280      281          2  This place is authentic (in my non mexican opi...
 1405    1406          2  And still and absolute belter of a Chinese, th...
 872      873          2  The food was sooooo amazing! And the service w...
 152      153          2  Just love el trompo\nOur server was nice, and ...
 920      921          2  Encountered this cozy and casual Italian spot ...
 
 [158 rows x 3 columns],
       Number  Sentiment                                     

In [15]:
balanced_data = pd.concat([data_positive, data_negative, data_neutral])

In [16]:
balanced_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 613 entries, 1265 to 1880
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Number     613 non-null    int64 
 1   Sentiment  613 non-null    int64 
 2   Review     613 non-null    object
dtypes: int64(2), object(1)
memory usage: 19.2+ KB


In [17]:
# Step 3: Splitting the Data into Training and Validation Sets
train_data, valid_data = train_test_split(balanced_data, test_size=0.2, random_state=42)

In [18]:
# Save the train and validation sets
train_data.to_csv('train.csv', index=False)
valid_data.to_csv('valid.csv', index=False)

In [19]:
# Step 4: Model Training and Evaluation
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['Review'])
y_train = train_data['Sentiment']


In [20]:
# Train a Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [21]:
# Define the evaluation function
def evaluate(file_name):
    data = pd.read_csv(file_name)
    X_valid = vectorizer.transform(data['Review'])
    y_valid = data['Sentiment']

    y_pred = model.predict(X_valid)

    accuracy = accuracy_score(y_valid, y_pred)
    f1 = f1_score(y_valid, y_pred, average='macro')
    class_f1 = f1_score(y_valid, y_pred, average=None)
    conf_matrix = confusion_matrix(y_valid, y_pred, normalize='true')

     # Print the performance metrics
    print(f"Accuracy: {accuracy}")
    print(f"Average F1 Score (Macro): {f1}")
    print(f"Class-wise F1 Scores:")
    print(f"Negative: {class_f1[0]}")
    print(f"Neutral: {class_f1[1]}")
    print(f"Positive: {class_f1[2]}")
    print("Confusion Matrix (Normalized):")
    print(conf_matrix)


In [22]:
# Call the evaluation function on the validation data
evaluate('valid.csv')

Accuracy: 0.5934959349593496
Average F1 Score (Macro): 0.591087708058036
Class-wise F1 Scores:
Negative: 0.5538461538461539
Neutral: 0.6017699115044248
Positive: 0.6176470588235294
Confusion Matrix (Normalized):
[[0.48648649 0.40540541 0.10810811]
 [0.16666667 0.62962963 0.2037037 ]
 [0.03125    0.3125     0.65625   ]]
