In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Read the tab-separated reviews file. quoting=3 corresponds to csv.QUOTE_NONE,
# so quote characters inside a review are kept as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [5]:
# Number of rows (reviews) in the dataset.
dataset.shape[0]

1000

In [7]:
# Per-column count of missing values.
dataset.isna().sum()

Review    0
Liked     0
dtype: int64

In [9]:
# Cleaning the texts: keep letters only, lower-case, drop English stop words,
# and Porter-stem what remains. One cleaned string per review goes into `corpus`.
import re
import nltk
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Build the stop-word set and the stemmer once. Previously the set was rebuilt
# for every single word and the stemmer for every review, which is pure
# loop-invariant work.
# NOTE(review): NLTK's English stop-word list is believed to include negations
# such as "not" -- confirm that dropping them is intended for sentiment analysis.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# the cell keeps working if the number of reviews changes.
for raw_review in dataset['Review']:
    # Replace every non-letter with a space, lower-case, split on whitespace.
    review = re.sub('[^a-zA-Z]', ' ', raw_review).lower().split()
    # Drop stop words and stem the remaining tokens.
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    # `review` is left holding the last cleaned text after the loop, as before.
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Show the most recently cleaned review (the last entry appended to corpus).
corpus[-1]

'wast enough life pour salt wound draw time took bring check'

In [15]:

# Bag-of-words features: token counts over a vocabulary capped at 1500 terms.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
bow_counts = cv.fit_transform(corpus)
# Dense array of counts, one row per cleaned review.
X = bow_counts.toarray()
# Target vector: the dataset column at position 1.
y = dataset.iloc[:, 1].values

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [19]:
# Alternatives previously tried in this cell and left disabled:
#   sklearn.svm.SVC()
#   sklearn.naive_bayes.GaussianNB()
#   sklearn.neighbors.KNeighborsClassifier(n_neighbors=5)
#   sklearn.naive_bayes.MultinomialNB()
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default hyperparameters on the
# training split. fit() returns the estimator itself, so the call can be chained.
classifier = LogisticRegression().fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
classifier

     


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [21]:
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X=X_test)

In [23]:
from sklearn import metrics

# Per-class precision, recall, F1 and support on the test split.
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.67      0.78      0.72        97
           1       0.76      0.64      0.69       103

    accuracy                           0.71       200
   macro avg       0.72      0.71      0.71       200
weighted avg       0.72      0.71      0.71       200



In [25]:
# Confusion matrix on the test split: rows are true labels, columns are predictions.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[76 21]
 [37 66]]


In [27]:
# Fraction of test rows whose predicted label matches the true label.
test_accuracy = metrics.accuracy_score(y_test, y_pred)
print(test_accuracy)

0.71


In [75]:
import pandas as pd

# Read the hand-made comments file (columns shown below: comment, label).
comments_path = "gr_cr_comments.csv"
df = pd.read_csv(comments_path)

# Preview the first rows as plain text.
print(df.head())

                             comment label
0  CR works very hard for the class.    cr
1    CR helps everyone during exams.    cr
2    CR is very kind and supportive.    cr
3   CR manages everything very well.    cr
4  CR speaks politely with teachers.    cr


In [77]:
# Single-word lexicon: each token of a comment found in this list adds +1 to
# the score computed by simple_sentiment.
positive_words = [
    "hard", "helps", "helpful", "kind", "supportive", "manages",
    "polite", "great", "good", "friendly", "respectful",
    "studies", "motivates", "excellent", "well", "job", "doing"
]

# Single-word lexicon: each token of a comment found in this list subtracts 1
# from the score. Both "doesn't" and "doesnt" are listed so either spelling matches.
negative_words = [
    "lazy", "late", "angry", "argues", "disappointed", "doesn't",
    "never", "missing", "forget", "forgets", "worst", "boring",
    "problems", "doesnt", "not"
]

# Multi-word phrase lists.
# NOTE(review): no code visible in this notebook reads these two lists;
# simple_sentiment only matches single tokens -- confirm whether phrase
# matching was meant to be wired in.
positive_phrases = ["works very hard", "helps everyone", "studies hard", "great job", "well done"]
negative_phrases = ["doesn't listen", "always comes late", "never completes", "very lazy", "argues often"]


In [79]:
import re

def tokenize(text):
    """Split a comment into lower-case word tokens.

    The text is lower-cased, the typographic apostrophe (U+2019) is turned
    into a plain ASCII apostrophe, and every character that is not a-z, 0-9,
    an apostrophe, or whitespace is replaced by a space before splitting.

    Returns:
        list[str]: the whitespace-separated tokens (empty list for empty text).
    """
    normalized = text.lower().replace("’", "'")
    return re.sub(r"[^a-z0-9'\s]", " ", normalized).split()


In [81]:
def simple_sentiment(comment):
    """Label a comment by counting single-word lexicon hits.

    Every token found in positive_words adds 1 to the score and every token
    found in negative_words subtracts 1. Only individual tokens are matched.

    Returns:
        tuple[str, int]: ("positive" | "negative" | "neutral", score), where
        the label follows the sign of the score and a zero score is "neutral".
    """
    score = 0
    for token in tokenize(comment):
        if token in positive_words:
            score += 1
        if token in negative_words:
            score -= 1

    if score == 0:
        return "neutral", score
    label = "positive" if score > 0 else "negative"
    return label, score

# Quick smoke test: the first sample should come out positive, the second negative.
for sample in (
    "CR works very hard for the class.",
    "GR always comes late to class.",
):
    print(simple_sentiment(sample))


('positive', 1)
('negative', -1)


In [83]:
# Score every comment; each (label, score) tuple is expanded into two columns.
sentiment_results = df["comment"].apply(
    lambda text: pd.Series(simple_sentiment(text))
)
df[["predicted_sentiment", "score"]] = sentiment_results

# Derive a reference sentiment from the label column (cr -> negative, gr -> positive).
# NOTE(review): in the rows printed below, `label` looks like it records who the
# comment is about rather than how it feels (several "cr" comments are
# complimentary) -- confirm this mapping is a valid ground truth.
label_to_sentiment = {"cr": "negative", "gr": "positive"}
df["true_sentiment"] = df["label"].map(label_to_sentiment)

# Write the augmented table to disk, then echo it.
output_path = "gr_cr_comments_with_sentiment.csv"
df.to_csv(output_path, index=False)
print("Saved: gr_cr_comments_with_sentiment.csv")
print(df)


Saved: gr_cr_comments_with_sentiment.csv
                                     comment label predicted_sentiment  score  \
0          CR works very hard for the class.    cr            positive      1   
1            CR helps everyone during exams.    cr            positive      1   
2            CR is very kind and supportive.    cr            positive      2   
3           CR manages everything very well.    cr            positive      2   
4          CR speaks politely with teachers.    cr             neutral      0   
5      CR sometimes forgets important tasks.    cr            negative     -1   
6         CR should be more active in class.    cr             neutral      0   
7                     CR gets angry quickly.    cr            negative     -1   
8           CR is doing a great job overall.    cr            positive      3   
9   CR doesn’t listen to students sometimes.    cr            negative     -1   
10            GR always comes late to class.    gr            negati

In [85]:
# Exact-match scoring: a "neutral" prediction never equals a mapped true label
# ("negative" / "positive"), so it is counted as a miss.
is_match = df["predicted_sentiment"] == df["true_sentiment"]
df["correct"] = is_match
accuracy = df["correct"].mean()
print(f"Accuracy: {accuracy:.2%}")

# Cross-tabulate true labels (rows) against predictions (columns).
confusion = pd.crosstab(
    index=df["true_sentiment"],
    columns=df["predicted_sentiment"],
    rownames=["true"],
    colnames=["pred"],
    dropna=False,
)
print("\nConfusion matrix:\n", confusion)

# List the neutral predictions, if there are any, for manual lexicon review.
neutral_mask = df["predicted_sentiment"] == "neutral"
neutrals = df[neutral_mask]
if len(neutrals) > 0:
    print("\nNeutral predictions (review these to improve lexicon):")
    review_columns = ["comment", "label", "predicted_sentiment", "score"]
    print(neutrals[review_columns])


Accuracy: 40.00%

Confusion matrix:
 pred      negative  neutral  positive
true                                 
negative         3        2         5
positive         5        0         5

Neutral predictions (review these to improve lexicon):
                              comment label predicted_sentiment  score
4   CR speaks politely with teachers.    cr             neutral      0
6  CR should be more active in class.    cr             neutral      0
