In [1]:
import pandas as pd

In [16]:
# cols = ['sample_text','sentiment_analysis']

In [28]:
df = pd.read_csv(
    "data.csv",
    header=None,
    engine="python",
    names=['review','sentiment'],
    on_bad_lines="skip"
)

In [29]:
df.head()

Unnamed: 0,review,sentiment
0,"Yes, I am just going to tell you about this on...",negative
1,I hope that Matt Dorff's original script for t...,negative
2,I think they really let the quality of the DVD...,negative
3,Oliver Stone is not one to shy away from a mov...,positive
4,Oh my god! The Beeb hit a new low with this gu...,negative


In [31]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,22638
negative,22600


In [32]:
df['sentiment'] = df['sentiment'].map({'positive':1,'negative':0})

In [33]:
df.head()

Unnamed: 0,review,sentiment
0,"Yes, I am just going to tell you about this on...",0
1,I hope that Matt Dorff's original script for t...,0
2,I think they really let the quality of the DVD...,0
3,Oliver Stone is not one to shy away from a mov...,1
4,Oh my god! The Beeb hit a new low with this gu...,0


In [34]:
x = df['review']
y = df['sentiment']

In [37]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2,stratify=y)

In [45]:
x_train.isnull().sum()

np.int64(0)

In [50]:
#apply tf--idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),
    min_df=2,          # reduce noise
    max_df=0.9,
    sublinear_tf=True  # dampen repeated positives
)

x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)

In [51]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
model.fit(x_train_tfidf,y_train)

In [52]:
from sklearn.metrics import classification_report,roc_auc_score
y_pred = model.predict(x_test_tfidf)
y_prob = model.predict_proba(x_test_tfidf)[:,1]

In [53]:
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       0.92      0.90      0.91      4520
           1       0.90      0.92      0.91      4528

    accuracy                           0.91      9048
   macro avg       0.91      0.91      0.91      9048
weighted avg       0.91      0.91      0.91      9048

ROC-AUC: 0.9698461783514181


In [54]:
test_reviews = [
    "The movie was absolutely fantastic, every scene was engaging and well acted.",
    "The movie was not good at all.",
    "I did not hate this movie.",
    "The story was interesting but the acting was disappointing.",
    "I fell asleep twice, but sure, it was a masterpiece."
]

test_vec = tfidf.transform(test_reviews)
preds = model.predict(test_vec)
probs = model.predict_proba(test_vec)[:, 1]

for review, pred, prob in zip(test_reviews, preds, probs):
    print(f"\nReview: {review}")
    print(f"Prediction: {'Positive' if pred == 1 else 'Negative'}")
    print(f"Confidence: {prob:.2f}")




Review: The movie was absolutely fantastic, every scene was engaging and well acted.
Prediction: Positive
Confidence: 0.79

Review: The movie was not good at all.
Prediction: Negative
Confidence: 0.04

Review: I did not hate this movie.
Prediction: Negative
Confidence: 0.17

Review: The story was interesting but the acting was disappointing.
Prediction: Negative
Confidence: 0.05

Review: I fell asleep twice, but sure, it was a masterpiece.
Prediction: Positive
Confidence: 0.51
