In [9]:
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

def load_lexicon(filename):
    """Load a tab-separated sentiment lexicon into a dictionary.

    Each non-blank line of the file must contain a term and an integer score
    separated by a single tab character. Surrounding whitespace on both the
    term and the score is stripped.

    Args:
        filename: Path of the lexicon file to read.

    Returns:
        dict mapping each term (str) to its score (int). If a term appears
        more than once, the last occurrence wins.

    Raises:
        ValueError: If a non-blank line does not have exactly two
            tab-separated fields or its score is not an integer. The message
            names the file and the 1-based line number.
    """
    lexicon = {}
    # Decode explicitly as UTF-8 (like load_reviews does) so non-ASCII terms do
    # not depend on the platform's default locale encoding.
    with open(filename, "r", encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            # A blank line (commonly a trailing one) carries no entry; skip it
            # instead of failing on the tuple unpacking below.
            if not line.strip():
                continue
            try:
                word, score = line.split('\t')
                lexicon[word.strip()] = int(score.strip())
            except ValueError as exc:
                raise ValueError(
                    f"{filename}:{line_number}: malformed lexicon line {line!r}"
                ) from exc
    return lexicon

def load_reviews(filename):
    """Read a text file and return its lines as a list of strings.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    silently (``errors='ignore'``). Line terminators are not included in the
    returned strings.

    Args:
        filename: Path of the file to read.

    Returns:
        list of str, one element per line of the file.
    """
    with open(filename, mode="r", encoding='utf-8', errors='ignore') as handle:
        contents = handle.read()
    # splitlines() on the whole text (rather than iterating the file) keeps
    # the exact line-boundary rules of str.splitlines.
    return contents.splitlines()

def calculate_sentiment(review, lexicon):
    """Sum the lexicon scores of the whitespace-separated tokens in a review.

    Tokens are looked up exactly as they appear (case-sensitive, punctuation
    included); tokens missing from the lexicon contribute 0.

    Args:
        review: Text to score.
        lexicon: Mapping from token to numeric score.

    Returns:
        The total score; 0 for an empty review.
    """
    total = 0
    for token in review.split():
        total += lexicon.get(token, 0)
    return total

# Load the word -> score lexicon and the two review files; each line of a
# review file becomes one list element.
lexicon = load_lexicon("AFINN-111.txt")
texts_neg = load_reviews("rt-polarity.neg")
texts_pos = load_reviews("rt-polarity.pos")

# One scalar per review: the sum of the lexicon scores of its tokens.
sentiment_scores_neg = [calculate_sentiment(review, lexicon) for review in texts_neg]
sentiment_scores_pos = [calculate_sentiment(review, lexicon) for review in texts_pos]

# Concatenate negatives first, then positives; labels follow the same order
# (0 = from the .neg file, 1 = from the .pos file).
texts = texts_neg + texts_pos
sentiment_scores = sentiment_scores_neg + sentiment_scores_pos
labels = [0] * len(texts_neg) + [1] * len(texts_pos)

# The lexicon score is the only feature, so reshape it into a single-column
# 2-D array as expected by the estimator's fit/predict.
X = np.array(sentiment_scores).reshape(-1, 1)
y = np.array(labels)

# Hold out 20% of the samples for testing; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Baseline classifier with default hyper-parameters.
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.630098452883263
Confusion Matrix:
 [[702 360]
 [429 642]]


In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Candidate classifiers compared on the single lexicon-score feature.
# The tree ensembles are seeded (same seed as the train/test split) so that
# re-running the notebook reproduces the reported numbers.
svm_model = SVC()
rf_model = RandomForestClassifier(random_state=42)
gb_model = GradientBoostingClassifier(random_state=42)

# Display name -> unfitted estimator. A fresh LogisticRegression is created
# here instead of reusing the global `model`, so this cell does not depend on
# whatever `model` is currently bound to in the kernel (another cell rebinds
# it to a differently configured estimator).
models = {
    'Logistic Regression': LogisticRegression(),
    'SVM': svm_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model
}

def train_evaluate(models, X_train, y_train, X_test, y_test):
    """Fit each estimator and print its test accuracy and confusion matrix.

    Estimators are fitted in place, in the iteration order of ``models``.

    Args:
        models: Mapping from display name to an estimator exposing
            ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        X_test: Features passed to ``predict``.
        y_test: True labels compared against the predictions.

    Returns:
        None. Results are written to standard output only.
    """
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        # Compute both metrics before printing anything for this model.
        score = accuracy_score(y_test, predicted)
        matrix = confusion_matrix(y_test, predicted)

        print(f"{label} Model")
        print("Accuracy:", score)
        print("Confusion Matrix:\n", matrix, '\n')
# Fit every candidate on the lexicon-score training split and print its
# accuracy and confusion matrix on the held-out split.
train_evaluate(models, X_train, y_train, X_test, y_test)

Logistic Regression Model
Accuracy: 0.630098452883263
Confusion Matrix:
 [[702 360]
 [429 642]] 

SVM Model
Accuracy: 0.627754336615096
Confusion Matrix:
 [[784 278]
 [516 555]] 

Random Forest Model
Accuracy: 0.630098452883263
Confusion Matrix:
 [[702 360]
 [429 642]] 

Gradient Boosting Model
Accuracy: 0.630098452883263
Confusion Matrix:
 [[702 360]
 [429 642]] 



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

def preprocess_reviews(texts_neg, texts_pos):
    """Concatenate both review lists and normalise whitespace in each review.

    Leading and trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space.

    Args:
        texts_neg: List of negative review strings (placed first in the result).
        texts_pos: List of positive review strings (placed after the negatives).

    Returns:
        A new list of normalised review strings.
    """
    normalized = []
    for raw in texts_neg + texts_pos:
        tokens = raw.split()
        normalized.append(" ".join(tokens))
    return normalized

# Whitespace-normalised corpus (negatives first, then positives) with labels
# in the same order: 0 = negative file, 1 = positive file.
all_reviews = preprocess_reviews(texts_neg, texts_pos)
labels = [0] * len(texts_neg) + [1] * len(texts_pos)

# NOTE(review): `tfidf` and `X_tfidf` are not used by the searches run in this
# cell; those receive the raw text and vectorise inside each pipeline.
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(all_reviews)

# Each pipeline bundles its own TfidfVectorizer with one classifier, so it can
# be fitted directly on raw review strings.
# NOTE(review): the random-forest and gradient-boosting estimators are created
# without a random_state, so their scores may vary between runs.
model_lr = Pipeline([('tfidf', TfidfVectorizer()), ('lr', LogisticRegression())])
model_svm = Pipeline([('tfidf', TfidfVectorizer()), ('svm', SVC())])
model_rf = Pipeline([('tfidf', TfidfVectorizer()), ('rf', RandomForestClassifier())])
model_gb = Pipeline([('tfidf', TfidfVectorizer()), ('gb', GradientBoostingClassifier())])

# Hyper-parameter grids; keys use the `<step name>__<parameter>` form so they
# address the classifier step inside the corresponding pipeline.
param_grid_lr = {'lr__C': [0.1, 1, 10]}
param_grid_svm = {'svm__C': [0.1, 1, 10]}
param_grid_rf = {'rf__n_estimators': [10, 100, 1000]}
param_grid_gb = {'gb__n_estimators': [100, 200], 'gb__learning_rate': [0.1, 0.01]}

def grid_search_cv(model, param_grid, X, y):
    """Run a 5-fold, accuracy-scored grid search and print the best result.

    Args:
        model: Estimator (here a Pipeline) handed to ``GridSearchCV``.
        param_grid: Parameter grid handed to ``GridSearchCV``.
        X: Training inputs passed to the search's ``fit``.
        y: Training labels passed to the search's ``fit``.

    Returns:
        None. The best parameters and best cross-validation score are written
        to standard output only.
    """
    searcher = GridSearchCV(model, param_grid, scoring='accuracy', cv=5)
    searcher.fit(X, y)

    # Read both results before printing, then emit one line per result.
    report = (
        f"Best Parameters: {searcher.best_params_}",
        f"Best Cross-Validation Score: {searcher.best_score_}",
    )
    for entry in report:
        print(entry)

# Run the same search for every pipeline, in a fixed order, printing a heading
# before each result block.
for heading, pipeline, grid in (
    ("\nLogistic Regression", model_lr, param_grid_lr),
    ("\nSVM", model_svm, param_grid_svm),
    ("\nRandom Forest", model_rf, param_grid_rf),
    ("\nGradient Boosting", model_gb, param_grid_gb),
):
    print(heading)
    grid_search_cv(pipeline, grid, all_reviews, labels)

Logistic Regression
Best Parameters: {'lr__C': 10}
Best Cross-Validation Score: 0.7699308375751723

SVM
Best Parameters: {'svm__C': 10}
Best Cross-Validation Score: 0.777997016419369

Random Forest
Best Parameters: {'rf__n_estimators': 1000}
Best Cross-Validation Score: 0.7105629045579648

Gradient Boosting
Best Parameters: {'gb__learning_rate': 0.1, 'gb__n_estimators': 200}
Best Cross-Validation Score: 0.6804533687985371


In [19]:
# Rebuild the full corpus from the raw (not whitespace-normalised) reviews,
# negatives first, with labels 0 = negative file and 1 = positive file.
all_reviews = texts_neg + texts_pos
labels = [0] * len(texts_neg) + [1] * len(texts_pos)

# Fit the TF-IDF vocabulary and weights on the whole corpus.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(all_reviews)

# Logistic regression with C=10, trained on every review (no held-out split).
# NOTE(review): this rebinds the global `model` that earlier cells bound to the
# lexicon-score classifier; re-running those cells afterwards picks up this one.
model = LogisticRegression(C=10)
model.fit(X_tfidf, labels)

# Show the corpus sizes and the first five reviews of each class.
print('Number of negative reviews:', len(texts_neg))
for review in texts_neg[:5]:
    print('\nNegative Review:', review)

print('\nNumber of positive reviews:', len(texts_pos))
for review in texts_pos[:5]:
    print('\nPositive Review:', review)

# NOTE(review): repeats the assignment at the top of this cell; the value is
# unchanged.
all_reviews = texts_neg + texts_pos

# Predict on the first five reviews of each class. These samples are part of
# the data the model was fitted on, so this is a sanity check rather than an
# evaluation. transform() reuses the fitted vocabulary.
sample_reviews = texts_neg[:5] + texts_pos[:5]
sample_reviews_transformed = tfidf_vectorizer.transform(sample_reviews)
predictions = model.predict(sample_reviews_transformed)


# Map the numeric label back to a readable name (1 -> Positive, else Negative).
for review, prediction in zip(sample_reviews, predictions):
    sentiment = 'Positive' if prediction == 1 else 'Negative'
    print('\nReview:', review)
    print('Predicted Sentiment:', sentiment)

Number of negative reviews: 5331

Negative Review: simplistic , silly and tedious . 

Negative Review: it's so laddish and juvenile , only teenage boys could possibly find it funny . 

Negative Review: exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

Negative Review: [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

Negative Review: a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 

Number of positive reviews: 5331

Positive Review: the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

Positive Review: the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-w