# sentiment analysis on text data using various NLP techniques and machine learning models. It encompasses data loading, text preprocessing, and the application of different models, including logistic regression and linear SVM, with variations in feature extraction methods such as CountVectorizer, TF-IDF, and n-grams. The code evaluates model performance on validation sets and provides the final accuracy on a test set

In [1]:
import numpy as np
import pandas as pd
import os , re
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
english_stop_words = stopwords.words('english')

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer



In [2]:
reviews_train = []
for line in open(r'sentiment_analytic/train.txt', 'r',encoding = 'utf-8'):
    
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open(r'sentiment_analytic/test.txt', 'r',encoding = 'utf-8'):
    
    reviews_test.append(line.strip())


# Create target variable with equal distribution of positive (1) and negative (0) samples
target = [1] * (len(reviews_train) // 2) + [0] * (len(reviews_train) // 2)


In [3]:
REPLACE_NO_SPACE = re.compile("(\.)|(\;)|(\:)|(\!)|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])|(\d+)")
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
NO_SPACE = ""
SPACE = " "

def preprocess_reviews(reviews):
    
    reviews = [REPLACE_NO_SPACE.sub(NO_SPACE, line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(SPACE, line) for line in reviews]
    
    return reviews

reviews_train_clean = preprocess_reviews(reviews_train)
reviews_test_clean = preprocess_reviews(reviews_test)



# target = [1] * len(reviews_train_clean) // 2 + [0] * len(reviews_train_clean) // 2


# baseline

In [4]:


 
baseline_vectorizer = CountVectorizer(binary=True)
X_baseline = baseline_vectorizer.fit_transform(reviews_train_clean)

X_train, X_val, y_train, y_val = train_test_split(X_baseline, target, train_size=0.75, random_state=42)


for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    accuracy = accuracy_score(y_val, lr.predict(X_val))
    print(f"Accuracy for C={c}: {accuracy}")


Accuracy for C=0.01: 0.5096153846153846
Accuracy for C=0.05: 0.5288461538461539
Accuracy for C=0.25: 0.5192307692307693
Accuracy for C=0.5: 0.5096153846153846
Accuracy for C=1: 0.5


In [9]:
final_model = LogisticRegression(C=0.05)
final_model.fit(X_baseline, target)



# Vectorize the test data using the same vectorizer
X_test_baseline = baseline_vectorizer.transform(reviews_test_clean)

# Ensure that the number of samples in X_test_baseline matches the length of target
if X_test_baseline.shape[0] != len(target):
    print("Number of samples in test data and target do not match.")
    # Handle the inconsistency, such as re-checking your test data.

# Continue with your predictions
y_pred_test = final_model.predict(X_test_baseline)

# Ensure that the length of target and predictions match
if len(target) != len(y_pred_test):
    print("Number of samples in target and predictions do not match.")
    # Handle the inconsistency, such as re-checking your data or predictions.
else:
    # Continue with accuracy calculation
    print("Final Accuracy:", accuracy_score(target, y_pred_test))




Number of samples in test data and target do not match.
Number of samples in target and predictions do not match.


# remove Stop Words

In [6]:
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split()  if word not in english_stop_words]))
    return removed_stop_words

no_stop_words_train = remove_stop_words(reviews_train_clean)
no_stop_words_test = remove_stop_words(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(no_stop_words_train)
X = cv.transform(no_stop_words_train)
X_test = cv.transform(no_stop_words_test)

X_train, X_val, y_train, y_val = train_test_split( X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.4326923076923077
Accuracy for C=0.05: 0.4423076923076923
Accuracy for C=0.25: 0.46153846153846156
Accuracy for C=0.5: 0.46153846153846156
Accuracy for C=1: 0.4519230769230769


# stemming

In [7]:
def get_stemmed_text(corpus):
    stemmer = PorterStemmer()

    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

stemmed_reviews_train = get_stemmed_text(reviews_train_clean)
stemmed_reviews_test = get_stemmed_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(stemmed_reviews_train)
X = cv.transform(stemmed_reviews_train)
X_test = cv.transform(stemmed_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))   


Accuracy for C=0.01: 0.5
Accuracy for C=0.05: 0.5096153846153846
Accuracy for C=0.25: 0.5288461538461539
Accuracy for C=0.5: 0.5192307692307693
Accuracy for C=1: 0.5096153846153846


In [8]:
final_stemmed = LogisticRegression(C=0.05)
final_stemmed.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_stemmed.predict(X_test)))

ValueError: Found input variables with inconsistent numbers of samples: [414, 104]

# lemmatization

In [10]:
def get_lemmatized_text(corpus):
    
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

lemmatized_reviews_train = get_lemmatized_text(reviews_train_clean)
lemmatized_reviews_test = get_lemmatized_text(reviews_test_clean)

cv = CountVectorizer(binary=True)
cv.fit(lemmatized_reviews_train)
X = cv.transform(lemmatized_reviews_train)
X_test = cv.transform(lemmatized_reviews_test)

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))
    

Accuracy for C=0.01: 0.5288461538461539
Accuracy for C=0.05: 0.49038461538461536
Accuracy for C=0.25: 0.47115384615384615
Accuracy for C=0.5: 0.49038461538461536
Accuracy for C=1: 0.4807692307692308


In [11]:
final_lemmatized = LogisticRegression(C=0.25)
final_lemmatized.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_lemmatized.predict(X_test)))

ValueError: Found input variables with inconsistent numbers of samples: [414, 104]

# TF-IDF

In [None]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(reviews_train_clean)
X = tfidf_vectorizer.transform(reviews_train_clean)
X_test = tfidf_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, lr.predict(X_val))))


Accuracy for C=0.01: 0.79488
Accuracy for C=0.05: 0.83584
Accuracy for C=0.25: 0.87568
Accuracy for C=0.5: 0.8856
Accuracy for C=1: 0.89328


In [None]:
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_tfidf.predict(X_test)))

Final Accuracy: 0.88204


# Support Vector Machines (SVM)

In [1]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(reviews_train_clean)
X = ngram_vectorizer.transform(reviews_train_clean)
X_test = ngram_vectorizer.transform(reviews_test_clean)

X_train, X_val, y_train, y_val = train_test_split( X, target, train_size = 0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(X_train, y_train)
    print ("Accuracy for C=%s: %s" 
           % (c, accuracy_score(y_val, svm.predict(X_val))))
    

NameError: name 'CountVectorizer' is not defined

In [None]:
final_svm_ngram = LinearSVC(C=0.01)
final_svm_ngram.fit(X, target)
print ("Final Accuracy: %s" 
       % accuracy_score(target, final_svm_ngram.predict(X_test)))

Final Accuracy: 0.8974
