In [1]:
import spacy
import re
from nltk import word_tokenize

def matching_curly_braces(text):
    """return True if text has matching curly braces
    in the right order. Otherwise, returns False."""
    stack = []
    for c in text:
        if c == '{':
            stack.append(c)
        elif c == '}':
            if not stack or (c == '}' and stack[-1] != '{'):
                break
            stack.pop()
    return not stack


def remove_curly_braces(text):
    """remove all characters enclosed in curly braces from text"""
    left_bracket = []
    right_bracket = []
    stack = []
    for i, c in enumerate(text):
        if c == '{':  # c is curly brace
            stack.append(c)
            left_bracket.append(i)
        elif c == '}':           # c is closed curly brace
            # string is not valid
            if not stack or \
                (c == '}' and stack[-1] != '{'):
                break
            stack.pop() # pop open curly brace
            right_bracket.append(i+1)
    for _ in range(len(left_bracket)):
        shift = 0
        left_index = 0
        right_index = 0
        for left_index in range(len(left_bracket)):
            if left_bracket[left_index] > right_bracket[right_index]:
                left_index -= 1
                break
        text = text[0:left_bracket[left_index]] + text[right_bracket[right_index]:]
        shift = right_bracket[right_index] - left_bracket[left_index]
        for n in range(len(left_bracket)):
            if right_bracket[n] > right_bracket[right_index]:
                right_bracket[n] -= shift 
            if left_bracket[n] > left_bracket[left_index]:
                left_bracket[n] -= shift 
        left_bracket.pop(left_index)
        right_bracket.pop(right_index)
    return text

def remove_smart_quotes(text):
    """remove quotes in text"""
    return text.replace("“", "\"").replace("”","\"")

def clean_data(input_name, output_name):
    # read input file
    if input_name != "data.txt":
        with open(input_name, "r", encoding="utf8") as input_file:
            text = input_file.readlines()
            alt_text = ""
            for line in text:
                alt_text += line + "\n"
            if matching_curly_braces(alt_text):
                # remove text enclosed in curly_braces to remove random html code
                alt_text = remove_curly_braces(alt_text)
                text = alt_text
    else:
        with open(input_name, "r") as input_file:
            text = input_file.readlines()
            alt_text = ""
            for line in text:
                alt_text += line + "\n"
            if matching_curly_braces(alt_text):
                # remove text enclosed in curly_braces to remove random html code
                alt_text = remove_curly_braces(alt_text)
                text = alt_text

    text = re.sub("U.S.", "United States ", text)
    text = re.sub("p.m.", "pm ", text)
    text = re.sub("a.m.", "am ", text)
    text = re.sub("E.A.T.", "eat ", text)
    text = re.sub("-", " ", text)
    text = re.sub(r'([A-Za-z])(\d)', r'\1 \2', text)
    text = re.sub(r'(\d)([A-Za-z])', r'\1 \2', text)
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    nlp = spacy.load('en_core_web_sm')

    # split text into lines based on if it has a period, question, exclamation, or newline character
    text = re.split(r'[.?!]|\\n', text)

    # set list of stopwords
    stopwords = nlp.Defaults.stop_words
    html_stopwords = ["var","https", "csrftoken", "userdata", "csmllty54qx20erutnfcgs839jd2y", "const", "saml","getitem", "firebaseat", "firebaseapp", "json"]
    stopwords.update(html_stopwords)
    # write to output file
    with open(output_name, "w") as output_file:
        for line in text:
            # remove punctuation and stopwords from line
            tokenized_line = [word.lower() for word in word_tokenize(line) if word.isalnum() and word.lower() not in stopwords]
            newline = ""
            for word in tokenized_line:
                newline += word + " "
            if newline.strip():
                output_file.write(newline.strip() + "\n")

# clean_data("all_data.txt", "all_cleaned_data.txt")
clean_data("data.txt", "cleaned_data.txt")
# clean_data("test.txt", "cleaned_test.txt")
# clean_data("data2.txt", "cleaned_data2.txt")

In [2]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.model_selection import train_test_split
sentences = []  # this column has all of the sentences in the dataset
categories = []  # this column will have a 1 if it contains a question word, 0 otherwise
labels = [0, 1]
with open("cleaned_data.txt", "r") as file:
    sentences = file.readlines()
for sent in sentences:
    categories.append(0)
    for word in ["school", "student", "graduate", "learn", "class", "university"]:
        if word in sent:
            categories.pop()
            categories.append(1)
            break
sentences_train, sentences_test, sentiment_train, sentiment_test = train_test_split(sentences,
categories,test_size = 0.20,random_state = 42)

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# initialize TfidfVectorizer to create the tfIdf representation of the corpus
# the parameters are: min_df -- the percentage of documents that the word has
# to occur in to be considered, the tokenizer to use, and the maximum
# number of words to consider (max_features)
max_tokens = 10
vectorizer = TfidfVectorizer(min_df = .1,tokenizer = nltk.word_tokenize,max_features = max_tokens)

# fit and transform the text into tfidf format, using training text
# here is where we build the tfidf representation of the training data
sentences_train_tfidf = vectorizer.fit_transform(sentences_train)

# the feature names are the words (tokens) in the dataset
tfidf_tokens = vectorizer.get_feature_names_out()

final_vectors = pd.DataFrame(
    data = sentences_train_tfidf.toarray(), 
    columns = tfidf_tokens)

final_vectors.to_csv("tf_idf_vectors.csv")



In [4]:
# train the naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
# Initialize the classifier and train it
classifier = MultinomialNB()
classifier.fit(sentences_train_tfidf, sentiment_train)

# find accuracy based on test set
sentences_test_tfidf = vectorizer.fit_transform(sentences_test)
# for each document in the test data, use the classifier to predict whether its sentiment is positive or negative
sentiment_pred = classifier.predict(sentences_test_tfidf)
score = sklearn.metrics.accuracy_score(sentiment_test,sentiment_pred)

# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(sentiment_test,sentiment_pred,normalize=None)
print(conf_matrix)
with open("naive_bayes_classification.txt", "w") as output_file:
    output_file.write(str(score))
    output_file.write(str(conf_matrix))

[[14 59]
 [ 5 71]]




In [5]:
# SVM classification
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# the directory root will be wherever the movie review data is located
# modify the path as appropriate for your system
# directory_root = "./movie_reviews/"
# movie_reviews = load_files(directory_root,
# encoding='utf-8',decode_error="replace")
# count the number of reviews in each category
labels, counts = np.unique(categories,
return_counts=True)
print(labels)
# convert review_data.target_names to np array
labels_str = np.array(labels)[labels]
print(dict(zip(labels_str, counts)))

from sklearn.model_selection import train_test_split
sentences_train, sentences_test, sentiment_train, sentiment_test = train_test_split(sentences,
categories, test_size = 0.20, random_state = 42)

[0 1]
{0: 365, 1: 378}


In [6]:
# We will work with a TF_IDF representation, as before
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Use the Pipeline function to construct a sequence of two processes
# to run, one after the other -- the vectorizer and the classifier
svc_tfidf = Pipeline([
("tfidf_vectorizer", TfidfVectorizer(
stop_words = "english", max_features=1000)),
("linear svc", SVC(kernel="linear"))
])

model = svc_tfidf
model.fit(sentences_train, sentiment_train)
sentiment_pred = model.predict(sentences_test)
accuracy_result = accuracy_score( sentiment_test,
sentiment_pred)
print(accuracy_result)
# View the results as a confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(sentiment_test,
sentiment_pred,normalize=None)
print(conf_matrix)
with open("svc_classification.txt", "w") as output_file:
    output_file.write(str(accuracy_result))
    output_file.write(str(conf_matrix))

0.9194630872483222
[[65  8]
 [ 4 72]]
