In [219]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline

# Fetch the NLTK resources used by this notebook; packages that are already
# installed are simply reported as up to date.
for resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Load the tweet dataset from the working directory.
# NOTE(review): read_csv treats the first row as a header by default — confirm
# that tweets.csv really has a header row, otherwise one tweet is consumed here.
data = pd.read_csv('tweets.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [220]:
# The author reports 1,599,998 usable tweets: one row could not be read, so the
# last row (said to be of the opposite class) is dropped as well to keep the
# classes balanced. Column 5 is used as the tweet text and column 0 as the label.
tweets = data.iloc[:-1, 5].tolist()
labels = data.iloc[:-1, 0].tolist()

# Number of tweets taken from EACH end of the list for training; everything in
# between is held out for testing (the middle 5,000 when there are 1,599,998 tweets).
TRAIN_PER_END = 797_499

# Held-out block from the middle of the list.
test_tweets = tweets[TRAIN_PER_END:-TRAIN_PER_END]
test_labels = labels[TRAIN_PER_END:-TRAIN_PER_END]

# Everything outside the held-out block.
train_tweets = tweets[:TRAIN_PER_END] + tweets[-TRAIN_PER_END:]
train_labels = labels[:TRAIN_PER_END] + labels[-TRAIN_PER_END:]

# Leakage guard: if the dataset is shorter than 2 * TRAIN_PER_END the two end
# slices overlap each other and the sizes no longer add up to the dataset size.
assert len(train_tweets) + len(test_tweets) == len(tweets), (
    'train/test split does not partition the dataset; check TRAIN_PER_END'
)

In [221]:
# Only referenced by the lemmatizing variant of `preprocess`, which is currently disabled.
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    """Lower-case `text`, tokenize it with NLTK, and re-join the tokens with single spaces.

    A variant that kept only alphabetic tokens and lemmatized each of them with
    `lemmatizer` is currently switched off.

    Parameters
    ----------
    text : str
        A single raw tweet.

    Returns
    -------
    str
        The space-separated, lower-cased tokens.
    """
    lowered = text.lower()
    return ' '.join(word_tokenize(lowered))

# TF-IDF features over unigrams and bigrams.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Preprocess the held-out tweets once, up front.
# NOTE(review): this rebinds `test_tweets`, so re-running the cell feeds
# already-preprocessed text through `preprocess` a second time.
test_tweets = list(map(preprocess, test_tweets))


In [222]:
def evaluate_dataset(input_tweets, input_labels, *, eval_tweets=None,
                     eval_labels=None, pos_label=4):
    """Train TF-IDF + logistic regression on the given tweets and score it on a held-out set.

    Parameters
    ----------
    input_tweets : iterable of str
        Raw training tweets; each one is run through `preprocess` before fitting.
    input_labels : sequence
        Labels aligned with `input_tweets`.
    eval_tweets : sequence of str, optional
        Tweets to score on. They are passed to the model as-is, so they must
        already be preprocessed. Defaults to the module-level `test_tweets`.
    eval_labels : sequence, optional
        Labels aligned with `eval_tweets`. Defaults to the module-level
        `test_labels`. Must be supplied together with `eval_tweets`.
    pos_label : optional
        Label treated as the positive class for precision and recall.
        Defaults to 4, the value this notebook has always used.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall)`` on the evaluation set.

    Raises
    ------
    ValueError
        If only one of `eval_tweets` / `eval_labels` is provided.
    """
    # Refuse a half-specified evaluation set: mixing caller-supplied tweets with
    # the default labels (or vice versa) would silently score against the wrong data.
    if (eval_tweets is None) != (eval_labels is None):
        raise ValueError('eval_tweets and eval_labels must be given together')
    if eval_tweets is None:
        eval_tweets, eval_labels = test_tweets, test_labels

    preprocessed_tweets = [preprocess(tweet) for tweet in input_tweets]

    # The module-level `vectorizer` is used directly as a pipeline step, so every
    # call refits that shared object on the current training tweets.
    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', LogisticRegression(max_iter=1000)),
    ])
    pipeline.fit(preprocessed_tweets, input_labels)

    # Predict labels for the evaluation set.
    y_pred = pipeline.predict(eval_tweets)

    # Score the predictions: accuracy overall, precision/recall for `pos_label`.
    accuracy = accuracy_score(eval_labels, y_pred)
    precision = precision_score(eval_labels, y_pred, pos_label=pos_label)
    recall = recall_score(eval_labels, y_pred, pos_label=pos_label)

    return accuracy, precision, recall

# 1,000 Training Tweets

In [223]:
# Take the same window from tweets and labels so the two stay aligned. With the
# 1,594,998-entry training list this selects the 1,000 entries around its midpoint.
subset = slice(796999, -796999)
accuracy, precision, recall = evaluate_dataset(train_tweets[subset], train_labels[subset])

# Report each metric on its own line, to four decimal places.
print(
    f'Accuracy: {accuracy:.4f}',
    f'Precision: {precision:.4f}',
    f'Recall: {recall:.4f}',
    sep='\n',
)

Accuracy: 0.7288
Precision: 0.7631
Recall: 0.6636
