In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from gensim.models import Word2Vec

nltk.download('punkt')

data = pd.read_csv('tweets.csv')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [88]:
tweets = data.iloc[:, 5].tolist()[400000:-400000]
labels = data.iloc[:, 0].tolist()[400000:-400000]

test_tweets = data.iloc[:, 5].tolist()[1595000:]
test_labels = data.iloc[:, 0].tolist()[1595000:]

# Preprocessing with Stemmer and lowercase, Learning embeddings with Word2Vec

In [45]:
# Initialize PorterStemmer
stemmer = PorterStemmer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(word) for word in tokens if word.isalpha()]
    return filtered_tokens

preprocessed_tweets = [preprocess(tweet) for tweet in tweets]

# Train a Word2Vec model
model_w2v = Word2Vec(sentences=preprocessed_tweets, vector_size=100, window=5, min_count=1, workers=4)

# Convert texts to embeddings
def text_to_embedding(text):
    embeddings = [model_w2v.wv[word] for word in text if word in model_w2v.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(100)

embeddings = np.array([text_to_embedding(text) for text in preprocessed_tweets])


# Using embeddings with LogReg

In [46]:
X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

In [47]:
model = LogisticRegression(max_iter=1000)

# Train the model using the training data
model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.74031875


# Using bigrams for embeddings

In [48]:
from gensim.models.phrases import Phrases, Phraser

# Detect and create bigrams
phrases = Phrases(preprocessed_tweets, min_count=5, threshold=10)
bigram = Phraser(phrases)
bigram_tweets = [bigram[tweet] for tweet in preprocessed_tweets]

# Train Word2Vec on the bigrams
model_w2v = Word2Vec(sentences=bigram_tweets, vector_size=100, window=5, min_count=1, workers=4)

# Convert texts to embeddings
def text_to_embedding(text):
    embeddings = [model_w2v.wv[word] for word in text if word in model_w2v.wv]
    return np.mean(embeddings, axis=0) if embeddings else np.zeros(100)

embeddings2 = np.array([text_to_embedding(text) for text in preprocessed_tweets])

# LogReg again

In [49]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    embeddings2, labels, test_size=0.2, random_state=42, stratify=labels
)

In [50]:
model2 = LogisticRegression(max_iter=1000)

# Train the model using the training data
model2.fit(X_train2, y_train2)

# Predict the labels for the test set
y_pred2 = model2.predict(X_test2)

# Evaluate the model's performance
accuracy = accuracy_score(y_test2, y_pred2)
print("Accuracy:", accuracy)

Accuracy: 0.7388125


# Using Bigrams with BoW and LogReg

In [54]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

def preprocess(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [stemmer.stem(word) for word in tokens if word.isalpha()]
    return ' '.join(filtered_tokens)

# Create a CountVectorizer instance for bigrams
vectorizer = CountVectorizer(ngram_range=(2, 2))

# Initialize the LogisticRegression model
logreg = LogisticRegression(max_iter=1000)

# Split the dataset into training and testing sets
# Make sure to preprocess the text data before splitting
preprocessed_tweets = [preprocess(tweet) for tweet in tweets]
X_train, X_test, y_train, y_test = train_test_split(preprocessed_tweets, labels, test_size=0.2, random_state=42)

# Create a pipeline that first creates bigram features and then trains a logistic regression model
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', logreg)
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict labels for the test set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7876


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK resources if not already available
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

# Preprocessing function that tokenizes and lemmatizes the text
def preprocess(text):
    tokens = word_tokenize(text.lower())
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha()]
    return ' '.join(lemmatized_tokens)

preprocessed_tweets = [preprocess(tweet) for tweet in tweets]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(preprocessed_tweets, labels, test_size=0.2, random_state=42)

# Create a TfidfVectorizer instance for bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # using unigrams and bigrams

# Initialize the LogisticRegression model
logreg = LogisticRegression(max_iter=1000)

# Create a pipeline that first creates TF-IDF features and then trains a logistic regression model
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', logreg)
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict labels for the test set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/braedencallaghan/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Accuracy: 0.8234


In [82]:
tweets = data.iloc[:, 5].tolist()[799500:-799500]
labels = data.iloc[:, 0].tolist()[799500:-799500]

# I think this is the ticket for now

In [84]:
import numpy as np


def preprocess(text):
    tokens = word_tokenize(text.lower())
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha()]
    return ' '.join(lemmatized_tokens)

preprocessed_tweets = [preprocess(tweet) for tweet in tweets]



# Create a TfidfVectorizer instance for bigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # using unigrams and bigrams

# Initialize the LogisticRegression model
logreg = LogisticRegression()

# Create a pipeline that first creates TF-IDF features and then trains a logistic regression model
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', logreg)
])

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predict labels for the test set
y_pred = pipeline.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7750
