In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm
import swifter

# Fetch the NLTK resources (WordNet, stopword lists, Punkt tokenizer models)
for resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(resource)

# Read the raw comments export into a DataFrame
comments_df = pd.read_csv('nyt-comments-2020.csv', low_memory=False)

# Preprocessing function

# Compiled once: preprocess() is applied to every comment in the dataset.
_URL_PATTERN = re.compile(r'http\S+')
# Anything that is not an ASCII letter or whitespace (covers punctuation and digits).
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z\s]')

# NLTK helpers are created lazily on the first call that needs them and then
# reused, instead of being rebuilt for every single comment.
_STOP_WORDS = None
_STEMMER = None


def preprocess(text):
    """Turn one raw comment into a space-separated string of stemmed tokens.

    Pipeline: lowercase -> strip URLs -> drop every character that is not an
    ASCII letter or whitespace -> tokenize -> remove English stopwords ->
    Porter-stem the remaining tokens.

    Parameters
    ----------
    text : str
        Raw comment body. Any non-string value (e.g. NaN or None coming from
        an empty CSV cell) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned comment; '' when the input is not a string or nothing
        alphabetic is left after cleaning.
    """
    global _STOP_WORDS, _STEMMER

    # Missing values read from CSV arrive as float NaN / None; calling
    # .lower() on them would raise AttributeError.
    if not isinstance(text, str):
        return ''

    # Lowercase first, then remove URLs, then everything non-alphabetic.
    text = _URL_PATTERN.sub('', text.lower())
    text = _NON_ALPHA_PATTERN.sub('', text)

    # Nothing left to tokenize: same result as tokenizing an empty string.
    if not text.strip():
        return ''

    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _STEMMER is None:
        _STEMMER = PorterStemmer()

    # NOTE(review): apostrophes are stripped before stopword filtering, so a
    # contraction such as "don't" reaches this point as "dont" and will not
    # match a stopword spelled with an apostrophe — confirm this is intended.
    tokens = word_tokenize(text)
    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS
    )

# Clean every comment body with preprocess(), going through Swifter's apply for faster processing
comment_bodies = comments_df['commentBody']
comments_df['cleaned_comment'] = comment_bodies.swifter.apply(preprocess)
# Write the frame, now carrying the cleaned_comment column, to a CSV file
comments_df.to_csv('cleaned_data.csv', index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     /N/u/pamal/Carbonate/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /N/u/pamal/Carbonate/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /N/u/pamal/Carbonate/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Pandas Apply:   0%|          | 0/4986461 [00:00<?, ?it/s]

In [20]:

comments_df = pd.read_csv('cleaned_data.csv', low_memory=False)

# Keep only the text and the label, treat empty comments as missing, and
# remove rows with missing values. Written as one non-inplace chain: calling
# replace(..., inplace=True) on a column pulled out of a sliced frame is a
# chained assignment that pandas warns about and that may not update the frame.
comments_df = (
    comments_df[['cleaned_comment', 'editorsSelection']]
    .replace({'cleaned_comment': {'': np.nan}})
    .dropna(axis=0)
)
comments_df.shape


(4985131, 2)

In [22]:
# Create a bag of words matrix using CountVectorizer with a minimum document frequency of 2.5%

# min_df is passed as an absolute document count; clamp it to at least 1 so a
# small frame does not produce min_df=0.
min_doc_count = max(1, int(0.025 * len(comments_df)))
vectorizer = CountVectorizer(min_df=min_doc_count)
bag_of_words_matrix = vectorizer.fit_transform(tqdm(comments_df['cleaned_comment']))

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
get_feature_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
bag_of_words_df = pd.DataFrame.sparse.from_spmatrix(bag_of_words_matrix, columns=get_feature_names())

# Save bag of words matrix to CSV file
bag_of_words_df.to_csv('bag_of_words.csv', index=False)
bag_of_words_df.shape

100%|██████████| 4985131/4985131 [02:19<00:00, 35615.49it/s]


(4985131, 208)

(2983, 24)