In [4]:
# --- File locations used throughout this notebook ---------------------------
# CSV that is loaded into `df`.
INPUT = "processced df.csv"
# CSV written after the preprocessed text column has been added.
OUTPUT = "processced text.csv"

# Sparse document-term matrices (.npz).
TFIDF_MAT = "tfidf_matrix.npz"
BOW_MAT = "bow_matrix.npz"

# Pickled, fitted vectorizers.
COUNT_VEC = "out/count_vectorizer.pkl"
TFIDF_VEC = "out/tfidf_vectorizer.pkl"

# Alternative locations under ../api/feb2024dataset; uncomment to override the
# values above.
# INPUT="../api/feb2024dataset/processed_articles.csv"
# OUTPUT="../api/feb2024dataset/processed_text.csv"
# TFIDF_MAT="../api/feb2024dataset/tfidf_matrix.npz"
# BOW_MAT="../api/feb2024dataset/bow_matrix.npz"
# COUNT_VEC="../api/feb2024dataset/out/count_vectorizer.pkl"
# TFIDF_VEC="../api/feb2024dataset/out/tfidf_vectorizer.pkl"

In [5]:
import re
from pathlib import Path

import nltk
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data this notebook relies on. The 'stopwords' corpus backs the
# stopwords.words('english') lookup used during preprocessing; 'punkt' is NLTK
# tokenizer data (presumably for the imported word_tokenize, which is not
# called in the cells shown here).
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Read the CSV at INPUT into the working DataFrame.
df = pd.read_csv(INPUT)
# Preview the first rows as a quick check of the loaded columns.
df.head()

Unnamed: 0,type_of_material,news_desk,section_name,combined_text,author
0,News,Learning,The Learning Network,This word has appeared in 993 articles on NYTi...,The Learning Network
1,News,Arts & Culture,Movies,"With the victory, the Christopher Nolan biopic...",Kyle Buchanan
2,Quote,Summary,Corrections,"Quotation of the Day for Monday, February 26, ...",
3,News,Corrections,Corrections,"No corrections appeared in print on Monday, Fe...",
4,News,Arts & Culture,Arts,FX premieres a new show set in Japan in the 16...,Shivani Gonzalez


### Text Preprocessing Function: Combined_text

In [7]:
# Load the small English spaCy pipeline. Preprocessing only reads each token's
# text and lemma, so the dependency parser and named-entity recognizer are
# switched off: their output was being computed for every document and then
# discarded, which makes the preprocessing pass needlessly slow.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# English stop words, materialized once as a frozenset. Calling
# stopwords.words('english') inside the token loop rebuilt the whole list for
# every single token and then searched it linearly.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: lowercase, strip every character that is not an ASCII letter or
    whitespace, run the spaCy pipeline ``nlp``, drop tokens whose text is an
    English stop word, and join the remaining lemmas with single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the NaN that
        pandas produces for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned, lemmatized text; ``''`` for non-string input.
    """
    # Missing cells arrive as float NaN and have no .lower(); treat as empty.
    if not isinstance(text, str):
        return ''
    # lowercase text
    text = text.lower()
    # remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # tokenize and lemmatize the text
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in _ENGLISH_STOPWORDS]
    # join the tokens back into a string
    return ' '.join(tokens)

# Run the cleaner over every row of 'combined_text'.
cleaned_text = df['combined_text'].apply(preprocess_text)

# Attach the result as 'preprocessed_text' and discard the raw text column,
# so that from here on `df` carries only the cleaned version of the text.
df = df.assign(preprocessed_text=cleaned_text).drop(columns=['combined_text'])


In [8]:
# Preview the frame after preprocessing ('combined_text' has been replaced by 'preprocessed_text').
df.head()

Unnamed: 0,type_of_material,news_desk,section_name,author,preprocessed_text
0,News,Learning,The Learning Network,The Learning Network,word appear article nytimescom past year use...
1,News,Arts & Culture,Movies,Kyle Buchanan,victory christopher nolan biopic sweep guild p...
2,Quote,Summary,Corrections,,quotation day monday february understand fl...
3,News,Corrections,Corrections,,correction appear print monday feb error co...
4,News,Arts & Culture,Arts,Shivani Gonzalez,fx premiere new show set japan new miniserie s...


In [9]:
# Print the full cleaned text of the row labeled 4 to inspect the preprocessing result.
print(df.loc[4, "preprocessed_text"])

fx premiere new show set japan new miniserie star kate winslet air hbo like still not cut cord selection cable network tv show movie special broadcast week feb march   detail time subject change tv week shogun regime television movie


### Vectorization: Using BoW to Transform Preprocessed Text Data for LDA

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: drop scikit-learn's built-in English stop words and
# ignore terms that occur in fewer than 2 documents or in more than 95% of them.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary from the cleaned text and build the count matrix.
bow_matrix = vectorizer.fit_transform(df['preprocessed_text'])

### Vectorization: Using TF-IDF to Transform Preprocessed Text Data for NMF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: keep at most 1000 features and ignore terms that occur in
# fewer than 2 documents or in more than 95% of them.
tfidf_vect = TfidfVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

# Fit on the cleaned text; the result is a sparse matrix with one row per
# document and one column per retained term.
tfidf_matrix = tfidf_vect.fit_transform(df['preprocessed_text'])


### "tfidf_matrix" is what we will use for topic modeling, and it will also serve as the predictors for topic classification

### To better understand the "tfidf_matrix" 

In [12]:
# Densify only the first five documents so the preview stays small, then label
# the columns with the vocabulary terms learned by the TF-IDF vectorizer.
first_rows = tfidf_matrix[:5]
dense_tfidf = first_rows.todense()
pd.DataFrame(dense_tfidf, columns=tfidf_vect.get_feature_names_out())


Unnamed: 0,abortion,abuse,academy,accident,accord,account,accuse,across,act,action,...,writer,writing,year,yearold,yet,york,young,yourfeedscience,yulia,zelensky
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.230155,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.40839,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Display the shape of the matrix as (number of documents, number of retained terms).
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

Shape of tfidf_matrix: (1000, 1000)


In [14]:
# List every term of one document that carries a positive TF-IDF weight.
doc = 0
feature_names = tfidf_vect.get_feature_names_out()
# Flatten the single sparse row into a 1-D array aligned with feature_names.
row_weights = tfidf_matrix[doc].toarray().flatten()

print(f"Non-zero TF-IDF values in document {doc}:")
for position, weight in enumerate(row_weights):
    if weight > 0:
        print(f"{feature_names[position]}: {weight}")

Non-zero TF-IDF values in document 0:
appear: 0.33369204358556975
article: 0.3843633401735585
day: 0.1390727267593142
include: 0.1643407977009323
michael: 0.21776256267191377
past: 0.36720255224701315
real: 0.17335980805256246
think: 0.18626823639743526
use: 0.15368638573345986
word: 0.609070472184471
year: 0.23015483787537938


### Other Features

How we use the other features, whether for the topic modeling task or for the topic classification task, depends on the nature of these features and the goals of our analysis. 

We will get back to "other features" when we do our topic modeling task. 

We intend to use other features as predictors for our topic classification task, but we need to wait until we obtain the topic labels from our topic modeling results, because we will need to evaluate which "other features" have a meaningful relationship with the topic labels. We may choose only one of them, or a combination of a few of them, based on considerations of model complexity and interpretability.

In [15]:
# Save the dataset with the preprocessed text to OUTPUT; index=False keeps the
# DataFrame's row index out of the written file.
df.to_csv(OUTPUT, index=False)

In [16]:
from scipy import sparse

# Persist both sparse document-term matrices (TF-IDF and bag-of-words) in
# .npz format at their configured paths.
for target_path, matrix in ((TFIDF_MAT, tfidf_matrix), (BOW_MAT, bow_matrix)):
    sparse.save_npz(target_path, matrix)

# Pickling the vectorizer to use in different notebooks

To analyze the different topics visually, we can print the words associated with a topic. To do this, we need access to the vectorizer instance across different Jupyter notebooks.

In [17]:
import pickle

# open(..., 'wb') does not create missing parent folders, so on a fresh
# checkout without the target directory this cell raised FileNotFoundError.
# Create the folder first; this does nothing if it already exists.
Path(COUNT_VEC).parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted CountVectorizer so other notebooks can reload it.
with open(COUNT_VEC, 'wb') as f:
    pickle.dump(vectorizer, f)

In [18]:

# open(..., 'wb') does not create missing parent folders, so make sure the
# target directory exists before writing; this does nothing if it already exists.
Path(TFIDF_VEC).parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted TfidfVectorizer so other notebooks can reload it.
with open(TFIDF_VEC, 'wb') as f:
    pickle.dump(tfidf_vect, f)