In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
# fetch the NLTK resources this notebook requests; NLTK reports "already up-to-date"
# and leaves them alone when they are present locally.
# 'stopwords' supplies the English stop-word list read inside preprocess_text.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# load the dataset CSV (relative path, resolved against the notebook's working directory)
df = pd.read_csv('processced df.csv')
# preview the first rows; the 'combined_text' column is the input to the preprocessing below
df.head()

Unnamed: 0,type_of_material,news_desk,section_name,combined_text,author
0,News,Lifestyle & Leisure,Home & Garden,"Point of view — prescriptive, descriptive and ...",Penelope Green
1,News,Lifestyle & Leisure,Home & Garden,Ten high-end design galleries will offer 30 to...,Marianne Rohrlich
2,News,Business,Business Day,Bankers may be suffering but the decline in de...,Lauren Laughlin
3,News,National,U.S.,Florida scaled back its purchase of sugar comp...,Damien Cave
4,News,Business,Technology,"Take-Two Interactive, the maker of Grand Theft...",Bloomberg News


### Text Preprocessing Function: Combined_text

In [3]:
# load the spaCy model; preprocess_text calls nlp(text) to tokenize and lemmatize each document
# NOTE(review): only token.text and token.lemma_ are read downstream, so the parser/NER
# pipeline components may be unnecessary — consider disabling them to speed up nlp(),
# after confirming the lemmas are unchanged.
nlp = spacy.load('en_core_web_sm')
# preprocessing function
# Lazily-built cache of NLTK's English stop words; stays None until first needed.
_ENGLISH_STOPWORDS = None


def preprocess_text(text):
    """Clean one document and return it as a space-joined string of lemmas.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, run the spaCy pipeline (`nlp`) over the result, and
    keep the lemma of every token whose surface text is not an NLTK English
    stop word.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example the float NaN
        that pandas uses for a missing cell) is treated as an empty document.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; '' for non-string input.
    """
    global _ENGLISH_STOPWORDS

    # Missing cells arrive as float NaN, which has no .lower(); treat them as empty.
    if not isinstance(text, str):
        return ''

    if _ENGLISH_STOPWORDS is None:
        # Build the stop-word collection once. Calling stopwords.words('english')
        # inside the token loop re-evaluated it for every token and tested
        # membership against a list; a frozenset gives constant-time lookups.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

    # lowercase text
    text = text.lower()
    # remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # tokenize and lemmatize the text
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in _ENGLISH_STOPWORDS]
    # join the tokens back into a string
    return ' '.join(tokens)

# apply this function to 'combined_text'
# Guarded on the column's presence so that re-running this cell after
# 'combined_text' has already been dropped is a no-op instead of a KeyError.
if 'combined_text' in df.columns:
    df['preprocessed_text'] = df['combined_text'].apply(preprocess_text)

    # the raw text is no longer needed once the cleaned column exists
    df = df.drop('combined_text', axis=1)
# Now the 'preprocessed_text' column will have the cleaned and preprocessed text.


In [4]:
# preview the frame now that 'combined_text' has been replaced by 'preprocessed_text'
df.head()

Unnamed: 0,type_of_material,news_desk,section_name,author,preprocessed_text
0,News,Lifestyle & Leisure,Home & Garden,Penelope Green,point view prescriptive descriptive definite...
1,News,Lifestyle & Leisure,Home & Garden,Marianne Rohrlich,ten highend design gallery offer percent m...
2,News,Business,Business Day,Lauren Laughlin,banker may suffer decline dealmaking also take...
3,News,National,U.S.,Damien Cave,florida scale back purchase sugar company land...
4,News,Business,Technology,Bloomberg News,taketwo interactive maker grand theft auto acc...


In [5]:
# print the full preprocessed text of the row labelled 4 to inspect the cleaning result
print(df.loc[4, "preprocessed_text"])

taketwo interactive maker grand theft auto accuse backdate option historically low price scheme reward key employee taketwo interactive software maker grand theft auto video game agree pay   million settle lawsuit security exchange commission accuse company backdate stock option video game maker pay   million settle stock option case suit litigation taketwo interactive software inc stock option purchase plan security commodity violation


### Vectorization: Using BoW to Transform Preprocessed Text Data for LDA

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# create a CountVectorizer instance:
#   max_df=0.95          ignore terms that occur in more than 95% of the documents
#   min_df=2             ignore terms that occur in fewer than 2 documents
#   stop_words='english' additionally filter scikit-learn's built-in English stop-word list
vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
# learn the vocabulary from the preprocessed text and build the sparse document-term count matrix
bow_matrix = vectorizer.fit_transform(df['preprocessed_text'])

### Vectorization: Using TF-IDF to Transform Preprocessed Text Data for NMF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# instantiate the TF-IDF vectorizer:
#   max_df=0.95        ignore terms that occur in more than 95% of the documents
#   min_df=2           ignore terms that occur in fewer than 2 documents
#   max_features=1000  cap the vocabulary at 1000 terms
tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)

# learn the vocabulary/IDF weights from the preprocessed text and transform it in one step
tfidf_matrix = tfidf_vect.fit_transform(df['preprocessed_text'])

# tfidf_matrix is a sparse matrix representation of documents
# (one row per document, one column per vocabulary term).


### "tfidf_matrix" is what we will use for topic modeling; it will also serve as the set of predictors for topic classification

### To better understand the "tfidf_matrix" 

In [8]:
# view the first five documents as a dense table, one column per vocabulary term.
# toarray() returns a plain ndarray, whereas todense() returns the legacy
# numpy.matrix type; the displayed DataFrame is the same either way.
dense_tfidf = tfidf_matrix[:5].toarray()
pd.DataFrame(dense_tfidf, columns=tfidf_vect.get_feature_names_out())


Unnamed: 0,abortion,abuse,accident,accord,accuse,across,act,action,actor,actress,...,write,writer,writing,yankee,yankees,year,yearold,yet,york,young
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.159553,0.0,0.0
4,0.0,0.0,0.0,0.0,0.314085,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# display the shape of the matrix: (number of documents, number of vocabulary terms)
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

Shape of tfidf_matrix: (7064, 1000)


In [10]:
# list every vocabulary term that carries a positive TF-IDF weight in one document
doc = 0  # row index of the document to inspect
feature_names = tfidf_vect.get_feature_names_out()
# densify just this one row so its weights line up positionally with feature_names
doc_weights = tfidf_matrix[doc].toarray().flatten()
print(f"Non-zero TF-IDF values in document {doc}:")
for term_idx, weight in enumerate(doc_weights):
    # skip terms that do not occur in this document
    if not weight > 0:
        continue
    print(f"{feature_names[term_idx]}: {weight}")

Non-zero TF-IDF values in document 0:
approach: 0.16513633530413127
book: 0.5886450470756803
design: 0.15932135888761217
elizabeth: 0.3249025991244571
fill: 0.16245129956222856
film: 0.14515339432689328
first: 0.20053530838247816
like: 0.21363509583790666
literature: 0.12646568649139922
page: 0.32564066974573225
point: 0.27680361188442515
room: 0.15589582412759506
title: 0.15648998314298898
view: 0.3083629708335429
watch: 0.13939766322972955


### Other Features

How we use the other features, whether for the topic modeling task or for the topic classification task, depends on the nature of these features and the goals of our analysis.

We will get back to "other features" when we do our topic modeling task. 

We intend to use the other features as predictors for our topic classification task, but we need to wait until we obtain the topic labels from our topic modeling results, because we will need to evaluate which of the "other features" have a meaningful relationship with the topic labels. We may choose only one of them, or a combination of a few of them, based on considerations of model complexity and interpretability.

In [11]:
# save the dataset with the preprocessed text
# (index=False keeps the DataFrame's row index out of the CSV)
df.to_csv('processced text.csv', index=False)

In [12]:
from scipy import sparse

# save both sparse matrices (TF-IDF and bag-of-words) in SciPy's .npz format
# so they can be reloaded later with sparse.load_npz
sparse.save_npz("tfidf_matrix.npz", tfidf_matrix)
sparse.save_npz("bow_matrix.npz", bow_matrix)

# Pickling the vectorizer to use in different notebooks

To analyze the different topics visually, we can print the words associated with each topic. To do this, we need access to the vectorizer instances across different Jupyter notebooks.

In [13]:
import os
import pickle

# open() does not create missing folders, so make sure the output directory
# exists before writing; exist_ok=True makes this safe to re-run.
os.makedirs('out', exist_ok=True)

# Serialize the fitted CountVectorizer so other notebooks can reuse its vocabulary.
# Only unpickle this file from a trusted location: pickle can execute code on load.
with open('out/count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [14]:

import os

# open() does not create missing folders, so make sure the output directory
# exists before writing; exist_ok=True makes this safe to re-run.
os.makedirs('out', exist_ok=True)

# Serialize the fitted TfidfVectorizer so other notebooks can reuse its vocabulary.
# Only unpickle this file from a trusted location: pickle can execute code on load.
with open('out/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vect, f)