In [None]:
# File locations used by this notebook. All artifacts live under ../api;
# the earlier notebook-local file names have been retired.
INPUT = "../api/processed_articles.csv"          # input dataset (CSV)
OUTPUT = "../api/processed_text.csv"             # dataset with the preprocessed text
TFIDF_MAT = "../api/tfidf_matrix.npz"            # sparse TF-IDF matrix
BOW_MAT = "../api/bow_matrix.npz"                # sparse bag-of-words matrix
COUNT_VEC = "../api/out/count_vectorizer.pkl"    # pickled CountVectorizer
TFIDF_VEC = "../api/out/tfidf_vectorizer.pkl"    # pickled TfidfVectorizer

In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
# Fetch the NLTK resources this notebook relies on (tokenizer models, stop-word lists).
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)


In [None]:
# Load the dataset from the configured INPUT path. The previous hard-coded,
# notebook-local file name bypassed the configuration cell entirely.
df = pd.read_csv(INPUT)
# Preview the first rows to confirm the load.
df.head()

### Text Preprocessing Function: Combined_text

In [None]:
# load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Build the English stop-word lookup once. Calling stopwords.words('english')
# for every token rebuilds the whole list each time and then scans it linearly;
# a frozenset gives the same membership answers with constant-time lookups.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


# preprocessing function
def preprocess_text(text):
    """Normalize one document for vectorization.

    Steps: lowercase the text, delete every character that is not an ASCII
    letter or whitespace, run the spaCy pipeline, drop tokens whose text is an
    NLTK English stop word, and join the lemmas of the remaining tokens with
    single spaces.

    Parameters
    ----------
    text : str
        Raw document text. Non-string values (for example the NaN pandas uses
        for an empty CSV cell) are treated as an empty document instead of
        raising AttributeError on ``.lower()``.

    Returns
    -------
    str
        Space-separated lemmas; '' when there is nothing to process.
    """
    # Missing / non-text cells have nothing to clean.
    if not isinstance(text, str):
        return ''
    # lowercase text
    text = text.lower()
    # remove special characters, punctuation, and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # tokenize and lemmatize the text
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.text not in _ENGLISH_STOP_WORDS]
    # join the tokens back into a string
    return ' '.join(tokens)

# Run the cleaning function over every document in 'combined_text' and store
# the result in a new 'preprocessed_text' column.
df['preprocessed_text'] = df['combined_text'].apply(preprocess_text)

# The raw column is not needed once the cleaned text exists, so remove it.
df = df.drop(columns=['combined_text'])


In [None]:
# Preview the frame: 'preprocessed_text' should be present and 'combined_text' gone.
df.head()

In [None]:
# Spot-check the cleaned text of the row labelled 4.
print(df.loc[4, "preprocessed_text"])

### Vectorization: Using BoW to Transform Preprocessed Text Data for LDA

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: skip terms that occur in more than 95% of the
# documents or in fewer than 2 documents, and remove scikit-learn's built-in
# English stop words.
vectorizer = CountVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary from the cleaned text and build the term-count matrix.
bow_matrix = vectorizer.fit_transform(df['preprocessed_text'])

### Vectorization: Using TF-IDF to Transform Preprocessed Text Data for NMF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer: same document-frequency cut-offs as the bag-of-words
# vectorizer, with the vocabulary capped at 1000 terms.
tfidf_vect = TfidfVectorizer(
    max_features=1000,
    min_df=2,
    max_df=0.95,
)

# Fit on the cleaned text; the result is a sparse document-by-term matrix.
tfidf_matrix = tfidf_vect.fit_transform(df['preprocessed_text'])


### "tfidf_matrix" is what we will use for topic modeling; it will also serve as the predictors for topic classification

### To better understand the "tfidf_matrix" 

In [None]:
# View the first five documents as a dense table, one column per vocabulary term.
# .toarray() returns a plain ndarray; on SciPy sparse matrices .todense() returns
# the legacy np.matrix type, which NumPy no longer recommends.
dense_tfidf = tfidf_matrix[:5].toarray()
pd.DataFrame(dense_tfidf, columns=tfidf_vect.get_feature_names_out())


In [None]:
# display the shape of the matrix: (documents, vocabulary terms kept by tfidf_vect)
print("Shape of tfidf_matrix:", tfidf_matrix.shape)

In [None]:
# List every term with a positive TF-IDF weight in one document (the first one).
doc = 0
feature_names = tfidf_vect.get_feature_names_out()
print(f"Non-zero TF-IDF values in document {doc}:")
# Densify just this document's row so it lines up one-to-one with feature_names.
row_weights = tfidf_matrix[doc].toarray().flatten()
for term, weight in zip(feature_names, row_weights):
    if weight > 0:
        print(f"{term}: {weight}")

### Other Features

How we use the other features — for the topic modeling task or for the topic classification task — depends on the nature of these features and the goals of our analysis.

We will get back to "other features" when we do our topic modeling task. 

We intend to use the other features as predictors for our topic classification task, but we need to wait until we obtain the topic labels from our topic modeling results, because we will need to evaluate which "other features" have a meaningful relationship with the topic labels. We may choose only one of them, or a combination of a few of them, based on considerations of model complexity and interpretability.

In [None]:
# Save the dataset with the preprocessed text to OUTPUT.
# index=False leaves the DataFrame index out of the CSV.
df.to_csv(OUTPUT, index=False)

In [None]:
from scipy import sparse

# Persist both sparse document-term matrices (TF-IDF first, then bag-of-words).
for matrix_path, matrix in ((TFIDF_MAT, tfidf_matrix), (BOW_MAT, bow_matrix)):
    sparse.save_npz(matrix_path, matrix)

# Pickling the vectorizers for use in other notebooks

To analyze the different topics visually, we can print the words associated with each topic. To do this, we need access to the vectorizer instances across different Jupyter notebooks.

In [None]:
import pickle
from pathlib import Path

# COUNT_VEC points into a sub-directory; create it first so that open(..., 'wb')
# does not fail with FileNotFoundError when the directory is missing.
Path(COUNT_VEC).parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted CountVectorizer.
with open(COUNT_VEC, 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:

from pathlib import Path

# Make sure the target directory exists before writing; open(..., 'wb') raises
# FileNotFoundError otherwise.
Path(TFIDF_VEC).parent.mkdir(parents=True, exist_ok=True)

# Serialize the fitted TfidfVectorizer.
with open(TFIDF_VEC, 'wb') as f:
    pickle.dump(tfidf_vect, f)