In [1]:
import numpy as np
import scipy.sparse as sp
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
from scipy.stats import chi2_contingency
from gensim import corpora, models
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora.dictionary import Dictionary
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import spacy
import re
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\qfu88\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df = pd.read_csv('processced text.csv')
df.head()

Unnamed: 0,type_of_material,news_desk,section_name,author,preprocessed_text
0,News,Lifestyle & Leisure,Home & Garden,Penelope Green,point view prescriptive descriptive definite...
1,News,Lifestyle & Leisure,Home & Garden,Marianne Rohrlich,ten highend design gallery offer percent m...
2,News,Business,Business Day,Lauren Laughlin,banker may suffer decline dealmaking also take...
3,News,National,U.S.,Damien Cave,florida scale back purchase sugar company land...
4,News,Business,Technology,Bloomberg News,taketwo interactive maker grand theft auto acc...


In [3]:
print(df.loc[4, "preprocessed_text"])

taketwo interactive maker grand theft auto accuse backdate option historically low price scheme reward key employee taketwo interactive software maker grand theft auto video game agree pay   million settle lawsuit security exchange commission accuse company backdate stock option video game maker pay   million settle stock option case suit litigation taketwo interactive software inc stock option purchase plan security commodity violation


### "Best Model" used on the newly preprocessed data

In [4]:
#remove common words
#list of common words to remove
common_words_to_remove = ["new", "york", "year", "city"]

# define a function to remove those words from "preprocessed_text" feature
def remove_common_words(text, words_to_remove):
    tokens = text.split()  
    filtered_tokens = [token for token in tokens if token not in words_to_remove]
    return ' '.join(filtered_tokens)  

df['preprocessed_text'] = df['preprocessed_text'].apply(lambda text: remove_common_words(text, common_words_to_remove))


In [5]:
#vectorized the newly preprocessed data

tfidf_vect = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000)

# apply it to preprocessed text
tfidf_matrix = tfidf_vect.fit_transform(df['preprocessed_text'])

In [6]:

df['preprocessed_text'] = df['preprocessed_text'].apply(lambda x: x.split())
texts = df['preprocessed_text'].tolist()
# create a gensim dictionary from the texts
dictionary = corpora.Dictionary(texts)

In [7]:
feature_names = tfidf_vect.get_feature_names_out()

In [8]:
#Set random state
random_state = 696

In [9]:
with open('out/best_model.pkl', 'rb') as g:
    best_model = pickle.load(g)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [10]:
W = best_model.fit_transform(tfidf_matrix)
H = best_model.components_

AttributeError: 'NMF' object has no attribute 'alpha'

In [None]:
#calculate the coherence score.
num_top_words = 50
topics = []

for topic_idx, topic in enumerate(H):
    top_features_ind = topic.argsort()[-num_top_words:][::-1]
    top_features = [feature_names[i] for i in top_features_ind]
    topics.append(top_features)


coherence_model = CoherenceModel(topics=topics, texts=texts, dictionary=dictionary, coherence='c_v')
coherence_score = coherence_model.get_coherence()

In [None]:
coherence_score

In [None]:
#get the top 50 words of each topic
num_top_words = 100

for topic_idx, topic in enumerate(H):
    print(f"Topic {topic_idx + 1}:")
    top_features_ind = topic.argsort()[-num_top_words:][::-1]  
    top_features = [feature_names[i] for i in top_features_ind]
    top_features_str = ", ".join(top_features)
    print(top_features_str)

### Descriptive label for each topic and article-topic distribution

Topic 1: Cultural, Social, and Urban Life

Topic 2: Personal Events and Ceremonies

Topic 3: Politics, Governance, and International Affairs

In [None]:
# Determine the dominant topic for each article
dominant_topic = np.argmax(W, axis=1)

# Count the number of articles associated with each topic
topic_counts = np.bincount(dominant_topic)

plt.figure(figsize=(4, 4))
plt.pie(topic_counts, labels=['Topic 1', 'Topic 2', 'Topic 3'], autopct='%1.1f%%', startangle=140, colors=['lightblue', 'lightgreen', 'lightcoral'])
plt.title('Proportion of Articles by Dominant Topic')
plt.show()


### Assigning the topic label to each article

In [None]:

# add topic label column
df['topic_label'] = dominant_topic

topic_names = {0: 'Lifestyle', 1: 'Events', 2: 'Politics'}

df['topic_label'] = df['topic_label'].map(topic_names)


In [None]:
df.head()

## Examine the predictors

### Correlation Analysis
perform a chi-square test of independence to see if there is a significant association between each of these predictors and the topic_label.

The Chi-square test will help us understand whether the distribution of topic labels is independent of these categorical variables or if there's a significant association between them.

In [None]:


# Convert to num
df['type_of_material_num'] = df['type_of_material'].astype('category').cat.codes
df['news_desk_num'] = df['news_desk'].astype('category').cat.codes
df['section_name_num'] = df['section_name'].astype('category').cat.codes
df['author_num'] = df['author'].astype('category').cat.codes

# perform chi-square test for each feature against 'topic_label'
features = ['type_of_material_num', 'news_desk_num', 'section_name_num', 'author_num']
for feature in features:
    contingency_table = pd.crosstab(df[feature], df['topic_label'])
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    print(f"Chi-square test for {feature} vs topic_label:")
    print(f"Chi-square statistic: {chi2}, p-value: {p}\n")


### Interpreting the result

all four features (type_of_material, news_desk, section_name, author) show a statistically significant association with the topic_label. This means these features could potentially be useful predictors in our topic classification task. Their inclusion could improve the model's ability to accurately predict the topic of an article.

### To avoid high dimentionality in the models

In [None]:
print("Value Counts for 'type_of_material':")
print(df['type_of_material'].value_counts())

print("\nValue Counts for 'news_desk':")
print(df['news_desk'].value_counts())

print("\nValue Counts for 'section_name':")
print(df['section_name'].value_counts())

print("\nValue Counts for 'author':")
print(df['author'].value_counts())


### Dimensionality Reduction Strategy:

Based on the distribution of each feature, identifying the top n categories for each feature based on their frequency and then grouping the less frequent categories into an "Other" category. 

Top 20 authors
Top 10 news desks
Top 10 section names



In [None]:

top_authors = df['author'].value_counts().nlargest(20).index
top_news_desks = df['news_desk'].value_counts().nlargest(10).index
top_section_names = df['section_name'].value_counts().nlargest(10).index

# group other categories into 'Other'
df['author_reduced'] = df['author'].apply(lambda x: 'Other' if x not in top_authors else x)
df['news_desk_reduced'] = df['news_desk'].apply(lambda x: 'Other' if x not in top_news_desks else x)
df['section_name_reduced'] = df['section_name'].apply(lambda x: 'Other' if x not in top_section_names else x)


In [None]:
df.head()

In [None]:
# drop no longer needed columns
df_model = df.drop(['news_desk', 'section_name', 'author', 
                    'type_of_material_num', 'news_desk_num', 'section_name_num', 'author_num'], axis=1)



In [None]:
df_model.head()

## Getting df ready for ML

In [None]:
# one-hot encoding
df_encoded = pd.get_dummies(df_model, columns=['type_of_material', 'author_reduced', 'news_desk_reduced', 'section_name_reduced'])


In [None]:
# Vectorize preprocessed_text

# convert list of words in 'preprocessed_text' to a string
df_encoded['preprocessed_text'] = df_encoded['preprocessed_text'].apply(lambda x: ' '.join(x))


tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=1000) 

# Fit and transform 'preprocessed_text' to obtain TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(df_encoded['preprocessed_text'])


In [None]:
# combine tfid_matrix with encoded features

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# drop 'preprocessed_text' 
df_encoded.drop('preprocessed_text', axis=1, inplace=True)


df_final = pd.concat([df_encoded.reset_index(drop=True), tfidf_df.reset_index(drop=True)], axis=1)


#### Now, the df_final is the one used for topic classification models, containing both the vectorized preprocessed_text predictor and one-hot encoded other predictors. 

In [None]:
df_final.head()

In [None]:
with open('out/df_final.pkl', 'wb') as f:
    pickle.dump(df_final, f)

In [None]:
with open('out/dominant_topic_labels.pkl', 'wb') as f:
    pickle.dump(dominant_topic, f)