# Importing Libraries

In [5]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import normalize
from sklearn.decomposition import NMF
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import gensim
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

# Importing data

In [6]:
content_wehco = pd.read_csv("C:/Users/adity/Downloads/SRC Project/LDA/content_lvl2_0802.csv")

# Clean headline column

In [7]:
content_wehco["headline"] = content_wehco["headline"].astype('str')
content_wehco["headline"] = content_wehco["headline"].str.encode("ascii", "ignore").str.decode('ascii')

# Creating bigrams / trigrams

In [8]:
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('central arkansas','centralarkansas')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('little rock','littlerock')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('donald trump','donaldtrump')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('covid 19','coronavirus')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('covid-19','coronavirus')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('covid','coronavirus')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('dear abby','dearabby')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('pages from the past','pagesfromthepast')

content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('new york','ny')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('syracuse university','su')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('president trump','trump')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('donald trump','trump')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('new year','new year')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("trump's",'trump')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('president obama','obama')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('dear abby','dearabby')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('pm buzz','pmbuzz')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('viral video','viralvideo')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('star wars','starwars')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('white house','whitehouse')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('high school','highschool')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("women's",'woman')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("men's",'men')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("buffalo-bill",'buffalobill')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("boeheims",'boeheim')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("boeheim's",'boeheim')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("jim boeheim",'boeheim')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("information",'info')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace("notre dame",'notredame')
content_wehco['headline'] = content_wehco['headline'].str.lower().str.replace('syracuse','')

# Create new column for level 1 tags

In [9]:
content_wehco["nlp_level_1"] = "Others"

matches = ["BRIDGE", "CROSSWORD", "DOKU", "SLEUTH", "DOKO", "JUM", "TOSHIKI", "TORI"]

for i in range(0, len(content_wehco)):
    if (re.search("Obituaries", str(content_wehco.headline[i]), re.IGNORECASE)):
        content_wehco["nlp_level_1"][i] = "Obituaries"
    elif (re.search("Obituary", str(content_wehco.headline[i]), re.IGNORECASE)):
        content_wehco["nlp_level_1"][i] = "Obituaries"
    elif ("Opinion" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Opinion"
    elif ("Weather" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Weather"
    elif ("Promotions" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Promotions"
    elif any(x in str(content_wehco.headline[i]).upper() for x in matches):
        content_wehco["nlp_level_1"][i] = "Games"
    elif ("Coronavirus" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Coronavirus"
    elif ("Business" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Business"
    elif ("Editorial" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Editorial"
    elif ("Crime" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Crime"
    elif ("Entertainment" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Entertainment"
    elif ("Sports" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "Sports"
    elif ("News" in str(content_wehco.cmsCategories[i])):
        content_wehco["nlp_level_1"][i] = "News"

# Use ML to classify "others" as level 1 tag

# Stemming & Tokenizing

In [11]:
extra_stop_words = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', "%", 'm', 'p', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 
 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', "about", "across", "after", "all", 
 "also", "an", "and", "another", "added", "any", "are", "as", "at", "basically", "be", "because", 'become', "been", "before", 
 "being", "between", "both", "but", "by", "came", "can", "come", "could", "did", "do", "does", "each", "else", "every",
 "either", "especially", "for", "from", "get", "given", "gets", 'give', 'gives', "got", "goes", "had", "has", "have", "he", 
 "her", "here", "him", "himself", "his", "how", "if", "in", "into", "is", "it", "its", "just", "lands", "like", "make", 
 "making", "made", "many", "may", "me", "might", "more", "most", "much", "must", "my", "never", "provide", "provides", 
 "perhaps", "no", "now", "of", "on", "only", "or", "other", "our", "out", "over", "re", "said", "same", "see", "should", 
 "since", "so", "some", "still", "such", "seeing", "see", "take", "than", "that", "the", "their", "them", "then", "there", 
 "these", "they", "this", "those", "through", "to", "too", "under", "up", "use", "using", "used", "underway", "very", "want", 
 "was", "way", "we", "well", "were", "what", "when", "where", "which", "while", "whilst", "who", "will", "with", "would", 
 "you", "your", 'etc', 'via', 'eg', 'news', "'s", "april", "march", "june", "july", "august", "february", "january", "december", 
 "september", "october", "dec", "oct", "nov", "today's", "'the'", "sept", "feb", "jan", "amp", "say", "'the", "friday's", "friday",
 "saturday", "saturday's", "sunday", "sunday's"]
stop_words = text.ENGLISH_STOP_WORDS.union(extra_stop_words)

content_wehco['headline'] = content_wehco['headline'].str.replace('obituaries','obits')
content_wehco['headline'] = content_wehco['headline'].str.replace('obituary','obits')

stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')
wn = WordNetLemmatizer()

def tokenize(text):
    return [wn.lemmatize(word) for word in tokenizer.tokenize(text.lower())]

# Creating subset without "others"

In [14]:
content_subset_without_others = content_wehco[content_wehco["nlp_level_1"] != "Others"]
content_wehco["pred"] = content_wehco["nlp_level_1"]
content_wehco["conf"] = 0
content_wehco["pred"].value_counts()

Others           50798
News             49448
Sports           24360
Entertainment    18907
Crime            10658
Business          9241
Coronavirus       7355
Editorial         7248
Obituaries        6256
Games             5965
Promotions        4160
Opinion           1606
Weather            928
Name: pred, dtype: int64

In [15]:
content_subset_without_others["nlp_level_1"].value_counts()

News             49448
Sports           24360
Entertainment    18907
Crime            10658
Business          9241
Coronavirus       7355
Editorial         7248
Obituaries        6256
Games             5965
Promotions        4160
Opinion           1606
Weather            928
Name: nlp_level_1, dtype: int64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(content_subset_without_others['headline'], content_subset_without_others['nlp_level_1'], random_state = 0)
count_vect = CountVectorizer(min_df = 30, stop_words = stop_words, tokenizer = tokenize, analyzer = 'word', max_features = 50000)
X_train_counts = count_vect.fit_transform(X_train)
tfidf = TfidfTransformer(sublinear_tf = True, norm = 'l2', smooth_idf = False)
X_train_tfidf = tfidf.fit_transform(X_train_counts)
clf = CalibratedClassifierCV(LinearSVC()).fit(X_train_tfidf, y_train)

In [17]:
content_wehco["pred"] = clf.predict(count_vect.transform(content_wehco["headline"]))
content_wehco["conf"] = (clf.predict_proba(count_vect.transform(content_wehco["headline"]))).max(axis = 1)
content_wehco["pred"].value_counts()

News             72781
Sports           33871
Entertainment    25110
Crime            19092
Business         10791
Coronavirus       8923
Obituaries        6261
Promotions        6211
Games             5750
Editorial         5528
Weather           1679
Opinion            933
Name: pred, dtype: int64

In [18]:
content_wehco["final_pred"] = np.where((content_wehco["nlp_level_1"] == "Others") & (content_wehco["conf"] >= 0.6), content_wehco["pred"], content_wehco["nlp_level_1"])
content_wehco['counts'] = content_wehco['final_pred'].map(content_wehco['final_pred'].value_counts())
content_wehco.to_csv("content_wehco.csv")

In [20]:
content_wehco["final_pred"].value_counts()

News             57536
Sports           29661
Others           27243
Entertainment    23549
Crime            11749
Business         10430
Coronavirus       9353
Editorial         7522
Obituaries        6257
Games             5975
Promotions        4944
Opinion           1674
Weather           1037
Name: final_pred, dtype: int64

# Subsetting each of the tags

In [21]:
content_subset_obit = content_wehco[content_wehco["final_pred"] == "Obituaries"]
content_subset_weather = content_wehco[content_wehco["final_pred"] == "Weather"]
content_subset_games = content_wehco[content_wehco["final_pred"] == "Games"] # political
content_subset_business = content_wehco[content_wehco["final_pred"] == "Business"]
content_subset_promotions = content_wehco[content_wehco["final_pred"] == "Promotions"] # landc
content_subset_opinion = content_wehco[content_wehco["final_pred"] == "Opinion"]
content_subset_crime = content_wehco[content_wehco["final_pred"] == "Crime"]
content_subset_editorial = content_wehco[content_wehco["final_pred"] == "Editorial"] # local news
content_subset_entertainment = content_wehco[content_wehco["final_pred"] == "Entertainment"]
content_subset_sports = content_wehco[content_wehco["final_pred"] == "Sports"]
content_subset_news = content_wehco[content_wehco["final_pred"] == "News"]
content_subset_others = content_wehco[content_wehco["final_pred"] == "Others"]
content_subset_covid = content_wehco[content_wehco["final_pred"] == "Coronavirus"]

# Methods to try
1. K Means
2. Guided LDA
3. LDA
4. NMF

# K Means

## Obits

In [23]:
content_subset_headline = content_subset_obit[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 3
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : obit, lee, william, jr, robert, jean, john, charles, david
Cluster 1 : james, obit, jr, edward, sr, lee, thomas, david, williams
Cluster 2 : ann, obit, patricia, mary, shirley, carolyn, betty, margaret, barbara


## Weather

In [24]:
content_subset_headline = content_subset_weather[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 4
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : arkansas, state, flooding, rain, flood, snow, river, wind, levee
Cluster 1 : weather, severe, arkansas, risk, forecaster, possible, state, storm, winter
Cluster 2 : tornado, arkansas, storm, damage, state, hit, weather, strong, wind
Cluster 3 : storm, power, south, state, arkansas, hit, wind, severe, county


## Games

In [25]:
content_subset_headline = content_subset_games[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 5
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : sudoku, killer, conceptis, pro, littlerock, crossword, commuter, historic, kid
Cluster 1 : jumble, daily, crossword, kid, commuter, tv, puzzle, sudoku, year
Cluster 2 : crossword, universal, premier, tv, observer, puzzle, year, jump, jumble
Cluster 3 : editorial, page, guide, new, time, kid, arkansas, year, set
Cluster 4 : bridge, time, la, historic, crossword, futoshiki, hitori, wordsleuth, ace


## Business

In [26]:
content_subset_headline = content_subset_business[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 4
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : stock, bankruptcy, watch, arkansas, index, real, estate, building, transaction
Cluster 1 : business, people, northwest, arkansas, nw, company, report, firm, walmart
Cluster 2 : brief, business, northwest, arkansas, open, year, fed, farm, farmer
Cluster 3 : pay, retailer, settle, walmart, company, lawsuit, store, bank, apple


## Promotions

In [27]:
content_subset_headline = content_subset_promotions[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 5
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : new, cabot, searcy, year, community, area, county, spring, arkansas
Cluster 1 : conway, ann, man, woman, artist, school, new, project, work
Cluster 2 : center, senior, community, white, director, work, child, bryant, present
Cluster 3 : russellville, new, young, mayor, park, city, year, chief, highschool
Cluster 4 : coach, state, title, win, football, new, lead, team, longtime


## Opinion

In [28]:
content_subset_headline = content_subset_opinion[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 5
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : masterson, online, mike, read, opinion, letter, writer, rex, nwa
Cluster 1 : rex, nelson, john, nwa, editorial, brummett, guest, writer, opinion
Cluster 2 : opinion, editorial, writer, rex, read, online, nwa, nelson, mike
Cluster 3 : letter, nwa, writer, rex, read, opinion, online, nelson, mike
Cluster 4 : brummett, online, read, john, opinion, writer, rex, nwa, nelson


## Crime

In [29]:
content_subset_headline = content_subset_crime[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 7
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : littlerock, police, man, north, shot, shooting, woman, home, robbed
Cluster 1 : arkansas, man, police, shooting, suspect, burglary, charge, accused, case
Cluster 2 : arrested, man, arkansas, police, littlerock, shooting, suspect, charge, woman
Cluster 3 : beat, police, man, arkansan, sentenced, threat, lead, assault, outside
Cluster 4 : guilty, pleads, arkansas, man, innocent, ex, case, year, woman
Cluster 5 : year, old, man, arkansas, littlerock, term, girl, sentenced, prison
Cluster 6 : charged, man, arkansas, murder, woman, shooting, death, slaying, killing


## Entertainment

In [30]:
content_subset_headline = content_subset_entertainment[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 6
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : dearabby, arkansas, home, talk, calendar, new, restaurant, space, movie
Cluster 1 : horoscope, holiday, opinion, flavor, foundation, forward, food, folk, focus
Cluster 2 : recipe, cake, alley, idea, food, favorite, dish, chicken, flavor
Cluster 3 : wedding, plan, dearabby, william, dr, thomas, elizabeth, anniversary, new
Cluster 4 : hint, helpful, opinion, gizmo, free, foundation, forward, food, folk
Cluster 5 : super, quiz, word, title, literature, science, film, letter, country


## Editorial

In [31]:
content_subset_headline = content_subset_editorial[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 7
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : cartoon, deering, john, trump, impeachment, state, democrat, new, don't
Cluster 1 : burner, best, martin, seller, opinion, time, good, life, philip
Cluster 2 : nelson, rex, arkansas, opinion, state, big, time, long, holiday
Cluster 3 : columnist, impeachment, trump, bernie, democrat, new, day, save, wa
Cluster 4 : letter, editor, opinion, john, year, ha, democrat, don't, free
Cluster 5 : masterson, mike, opinion, change, good, matter, thing, bad, don't
Cluster 6 : brummett, john, it's, opinion, new, joe, question, man, matter


## Sports

In [32]:
content_subset_headline = content_subset_sports[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 8
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : sport, brief, state, roundup, arkansas, outdoors, honor, tech, earn
Cluster 1 : hog, ex, sec, recruiting, texas, visit, offer, commits, future
Cluster 2 : ua, game, win, razorback, coach, state, team, hall, title
Cluster 3 : arkansas, postcard, past, live, visit, state, texas, game, recruiting
Cluster 4 : post, bass, loss, ua, score, favorite, bad, looking, defense
Cluster 5 : football, highschool, prep, basketball, roundup, schedule, coach, college, week
Cluster 6 : season, end, start, hog, arkansas, open, nfl, final, opener
Cluster 7 : wire, green, finish, late, victory, win, yurachek, ground, greenwood


## News

In [33]:
content_subset_headline = content_subset_news[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 8
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : trump, county, state, littlerock, new, school, notebook, plan, judge
Cluster 1 : ua, notebook, gift, student, trustee, campus, scholarship, tuition, aid
Cluster 2 : brief, nation, world, washington, state, capitol, file, law, legislative
Cluster 3 : arkansas, northwest, court, roundup, supreme, arrest, appeal, state, achiever
Cluster 4 : police, man, crash, killed, littlerock, arkansas, dy, officer, year
Cluster 5 : day, face, closing, memorial, veteran, labor, long, election, year
Cluster 6 : lr, city, board, school, littlerock, spa, plan, district, ok
Cluster 7 : record, daily, filing, crawford, county, set, jefferson, marriage, meeting


## Others

In [34]:
content_subset_headline = content_subset_others[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 8
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : state, new, littlerock, arkansas, best, online, year, set, case
Cluster 1 : arkansas, week, state, case, county, test, south, economic, men
Cluster 2 : tv, talk, littlerock, set, leave, howard, idea, watch, people
Cluster 3 : weather, road, draw, watch, travel, arkansas, threat, california, step
Cluster 4 : benton, ann, james, robert, faye, elizabeth, mary, sr, william
Cluster 5 : bluff, pine, mr, johnson, sr, jr, ann, lee, george
Cluster 6 : home, mountain, stay, week's, expensive, sold, littlerock, cost, state
Cluster 7 : service, memorial, helena, arkansas, la, fort, firm, library, department


## Coronavirus

In [35]:
content_subset_headline = content_subset_covid[['headline']]
document = list(content_subset_headline["headline"])
vectorizer = TfidfVectorizer(sublinear_tf = True, min_df = 30, norm = 'l2', stop_words = stop_words, tokenizer = tokenize)
X = vectorizer.fit_transform(document)
true_k = 5
model = KMeans(n_clusters = true_k, init = 'k-means++', max_iter = 500, n_init = 20)
model.fit(X)
terms = vectorizer.get_feature_names()
words = model.cluster_centers_.argsort()[:,-1:-10:-1]
for num, centroid in enumerate(words):
    print('Cluster ' + str(num) + ' : ' + ', '.join(terms[word] for word in centroid))

Cluster 0 : brief, nation, world, washington, state, sport, county, governor, trump
Cluster 1 : state, case, coronavirus, site, screening, offer, virus, death, rise
Cluster 2 : coronavirus, arkansas, test, positive, case, la, update, dy, patient
Cluster 3 : face, pandemic, trump, arkansas, aid, reopening, stock, plan, mask
Cluster 4 : virus, case, death, trump, hit, arkansas, offer, spread, aid


# Guided LDA

In [36]:
# simplify Penn tags to n (NOUN), v (VERB), a (ADJECTIVE) or r (ADVERB)
def simplify(penn_tag):
    pre = penn_tag[0]
    if (pre == 'J'):
        return 'a'
    elif (pre == 'R'):
        return 'r'
    elif (pre == 'V'):
        return 'v'
    else:
        return 'n'
    
def preprocess(text):
    toks = gensim.utils.simple_preprocess(str(text), deacc = True)
    wn = WordNetLemmatizer()
    return [wn.lemmatize(tok, simplify(pos)) for tok, pos in nltk.pos_tag(toks) if tok not in stop_words]

def test_eta_lda(eta, dictionary, ntopics, print_topics = True, print_dist = True):
    np.random.seed(42) # set the random seed for repeatability
    bow = [dictionary.doc2bow(line) for line in corp] # get the bow-format lines with the set dictionary
    with (np.errstate(divide = 'ignore')):  # ignore divide-by-zero warnings
        model = gensim.models.ldamodel.LdaModel(
            corpus = bow, id2word = dictionary, num_topics = ntopics,
            random_state = 42, eta = eta, alpha = 'auto')
    # visuzlize the model term topics
    print('Perplexity: {:.2f}'.format(model.log_perplexity(bow)))
    if print_topics:
        # display the top terms for each topic
        for topic in range(ntopics):
            print('Topic {}: {}'.format(topic, [dictionary[w] for w, p in model.get_topic_terms(topic, topn = 10)]))
#     if print_dist:
#         # display the topic probabilities for each document
#         for line, bag in zip(txt, bow):
#             doc_topics = ['({}, {:.1%})'.format(topic, prob) for topic, prob in model.get_document_topics(bag)]
#             print('{} {}'.format(line, doc_topics))
    return model

def test_eta_nmf(eta, dictionary, train_headlines_sentences, ntopics, print_topics = True, print_dist = True):
    vectorizer = CountVectorizer(analyzer = 'word', max_features = 50000, min_df = 30, stop_words = stop_words, tokenizer = tokenize)
    x_counts = vectorizer.fit_transform(train_headlines_sentences)
    # we set a TfIdf Transformer, and transform the counts with the model
    transformer = TfidfTransformer(smooth_idf = False, sublinear_tf = True, norm = 'l2')
    x_tfidf = transformer.fit_transform(x_counts)
    # normalize the TfIdf values to unit length for each row
    xtfidf_norm = normalize(x_tfidf, norm = 'l1', axis = 1)
    # obtain an NMF model
    model = NMF(n_components = ntopics, init = 'nndsvd');
    # fit the model
    model.fit(xtfidf_norm)
    if print_topics:
        # display the top terms for each topic
        feat_names = vectorizer.get_feature_names()
        word_dict = {}
        for i in range(ntopics):
            # for each topic, obtain the largest values, and add the words they map to into the dictionary.
            words_ids = model.components_[i].argsort()[:-10 - 1:-1]
            words = [feat_names[key] for key in words_ids]
            word_dict['Topic #' + '{:02d}'.format(i + 1)] = words
        print(pd.DataFrame(word_dict))
    return model

def create_eta(priors, etadict, ntopics):
    eta = np.full(shape = (ntopics, len(etadict)), fill_value = 1) # create a (ntopics, nterms) matrix and fill with 1
    for word, topic in priors.items(): # for each word in the list of priors
        keyindex = [index for index,term in etadict.items() if term == word] # look up the word in the dictionary
        if (len(keyindex) > 0): # if it's in the dictionary
            eta[topic, keyindex[0]] = 1e7  # put a large number in there      
    eta = np.divide(eta, eta.sum(axis = 0)) # normalize so that the probabilities sum to 1 over all topics
    return eta

# LDA and NMF

## Obits

In [37]:
txt = list(content_subset_obit["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 3)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 3)

Perplexity: -6.63
Topic 0: ['obit', 'james', 'lee', 'robert', 'jr', 'sr', 'smith', 'thomas', 'brown', 'ann']
Topic 1: ['obit', 'mary', 'wayne', 'ruth', 'elizabeth', 'john', 'ann', 'honor', 'project', 'murder']
Topic 2: ['obit', 'ann', 'william', 'charles', 'jean', 'michael', 'david', 'williams', 'lee', 'sue']
  Topic #01 Topic #02 Topic #03
0      obit     james       lee
1       ann        jr    robert
2   william    edward        jr
3    robert        sr        sr
4      jean    thomas   william
5        jr    robert     david
6   charles     david   richard
7      mary   william    edward
8      john   michael    thomas
9     david  williams     smith


NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=3, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Weather

In [38]:
txt = list(content_subset_weather["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 4)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 4)

Perplexity: -6.40
Topic 0: ['storm', 'arkansas', 'flood', 'state', 'severe', 'littlerock', 'tornado', 'damage', 'forecaster', 'weather']
Topic 1: ['arkansas', 'rain', 'snow', 'state', 'storm', 'flood', 'heavy', 'part', 'weather', 'expect']
Topic 2: ['arkansas', 'storm', 'weather', 'state', 'tornado', 'severe', 'forecaster', 'flood', 'possible', 'hit']
Topic 3: ['storm', 'state', 'tornado', 'arkansas', 'damage', 'severe', 'wind', 'strong', 'risk', 'flood']
  Topic #01 Topic #02   Topic #03   Topic #04
0     storm   tornado       flood    arkansas
1     state     state       state     weather
2       hit    damage       river        snow
3     south       hit       levee       state
4     power      kill         hit       river
5      kill   weather        rain        rain
6    damage      wind  littlerock  forecaster
7    severe    strong         set      severe
8      wind  possible       heavy        risk
9      risk     bring      county    possible


NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=4, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Games

In [39]:
txt = list(content_subset_games["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 5)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 5)

Perplexity: -6.06
Topic 0: ['arkansas', 'time', 'crossword', 'story', 'puzzle', 'historic', 'ny', 'conceptis', 'sudoko', 'littlerock']
Topic 1: ['crossword', 'universal', 'premier', 'tv', 'sudoku', 'trump', 'rhetoric', 'district', 'jumble', 'time']
Topic 2: ['daily', 'crossword', 'sudoku', 'jumble', 'killer', 'bridge', 'crystal', 'commuter', 'hitori', 'pro']
Topic 3: ['editorial', 'jumble', 'bridge', 'sudoku', 'conceptis', 'kid', 'crossword', 'ace', 'tv', 'victory']
Topic 4: ['bridge', 'historic', 'jump', 'new', 'futoshiki', 'state', 'wordsleuth', 'set', 'motorist', 'man']
   Topic #01  Topic #02   Topic #03  Topic #04 Topic #05
0  editorial  crossword      sudoku     jumble    bridge
1       page  universal   conceptis      daily       ace
2      guide    premier      killer         tv   crystal
3        new         tv         pro        kid  arkansas
4       time   observer      sudoko  crossword      lane
5        kid       time  littlerock   commuter     close
6      story         

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Business

In [40]:
txt = list(content_subset_business["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 4)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 4)

Perplexity: -7.97
Topic 0: ['stock', 'rate', 'china', 'state', 'tariff', 'year', 'fall', 'plant', 'virus', 'face']
Topic 1: ['china', 'trump', 'trade', 'deal', 'oil', 'new', 'talk', 'firm', 'set', 'say']
Topic 2: ['business', 'brief', 'facebook', 'bid', 'china', 'eu', 'struggle', 'tariff', 'people', 'heat']
Topic 3: ['stock', 'rise', 'sale', 'trump', 'trade', 'huawei', 'bankruptcy', 'record', 'home', 'china']
   Topic #01  Topic #02 Topic #03   Topic #04
0      brief   business     index       watch
1   business     people  arkansas  bankruptcy
2       open      award     stock        file
3  apartment  northwest       end     concern
4   official   arkansas      fall   developer
5      trade      small     state    arkansas
6  developer      state      gain       judge
7       file         nw     close     walmart
8    airport       care      drop      report
9    million    company       add         cut


NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=4, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Promotions

In [41]:
txt = list(content_subset_promotions["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 5)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 5)

Perplexity: -7.99
Topic 0: ['year', 'honor', 'artist', 'new', 'school', 'state', 'county', 'work', 'conway', 'arkansas']
Topic 1: ['new', 'director', 'conway', 'award', 'win', 'county', 'coach', 'miss', 'contestant', 'mayor']
Topic 2: ['center', 'area', 'arkansas', 'event', 'river', 'county', 'community', 'gardener', 'plant', 'honor']
Topic 3: ['center', 'new', 'conway', 'coach', 'benton', 'year', 'searcy', 'home', 'award', 'play']
Topic 4: ['family', 'honor', 'farm', 'county', 'arkansas', 'spring', 'operation', 'center', 'russellville', 'hot']
  Topic #01  Topic #02 Topic #03       Topic #04 Topic #05
0    conway     center     cabot    russellville    searcy
1       new     senior       new             new      year
2       ann   director       ann           young       new
3       man  community   teacher            year     woman
4     woman       open  director            park  business
5    artist        new     woman           mayor      help
6    school      child     coach    

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Opinion

In [42]:
txt = list(content_subset_opinion["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 5)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 5)

Perplexity: -7.14
Topic 0: ['opinion', 'editorial', 'nwa', 'guest', 'writer', 'nelson', 'rex', 'online', 'masterson', 'brummett']
Topic 1: ['online', 'brummett', 'opinion', 'masterson', 'john', 'nwa', 'editorial', 'read', 'mike', 'new']
Topic 2: ['opinion', 'letter', 'brummett', 'editorial', 'john', 'online', 'express', 'paul', 'nelson', 'rex']
Topic 3: ['letter', 'opinion', 'editorial', 'online', 'brummett', 'nelson', 'rex', 'masterson', 'writer', 'life']
Topic 4: ['online', 'masterson', 'rex', 'nelson', 'mike', 'read', 'opinion', 'time', 'arkansas', 'brummett']
   Topic #01  Topic #02 Topic #03  Topic #04  Topic #05
0     letter    opinion  brummett  editorial  masterson
1        nwa       read    online        nwa     online
2     online  editorial      john     writer       mike
3    opinion     writer      read        rex       read
4     writer      guest    writer       read     writer
5        rex       john       rex    opinion        rex
6       read        rex   opinion     

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Crime

In [43]:
txt = list(content_subset_crime["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 7)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 7)

Perplexity: -6.50
Topic 0: ['arkansas', 'charge', 'death', 'murder', 'case', 'face', 'arrest', 'county', 'man', 'teen']
Topic 1: ['littlerock', 'police', 'man', 'arrest', 'woman', 'arkansas', 'year', 'north', 'kill', 'shot']
Topic 2: ['arkansas', 'man', 'apartment', 'state', 'shoot', 'accuse', 'arrest', 'chase', 'driver', 'officer']
Topic 3: ['man', 'year', 'arkansas', 'old', 'guilty', 'girl', 'child', 'sentence', 'plead', 'porn']
Topic 4: ['arkansas', 'man', 'police', 'arrest', 'charge', 'woman', 'northwest', 'shoot', 'sheriff', 'littlerock']
Topic 5: ['police', 'beat', 'suspect', 'shooting', 'death', 'fatal', 'lr', 'homicide', 'man', 'investigate']
Topic 6: ['man', 'littlerock', 'police', 'hot', 'spring', 'sexual', 'arkansas', 'shoot', 'burglary', 'report']
         Topic #01   Topic #02    Topic #03 Topic #04   Topic #05 Topic #06  \
0             beat    burglary   littlerock      year      record    arrest   
1           police      report       police  arkansas       death   susp

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Entertainment

In [44]:
txt = list(content_subset_entertainment["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 6)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 6)

Perplexity: -8.90
Topic 0: ['dearabby', 'help', 'name', 'festival', 'big', 'kid', 'arkansas', 'wife', 'photo', 'bring']
Topic 1: ['garden', 'movie', 'old', 'face', 'home', 'hint', 'helpful', 'state', 'let', 'word']
Topic 2: ['new', 'littlerock', 'review', 'arkansas', 'restaurant', 'music', 'year', 'film', 'high', 'open']
Topic 3: ['arkansan', 'notable', 'dearabby', 'time', 'life', 'scene', 'language', 'watch', 'woman', 'ex']
Topic 4: ['holiday', 'horoscope', 'wedding', 'day', 'dearabby', 'mom', 'history', 'make', 'menu', 'close']
Topic 5: ['talk', 'hunt', 'treasure', 'play', 'note', 'entertainment', 'break', 'work', 'white', 'room']
   Topic #01 Topic #02   Topic #03    Topic #04 Topic #05 Topic #06
0  horoscope      hint       super      wedding  calendar      mind
1    holiday   helpful        quiz      william  religion      open
2    opinion   opinion        word         plan   weekend    garden
3      party       add        film  anniversary     brief      film
4      sweet       

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=6, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Editorial

In [45]:
txt = list(content_subset_editorial["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 7)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 7)

Perplexity: -8.53
Topic 0: ['columnist', 'need', 'time', 'right', 'war', 'don', 'health', 'care', 'stop', 'great']
Topic 1: ['john', 'cartoon', 'deering', 'best', 'rex', 'nelson', 'big', 'seller', 'burner', 'home']
Topic 2: ['life', 'good', 'day', 'let', 'school', 'work', 'state', 'trump', 'holiday', 'choice']
Topic 3: ['letter', 'arkansas', 'columnist', 'year', 'democrat', 'isn', 'road', 'money', 'point', 'tree']
Topic 4: ['guest', 'writer', 'hanson', 'column', 'stephens', 'bret', 'biden', 'gitz', 'know', 'bradley']
Topic 5: ['opinion', 'victor', 'american', 'davis', 'people', 'protect', 'rule', 'debate', 'matter', 'gun']
Topic 6: ['masterson', 'mike', 'martin', 'save', 'philip', 'columnist', 'opinion', 'old', 'watch', 'world']
  Topic #01    Topic #02 Topic #03 Topic #04 Topic #05 Topic #06  Topic #07
0    letter    columnist      john    burner      need    nelson   arkansas
1    editor        trump   cartoon       day    editor       rex        tom
2      vote  impeachment   deerin

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Sports

In [46]:
txt = list(content_subset_sports["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 8)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 8)

Perplexity: -8.48
Topic 0: ['qb', 'game', 'central', 'race', 'year', 'chief', 'finish', 'add', 'change', 'close']
Topic 1: ['game', 'new', 'title', 'run', 'set', 'time', 'big', 'win', 'home', 'lead']
Topic 2: ['arkansas', 'football', 'prep', 'basketball', 'nfl', 'college', 'past', 'littlerock', 'game', 'west']
Topic 3: ['wire', 'victory', 'defense', 'ualr', 'lead', 'northside', 'second', 'team', 'travs', 'ready']
Topic 4: ['state', 'sport', 'brief', 'win', 'playoff', 'live', 'season', 'recruiting', 'end', 'update']
Topic 5: ['hog', 'win', 'lr', 'rout', 'beat', 'cowboy', 'coach', 'league', 'christian', 'leave']
Topic 6: ['day', 'highschool', 'point', 'look', 'card', 'start', 'good', 'cup', 'offer', 'championship']
Topic 7: ['arkansas', 'open', 'guy', 'oaklawn', 'wolf', 'recruit', 'woman', 'visit', 'red', 'cub']
      Topic #01 Topic #02 Topic #03 Topic #04 Topic #05   Topic #06 Topic #07  \
0          wire     sport       hog      post   content          ua      game   
1         green 

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## News

In [47]:
txt = list(content_subset_news["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 8)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 8)

Perplexity: -8.40
Topic 0: ['county', 'trump', 'korea', 'talk', 'governor', 'gop', 'win', 'state', 'race', 'raise']
Topic 1: ['trump', 'senate', 'plan', 'close', 'say', 'divorce', 'work', 'lane', 'eu', 'wall']
Topic 2: ['nation', 'woman', 'day', 'kong', 'hong', 'police', 'abortion', 'troop', 'new', 'kill']
Topic 3: ['littlerock', 'arkansas', 'migrant', 'license', 'charge', 'marriage', 'washington', 'north', 'benton', 'northwest']
Topic 4: ['protest', 'dead', 'record', 'shoot', 'gun', 'man', 'body', 'abuse', 'campaign', 'witness']
Topic 5: ['trump', 'say', 'case', 'set', 'judge', 'iran', 'ex', 'trial', 'biden', 'probe']
Topic 6: ['death', 'school', 'vote', 'panel', 'state', 'aid', 'rise', 'tax', 'report', 'board']
Topic 7: ['brief', 'arkansas', 'face', 'name', 'world', 'court', 'kill', 'crash', 'hit', 'democrat']
  Topic #01 Topic #02 Topic #03   Topic #04  Topic #05   Topic #06  \
0       day    record      face       brief   straight     divorce   
1   closing     daily    charge     

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Others

In [48]:
txt = list(content_subset_others["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 8)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 8)

Perplexity: -10.02
Topic 0: ['littlerock', 'bluff', 'north', 'pine', 'city', 'expensive', 'mr', 'sell', 'head', 'beebe']
Topic 1: ['arkansas', 'new', 'state', 'place', 'case', 'day', 'smith', 'chief', 'death', 'free']
Topic 2: ['week', 'state', 'say', 'time', 'close', 'arkansas', 'court', 'littlerock', 'weather', 'virus']
Topic 3: ['open', 'school', 'year', 'photo', 'restaurant', 'online', 'spring', 'set', 'video', 'work']
Topic 4: ['benton', 'value', 'leave', 'statement', 'core', 'look', 'group', 'line', 'ann', 'library']
Topic 5: ['best', 'bet', 'site', 'new', 'high', 'west', 'good', 'long', 'flight', 'tell']
Topic 6: ['home', 'woman', 'man', 'littlerock', 'cost', 'view', 'cover', 'old', 'center', 'officer']
Topic 7: ['food', 'change', 'report', 'hall', 'life', 'board', 'drive', 'town', 'james', 'add']
    Topic #01   Topic #02  Topic #03 Topic #04 Topic #05   Topic #06  \
0    arkansas     weather     benton     state      best         new   
1        week      travel        ann    

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

## Coronavirus

In [49]:
txt = list(content_subset_covid["headline"])
corp = [preprocess(line) for line in txt]
dictionary = gensim.corpora.Dictionary(corp)
test_eta_lda('auto', dictionary, ntopics = 8)

train_headlines_sentences = [' '.join(text) for text in corp]
test_eta_nmf('auto', dictionary, train_headlines_sentences, ntopics = 8)

Perplexity: -7.76
Topic 0: ['virus', 'plan', 'new', 'trump', 'order', 'pandemic', 'vote', 'tell', 'rule', 'school']
Topic 1: ['stock', 'street', 'wall', 'coronavirus', 'end', 'relief', 'house', 'market', 'loan', 'pandemic']
Topic 2: ['state', 'virus', 'coronavirus', 'case', 'death', 'offer', 'site', 'rise', 'official', 'health']
Topic 3: ['virus', 'reopen', 'state', 'governor', 'push', 'arkansas', 'sale', 'rate', 'high', 'online']
Topic 4: ['coronavirus', 'virus', 'arkansas', 'trump', 'amazon', 'ship', 'firm', 'concern', 'food', 'test']
Topic 5: ['virus', 'world', 'home', 'trump', 'million', 'aid', 'littlerock', 'worker', 'jobless', 'claim']
Topic 6: ['brief', 'virus', 'face', 'nation', 'name', 'test', 'job', 'price', 'day', 'close']
Topic 7: ['aid', 'cut', 'business', 'gain', 'state', 'test', 'small', 'stock', 'rally', 'virus']
  Topic #01    Topic #02 Topic #03    Topic #04 Topic #05  Topic #06  \
0     virus  coronavirus      face        brief  pandemic      state   
1     trump    

NMF(alpha=0.0, beta_loss='frobenius', init='nndsvd', l1_ratio=0.0, max_iter=200,
    n_components=8, random_state=None, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

# Guided LDA

In [None]:
apriori_opposite = {
    'crash':0, 'vehicle':0, 'car':0, 'truck':0, 'unlicensed':0, 'pile':0, 'hit':0, 'driver':0,
    'drugs':3, 'heroin':3, 'fentanyl':3, 'street':3,
    'shooting':4, 'gun':4, 'shooter':4, 'shot':4, 'accidental':4, 'armed':4, 'gunpoint':4, 'shoot':4, 'homicide':4,
    'stabbed':5, 'stealing':5, 'murder':5, 'theft':5, 'victim':5, 'suspect':5, 'fatal':5, 'steal':5, 'stab':5,
    'fire':2, 'blaze':2, 'firefighter':2, 'house':2,
    'rape':1, 'child':1, 'abuse':1, 'pornography':1, 'teen':1, 'woman':1,
}
eta = create_eta(apriori_opposite, dictionary, 6)
test_eta_lda(eta, dictionary, 6)