# Corex
Reference code (CorEx topic model example notebook): https://github.com/gregversteeg/corex_topic/blob/master/corextopic/example/corex_topic_example.ipynb

In [1]:

import numpy as np
import scipy.sparse as ss
import matplotlib.pyplot as plt
import pandas as pd
import re

import corextopic.corextopic as ct
import corextopic.vis_topic as vt # jupyter notebooks will complain matplotlib is being loaded twice

from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

%matplotlib inline

## Import data

In [2]:
# Load the dataset from S3 (pandas needs the s3fs package to read s3:// paths)
model_data = pd.read_csv('s3://book-data-ucb-capstone-s2022/tokenized_data.csv')
print('Tokenized Text DF Size:', model_data.shape[0])

Tokenized Text DF Size: 29652


In [3]:
#preprocessing
# Cache for the NLTK English stopword set so the corpus is loaded only once
_ENGLISH_STOPWORDS = None


def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, loading them on first use."""
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
  return _ENGLISH_STOPWORDS


def preprocess_text(text):
  """Turn a raw description string into a list of cleaned tokens.

  Steps: lowercase, delete every character that is not an ASCII letter or a
  space, tokenize with NLTK's ``word_tokenize``, then drop NLTK English
  stopwords.

  Args:
    text: the raw string to clean.

  Returns:
    A list of token strings (not re-joined into a single string).
  """
  lowered = text.lower()

  # NOTE(review): this also deletes newlines/tabs, so words separated only by a
  # line break are fused together - confirm the descriptions are single-line.
  letters_only = re.sub(r'[^a-zA-Z ]+', '', lowered)

  tokens = word_tokenize(letters_only)

  # Look the stopwords up once per call (set membership) instead of rebuilding
  # the stopword list for every token.
  stop_words = _english_stopwords()
  return [word for word in tokens if word not in stop_words]

In [4]:
%%time
# Tokenize every description; the function is passed directly, no lambda wrapper needed
model_data['tokens'] = model_data['description'].apply(preprocess_text)

CPU times: user 5min 16s, sys: 22.8 s, total: 5min 39s
Wall time: 5min 39s


In [5]:
# Flatten each token list into one comma-separated string for the vectorizer
model_data['liststring'] = [
    ','.join(str(token) for token in tokens)
    for tokens in model_data['tokens']
]

## Create Synsets

In [6]:
def create_synsets(event):
  """Collect the lemma names of every WordNet synset found for `event`.

  Args:
    event: the word (or underscore-joined phrase) to look up in WordNet.

  Returns:
    A flat list of lemma names, in synset order. Names are returned as WordNet
    gives them, and the same name may appear more than once when it belongs
    to several synsets.
  """
  return [
      lemma.name()
      for sense in wordnet.synsets(event)
      for lemma in sense.lemmas()
  ]

In [9]:
# Build the library of life events and the anchor words ("synsets") for each one.
# `life_events` and `synsets` are parallel lists: synsets[k] belongs to life_events[k].

life_events = ['university', 'relationships', 'break ups', 'divorce', 'wedding',
               'death', 'family', 'friendship']

# WordNet-derived word lists (only for events where decent synsets exist), plus
# a few hand-picked additions. Underscores in the lemma names are replaced by spaces.
relationship_list = [
    word.replace("_", " ")
    for word in create_synsets('go_steady') + ['relationship', 'kinship', 'romance', 'dating']
]
# marriage_list is built but currently not used as an anchor list below
marriage_list = [word.replace("_", " ") for word in create_synsets('marriage')]
wedding_list = [
    word.replace("_", " ")
    for word in create_synsets('wedding') + ['matrimony']
]

# Drop lemmas that are unwanted for these events (list.remove deletes the first match)
for unwanted in ('tie', 'marriage'):
    wedding_list.remove(unwanted)
relationship_list.remove('see')

# Hand-written anchor lists for the remaining events
university_list = ['college', 'university', 'campus', 'academia', 'professor',
                   'colleges', 'universities', 'professors']
breakup_list = ['breakup', 'break up', 'split up', 'broken up', 'dumped', 'breaks up',
                'splits up', 'dumps', 'dump', 'breaks off', 'break off']
divorce_list = ['divorce', 'divorced', 'divorces']
death_list = ['death', 'decease', 'deceased', 'dying']
family_list = ['family', 'mother', 'father', 'brother', 'sister', 'mom', 'dad']
friendship_list = ['friends', 'friend', 'friendship', 'friendships']

# Same order as life_events
synsets = [
    university_list,
    relationship_list,
    breakup_list,
    divorce_list,
    wedding_list,
    death_list,
    family_list,
    friendship_list,
]

# One row per life event, with its anchor words in the 'synsets' column
df_lib = pd.DataFrame(life_events, columns=['life_event']).assign(synsets=synsets)

df_lib

Unnamed: 0,life_event,synsets
0,university,"[college, university, campus, academia, profes..."
1,relationships,"[go steady, go out, date, relationship, kinshi..."
2,break ups,"[breakup, break up, split up, broken up, dumpe..."
3,divorce,"[divorce, divorced, divorces]"
4,wedding,"[wedding, wedding ceremony, nuptials, hymeneal..."
5,death,"[death, decease, deceased, dying]"
6,family,"[family, mother, father, brother, sister, mom,..."
7,friendship,"[friends, friend, friendship, friendships]"


## Corex Topic

In [10]:
# Binary bag-of-words over unigrams and bigrams, capped at 20,000 features
vectorizer = CountVectorizer(
    stop_words='english',
    max_features=20000,
    binary=True,
    ngram_range=(1, 2),
)

# Vectorize the comma-joined token strings and hold the result as a scipy CSR matrix
doc_word = ss.csr_matrix(vectorizer.fit_transform(model_data['liststring']))

doc_word.shape  # n_docs x m_words

(29652, 20000)

In [11]:
# Get words that label the columns (needed to extract readable topics and make anchoring easier).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so use
# get_feature_names_out() when the installed version has it and fall back otherwise.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
words = list(np.asarray(feature_names))



In [12]:
# Anchor words: one list per life event, in the same order as `life_events`
# (the university words anchor the first topic, the relationship words the second, and so on).
anchor_words = synsets

# One hidden topic per anchor list (8 here), so the topic count cannot drift from the anchors
anchored_topic_model = ct.Corex(n_hidden=len(anchor_words), seed=2)
anchored_topic_model.fit(
    doc_word,
    words=words,
    anchors=anchor_words,
    anchor_strength=6,
);



In [13]:
# Print the top words of each anchored topic, one line per topic index
for topic_idx, _anchor_list in enumerate(anchor_words):
    topic_entries = anchored_topic_model.get_topics(topic=topic_idx)
    # each entry is unpacked as a 3-tuple; only the words are printed
    topic_words, _second, _third = zip(*topic_entries)
    print('{}: '.format(topic_idx) + ', '.join(topic_words))

0: college, university, professor, murder, killer, campus, detective, case, police, crime
1: romance, relationship, new york, york, date, york times, dating, times bestselling, times, bestselling
2: novel, story, characters, fiction, stories, readers, american, literary, classic, written
3: shes, hes, doesnt, school, divorce, like, going, isnt, things, theres
4: marry, wedding, love, life, woman, heart, past, wed, man, years
5: death, dying, ancient, evil, power, world, battle, save, deceased, war
6: family, father, mother, brother, sister, dad, mom, older, younger, mothers
7: friends, friend, friendship, best friend, friendships, best, best friends, new friends, old friend, family friends


In [14]:
# Shape of the fitted model's label array (recorded output: one row per document, one column per topic)
print(anchored_topic_model.labels.shape)

(29652, 8)


In [16]:
# First 10 rows of the label array (boolean per-topic flags in the recorded output)
print(anchored_topic_model.labels[:10])

[[False False False False  True  True False False]
 [False False False  True False  True False  True]
 [False False  True False False False False False]
 [False  True  True False False False False False]
 [False False False False False False False False]
 [False False  True False  True  True  True False]
 [False False  True False False False False False]
 [False False  True False False  True False False]
 [False False  True False False  True False False]
 [False False False False False False False False]]


In [17]:
# First 10 rows of p_y_given_x - presumably the per-document probability of each topic (confirm against the CorEx docs)
print(anchored_topic_model.p_y_given_x[:10])

[[1.30952189e-04 2.00749196e-05 8.20213684e-06 4.24435024e-02
  8.62059347e-01 9.99999000e-01 1.48496407e-06 1.98047821e-05]
 [3.08545721e-03 4.42700711e-04 1.09199225e-06 7.81038022e-01
  2.08020471e-03 8.01490132e-01 1.19204393e-06 9.99999000e-01]
 [1.82770177e-05 2.09823777e-04 9.99999000e-01 1.73319490e-06
  6.87926195e-02 1.00000000e-06 1.46244360e-05 7.15620585e-06]
 [2.59722232e-06 9.99999000e-01 9.99999000e-01 1.00000000e-06
  3.12337559e-04 1.38263755e-06 1.27813624e-05 6.97182965e-06]
 [2.59763771e-06 1.84193587e-05 1.00000000e-06 6.43320361e-04
  5.51390417e-05 1.00000000e-06 2.90482642e-06 7.10561609e-06]
 [8.42233936e-06 6.41905813e-03 9.99999000e-01 1.25675410e-06
  9.98863277e-01 9.99999000e-01 9.99999000e-01 6.96088897e-06]
 [1.70090214e-04 7.73729462e-05 9.99999000e-01 1.00000000e-06
  1.68550454e-06 4.07162672e-04 6.79327148e-05 1.00000000e-06]
 [2.64699858e-06 1.94959923e-05 9.99999000e-01 1.32745474e-05
  8.92052557e-06 9.89343319e-01 2.90118633e-06 7.07703870e-06]


In [19]:
# Wrap the model's p_y_given_x array in a DataFrame (columns are named in the next cell)
df_results = pd.DataFrame(data=anchored_topic_model.p_y_given_x)

In [20]:
# Name the topic columns after the life events. Reuse `life_events` (the list the
# anchors were ordered by) instead of repeating the literal, so the labels cannot
# drift out of sync with the topic order.
df_results.columns = list(life_events)