In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gensim
Successfully installed gensim-4.3.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install altair

Collecting altair
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.2/731.2 kB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: altair
Successfully installed altair-5.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.word2vec import Word2Vec
import altair as alt
from scipy import stats


from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.test.utils import common_texts
from gensim.models.tfidfmodel import TfidfModel

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


read in data

In [10]:
# dataset of articles in the MIND dataset
news = pd.read_table('news_cleaned.tsv')

text preprocessing

In [34]:
# remove puncutation from text
def remove_punct(x):

    '''
    This function removes puncutation from text.

    x: text to be processed
    '''

    import re
    return ' '.join([ re.sub('[^A-Za-z0-9]+', '', word).strip()  for word in x.split()])

# remove numbers from text
def removeNumbers(s):
    
    '''
    This function removes numbers text.

    s: text to be processed
    '''
    
    result = ''
    
    for i in s:
        if(not i.isdigit()):
            result = result + i
            
    return(result.strip())

# returns list of stopwords
def getStopWords(path):

    '''
    This function returns a dataframe of stop words from the specified path.

    path: path to the table of stop words
    '''
    
    stop_words = pd.read_csv(path)

    stop = stop_words.groupby(['stop_words'])['stop'].agg(stop = 'min').reset_index()

    stop.stop = stop.stop.astype(int)

    stop = stop[stop.stop == 1]

    stop.stop_words = [w.strip() for w in stop.stop_words]

    return stop

# remove stop words from text
def tokenize(x, stop_words):

    '''
    This function removes stop words from text.

    x: text to be processed
    stop_words: dataframe of stopwords
    '''

    try:
        tokens = x.split()
        tokens = [item for item in tokens if item not in stop_words.stop_words.tolist() and len(item) > 2]
        return (" ".join(tokens))
    
    except:
        return ''


# lemmatize text
def lemma(phrase, onlyNouns = False):

    '''
    This function lemmatizes text.

    phrase: text to be lemmatized
    onlyNouns: boolean. Indicates if you want to include only nouns
    '''
        
    from nltk.stem import WordNetLemmatizer 
    from nltk.tokenize import word_tokenize
    from nltk import pos_tag

    wordnet = WordNetLemmatizer()

    tokens = []
    for token,tag in pos_tag(word_tokenize(phrase)):
        pos=tag[0].lower()
        try:
          if onlyNouns:
            if pos == 'n':
              tokens = tokens + [wordnet.lemmatize(token,pos)]
          else:
            tokens = tokens + [wordnet.lemmatize(token,pos)]
            
        except:
          tokens = tokens + [token]
    
    return " ".join(tokens)

# this function will convert text to lower case, remove puncuation, remove numbers, remove stop words, and lemmatize text
# can set parameter to False if not wanting to do that specific task. For example, if you do not want to lematize the text, set lemmatizeTxt to False
def cleanText(txt, 
              removePunct = True,
              removeNums = True,
              tokenizeTxt = True,
              stop_words = None,
              lemmatizeTxt = True,
              returnList = False,
              onlyNouns = False
              ):

              '''
              removePunct: boolean. Indicates if you want to remove punctuation
              removeNums: boolean. Indicates if you want to remove numbers
              tokenizeTxt: boolean. Indicates if you want to remove stop words
              stop_words: list. List of stopwords
              lemmatizeTxt: boolean. Indicates if you want to lemmatize the text
              returnList: boolean. Indicates if you want the output to be a list
              onlyNouns: boolean. Indicates if you want to include only nouns
              '''
              
              
              if txt is None:
                return None
              else:

                cleanedText = txt.lower()
      
                if removePunct:
                  cleanedText = remove_punct(cleanedText)

                if removeNums:
                  cleanedText = removeNumbers(cleanedText)

                if tokenizeTxt:
                  cleanedText = tokenize(cleanedText, stop_words)

                if lemmatizeTxt:
                  cleanedText = lemma(cleanedText, onlyNouns)
                
                if tokenizeTxt:
                  cleanedText = tokenize(cleanedText, stop_words)

                if returnList:
                  return list(cleanedText.split())
                  
                else:
                  return cleanedText

In [37]:
# table of stopwords
stop_words = getStopWords('stop_words.csv')

# clean text
news['title_cleaned'] = news.title.apply(cleanText 
                                         ,removePunct = True
                                         , removeNums = True
                                         , tokenizeTxt = True
                                         , stop_words = stop_words
                                         , lemmatizeTxt = True
                                         , returnList = False
                                         , onlyNouns = False)

In [40]:
news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,set,num_of_clicks,title_cleaned
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],train,,brand queen elizabeth prince charles prince ph...
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",train,,worst habit belly fat
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",train,,cost trump aid freeze trench ukraine war
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",train,,nba affect mental
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",train,269.0,rid skin tag accord dermatologist


In [43]:
# train set
train = news[news['set'] == 'train']

# validation set
val = news[news['set'] == 'validation']

# create tf-idf vectorizer using train data
train_articles = train.title_cleaned.apply(lambda x: x.split())
train_dictionary = Dictionary(train_articles)
train_dictionary.filter_extremes(no_below=20, no_above=0.6)
train_corpus = [train_dictionary.doc2bow(text) for text in train_articles]
tfidf = TfidfModel(train_corpus)
train_corpus = [tfidf[c] for c in train_corpus]

# apply tf-idf tranformation on whole data set
articles = news.title_cleaned.apply(lambda x: x.split())
dictionary = Dictionary(articles)
dictionary.filter_extremes(no_below=20, no_above=0.6)
corpus = [tfidf[dictionary.doc2bow(text)] for text in articles]


fit a word to vec model

In [46]:
w2c = Word2Vec(train.title_cleaned.apply(lambda x: x.split()), max_vocab_size=10000, min_count=1, seed=2)

def get_w2v_vec(word):
    '''
    This function returns the numerical vector representation of a word

    word: string. word to be processed
    '''

    try:
        return w2c.wv[word]

    except:
        pass

calculate median topic coherence for different number of topics

In [49]:
def get_coherence(topic):
    
    '''
    This get the topic cohereance

    Parameters
    ----------
    topic : list
        list of top words in the topic

    Returns
    -------
    average pairwise cosine similarity

    '''

    t1_vec = np.array([get_w2v_vec(w) for w in topic])
    
    if len(t1_vec) == 0:
        return 0

    t1_pairwise_sim = np.array([ [cosine_similarity(x.reshape(1,-1), vec.reshape(1,-1))[0][0] for x in t1_vec] 
                                    for vec in t1_vec ])

    np.fill_diagonal(t1_pairwise_sim, 0)

    return np.mean(t1_pairwise_sim)


def coherence_lda():

    '''
    This function get the median topic coherance

    Parameters
    ----------

    Returns
    -------
    median topic coherance

    '''

    median_coherence_scores = []
    for n_top in range(2,11):

        # train LDA model with n_top topics
        lda = LdaModel(train_corpus, num_topics=n_top, random_state=0)
        topic_models = lda.get_topics()

        # get top 10 words for each topic
        topics_topwords = []
        for topic_idx, topic in enumerate(topic_models):
            term_list = [
                    train_dictionary[i] for i in topic.argsort()[: -10 - 1 : -1]
                ]

            topics_topwords.append(term_list)
            

        # compute topic coherence, on top 10 words, for each topic
        coherence_scores = [get_coherence(t) for t in topics_topwords]

        # compute median coherence over all topics
        median_coherence_score = np.median(coherence_scores)
        median_coherence_scores = median_coherence_scores + [median_coherence_score]
        
    return median_coherence_scores
    

lda_coherance_scores = coherence_lda()

In [52]:
plotDf = pd.DataFrame({
                        'num_topics':range(2,11)
                        ,'median_topic_coherence':lda_coherance_scores
                        })

chart = alt.Chart(plotDf)\
            .mark_bar()\
            .encode(
                    x=alt.X('num_topics:O', title = 'Number of Topics'),
                    y=alt.Y('median_topic_coherence:Q', title = 'Median Topic Coherence')
                )\
            .properties(
                width=400,
                height=300,
                title={'text':'Median Topic Coherence'
                       ,'subtitle':'LDA Model fit on Training Set'
                    }
            )\
            .configure_axis(
                            grid=False
                            )          


print(plotDf)
chart    


   num_topics  median_topic_coherence
0           2                0.536508
1           3                0.654710
2           4                0.673613
3           5                0.690154
4           6                0.725466
5           7                0.734283
6           8                0.746015
7           9                0.804531
8          10                0.782791


fit LDA model

In [55]:
# lda = LatentDirichletAllocation(n_components=3, random_state=0)
# lda.fit(news_tdif)
# topic_models = lda.components_

lda = LdaModel(corpus, num_topics=4, random_state=0)
topic_models = lda.get_topics()

get the top documents for each topic

In [58]:
docWeightDf = pd.DataFrame(lda.get_document_topics(corpus)
                           ,columns=['Topic {}'.format(i+1) for i in range(4)]
                           )

for c in docWeightDf.columns:
    docWeightDf[c] = docWeightDf[c].apply(lambda x: x[1])


docWeightDf['Document'] = news['title']


def plotDocTopic(topic):

    '''
    This function plots the top documents for a given topic

    Parameters
    ----------
    topic : str
        The name of the topic for which to plot the top documents

    Returns
    -------
    altair chart

    '''
    # sort by topic weight
    df = docWeightDf.sort_values(by = topic, ascending=False).head(10)
    
    # create bar chart
    chart = alt.Chart(df)\
                .mark_bar()\
                .encode(
                        x=alt.X('{}:Q'.format(topic), title = 'Topic Weight'),
                        y=alt.Y('Document:N', title = None,sort='-x', axis = alt.Axis(labelLimit=0))
                    )\
                .properties(
                    width=100,
                    height=100,
                    title={'text': topic
                            }
                )    

    return chart

def plotLdaDocs():


    '''
    This function gets the top documents for a set of topics

    Parameters
    ----------

    Returns
    -------
    list of altair charts

    '''

    charts = []
    topics = [c for c in docWeightDf.columns if c != 'Document']
    for top in np.unique(topics):
        charts.append(plotDocTopic(top))

    return charts


result = plotLdaDocs()

chart = alt.vconcat(*result)\
            .configure_axis(
                            grid=False
                            )\
            .properties(
                    title={'text':'Latent Dirichlet Allocation (LDA)'
                            ,'subtitle': 'Top Articles for Each Topic'
                            ,'anchor':'middle'
                            , 'offset': 15}
                )   

chart

get the top words for each topic

In [64]:
topWordsPerTopic = pd.DataFrame()
for topic_idx, topic in enumerate(topic_models):
    term_list = {
            dictionary[i]:topic[i] for i in topic.argsort()[: -8 - 1 : -1]
    }

    tempDf = pd.DataFrame({'word':list(term_list.keys())
                            ,'weight':list(term_list.values())
                            }
                            )

    tempDf['topic'] = topic_idx + 1
    
    topWordsPerTopic = pd.concat([topWordsPerTopic, tempDf])

In [67]:
topWordsPerTopic.head()

Unnamed: 0,word,weight,topic
0,crash,0.02067,1
1,school,0.019252,1
2,kill,0.017366,1
3,police,0.016484,1
4,county,0.016444,1


In [70]:
def plotTopic(topic_idx):
   '''
    This function gets the top words for a topic

    Parameters
    ----------
    topic_idx: topic of interest

    Returns
    -------
    altair chart

    '''

    chart = chart = alt.Chart(topWordsPerTopic[topWordsPerTopic['topic'] == topic_idx])\
                .mark_bar()\
                .encode(
                        x=alt.X('weight:Q', title = 'Topic Weight'),
                        y=alt.Y('word:N', title = None, sort = '-x')
                    )\
                .properties(
                    width=100,
                    height=100,
                    title={'text':'Topic {}'.format(topic_idx)
                            }
                )    

    return chart

def plotLdaTopics():
   '''
    This function gets the top words for each topic

    Parameters
    ----------
    
    Returns
    -------
    list of altair charts

    '''
    charts = []
    for top in np.unique(topWordsPerTopic['topic']):
        charts.append(plotTopic(top))

    return charts



result = plotLdaTopics()  

chart = alt.hconcat(*result[0:10])\
            .configure_axis(
                            grid=False
                            )\
            .properties(
                    title={'text':'Latent Dirichlet Allocation (LDA)'
                            ,'subtitle': 'Top Words for Each Topic'
                            ,'anchor':'middle'
                            , 'offset': 15}
                )    

chart                            

sensitivity analysis - doc_topic_prior (theta)

In [73]:
# def doc_top_prior_sensitivity_lda(rangeList
#                                   ,n_trials = 5):


#     # this dataframe will hold the results
#     trackDf = pd.DataFrame({
#                             'param_value':[]
#                             ,'trial':[]
#                             ,'median_topic_coherence_score':[]
#                             })

#     # test each value of parameter
#     for x in rangeList:

#         # num of trials
#         for y in range(1,n_trials + 1):
            
#             print("param value: {}, trial: {}".format(x,y))

#             # train LDA model with fixed number of topics. Set doc_topic_prior = x
#             lda = LatentDirichletAllocation(n_components=7
#                                             ,doc_topic_prior=x)

#             lda.fit(news_tdif_train)
#             topic_models = lda.components_

#             # get top 10 words for each topic
#             topics_topwords = []
#             for topic_idx, topic in enumerate(topic_models):
#                 term_list = [
#                         feature_names[i] for i in topic.argsort()[: -10 - 1 : -1]
#                     ]

#                 topics_topwords.append(term_list)
            

#             # compute topic coherence, on top 10 words, for each topic
#             coherence_scores = [get_coherence(t) for t in topics_topwords]

#             # compute median coherence over all topics
#             median_coherence_score = np.median(coherence_scores)
            
#             tempDf = pd.DataFrame({
#                             'param_value':[x]
#                             ,'trial':[y]
#                             ,'median_topic_coherence_score':[median_coherence_score]
#                             })

#             trackDf = pd.concat([trackDf, tempDf])

#     return trackDf


In [76]:
def doc_top_prior_sensitivity_lda(rangeList
                                  ,n_trials = 5):

   '''
    This function tests the LDA model's sensitivity to the parameter alpha

    Parameters
    ----------
    rangelist : list of values to test
    n_trials : number of trials

    Returns
    -------
    dataframe of results

    '''

    # this dataframe will hold the results
    trackDf = pd.DataFrame({
                            'param_value':[]
                            ,'trial':[]
                            ,'median_topic_coherence_score':[]
                            })

    # test each value of parameter
    for x in rangeList:

        # num of trials
        for y in range(1,n_trials + 1):
            
            print("param value: {}, trial: {}".format(x,y))

            # train LDA model with fixed number of topics. Set doc_topic_prior = x
            lda = LdaModel(train_corpus, num_topics=4, alpha=x)
            topic_models = lda.get_topics()

            # get top 10 words for each topic
            topics_topwords = []
            for topic_idx, topic in enumerate(topic_models):
                term_list = [
                        train_dictionary[i] for i in topic.argsort()[: -10 - 1 : -1]
                    ]

                topics_topwords.append(term_list)
            

            # compute topic coherence, on top 10 words, for each topic
            coherence_scores = [get_coherence(t) for t in topics_topwords]

            # compute median coherence over all topics
            median_coherence_score = np.median(coherence_scores)
            
            tempDf = pd.DataFrame({
                            'param_value':[x]
                            ,'trial':[y]
                            ,'median_topic_coherence_score':[median_coherence_score]
                            })

            trackDf = pd.concat([trackDf, tempDf])

    return trackDf


In [79]:
default = 1/4
paramList = [default, 0.05, 0.1, 0.5, 1, 5, 10]
trackDf = doc_top_prior_sensitivity_lda(paramList, n_trials = 50)

# write results
trackDf.to_csv('lda_sensitivity_gensim.tsv', index=False, sep='\t')

param value: 0.25, trial: 1
  trackDf = pd.concat([trackDf, tempDf])
param value: 0.25, trial: 2
param value: 0.25, trial: 3
param value: 0.25, trial: 4
param value: 0.25, trial: 5
param value: 0.25, trial: 6
param value: 0.25, trial: 7
param value: 0.25, trial: 8
param value: 0.25, trial: 9
param value: 0.25, trial: 10
param value: 0.25, trial: 11
param value: 0.25, trial: 12
param value: 0.25, trial: 13
param value: 0.25, trial: 14
param value: 0.25, trial: 15
param value: 0.25, trial: 16
param value: 0.25, trial: 17
param value: 0.25, trial: 18
param value: 0.25, trial: 19
param value: 0.25, trial: 20
param value: 0.25, trial: 21
param value: 0.25, trial: 22
param value: 0.25, trial: 23
param value: 0.25, trial: 24
param value: 0.25, trial: 25
param value: 0.25, trial: 26
param value: 0.25, trial: 27
param value: 0.25, trial: 28
param value: 0.25, trial: 29
param value: 0.25, trial: 30
param value: 0.25, trial: 31
param value: 0.25, trial: 32
param value: 0.25, trial: 33
param value

In [13]:
trackDf = pd.read_table('lda_sensitivity_gensim.tsv')

In [16]:
trackDf.head()

Unnamed: 0,param_value,trial,median_topic_coherence_score
0,0.25,1.0,0.70786
1,0.25,2.0,0.717357
2,0.25,3.0,0.684188
3,0.25,4.0,0.712702
4,0.25,5.0,0.722409


In [19]:
def getConfInt(df):
    ''' this function calculates 
        the confidence interval for the 
        average of the median topic coherance score
        
        returns: confidence interval and mean
        '''

    dof = len(df) - 1
    m = df['median_topic_coherence_score'].mean()
    se = stats.sem(df['median_topic_coherence_score'])

    ci = stats.t.interval(0.95
                        ,df=dof
                        , loc=m
                        , scale=se
                        )

    return ci,m


confIntDf = pd.DataFrame(
                trackDf\
                .groupby('param_value')\
                .apply(getConfInt)
                )\
                .reset_index()\
                .rename(columns={0:'result'})

confIntDf['mean'] = confIntDf['result'].apply(lambda x: x[1])

confIntDf['lower'] = confIntDf['result'].apply(lambda x: x[0][0])
confIntDf['upper'] = confIntDf['result'].apply(lambda x: x[0][1])

confIntDf.drop(columns=['result'],inplace=True)

confIntDf['param_value'] = confIntDf['param_value'].apply(lambda x: 'Default (1/num_topics)' if round(x,5) == round(default,5) else x)

In [22]:
confIntDf.head()

Unnamed: 0,param_value,mean,lower,upper
0,0.05,0.687859,0.676785,0.698934
1,0.1,0.705863,0.693686,0.71804
2,Default (1/num_topics),0.693652,0.684753,0.70255
3,0.5,0.701149,0.691508,0.710791
4,1.0,0.644839,0.632626,0.657051


In [25]:
# thischart marks the lower and upper bound of the confidence interval
interval_chart_95 = alt.Chart(confIntDf)\
                        .mark_rule()\
                        .encode(
                                y = alt.Y('param_value:O'),
                                x = alt.X("lower", scale=alt.Scale(domain=[0.4, 0.75])),
                                x2 = "upper"
                                )

# thischart marks the average of the median topic coherance score
point_chart = alt.Chart(confIntDf)\
                  .mark_point(color="blue",size=30)\
                    .encode(
                            y = alt.Y('param_value:O', title = 'Parameter Value'),
                            x = alt.X("mean", title=['Average of the Median Topic Coherence Score'
                                                        ,'(lower95, upper95)']
                                      ,scale=alt.Scale(domain=[0.4, 0.75])),
                            )

# concat the two prior charts together
doc_top_prior_sensitivity = (point_chart + interval_chart_95)\
                                .properties(
                                    width=400,
                                    height=300,
                                    title={'text':'LDA Sensitivity: Alpha'
                                            ,'subtitle':'Number of Trials = 50, Number of Topics = 4'
                                        }
                                )\
                                .configure_axis(
                                        grid=False
                                )

doc_top_prior_sensitivity

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c8be285d-9d64-40ba-873b-a353fe8ae087' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>