In [1]:
pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: gensim
Successfully installed gensim-4.3.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install altair

Collecting altair
  Downloading altair-5.5.0-py3-none-any.whl (731 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.2/731.2 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: altair
Successfully installed altair-5.5.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')


from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.word2vec import Word2Vec
import altair as alt

from sklearn import decomposition

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


read in data

In [8]:
# dataset of articles in the MIND dataset
news = pd.read_table('news_cleaned.tsv')

text preprocessing

In [11]:
# remove puncutation from text
def remove_punct(x):

    '''
    This function removes puncutation from text.

    x: text to be processed
    '''

    import re
    return ' '.join([ re.sub('[^A-Za-z0-9]+', '', word).strip()  for word in x.split()])

# remove numbers from text
def removeNumbers(s):

    '''
    This function removes numbers text.

    s: text to be processed
    '''

    result = ''
    
    for i in s:
        if(not i.isdigit()):
            result = result + i
            
    return(result.strip())

# returns list of stopwords
def getStopWords(path):

    '''
    This function returns a dataframe of stop words from the specified path.

    path: path to the table of stop words
    '''

    stop_words = pd.read_csv(path)

    stop = stop_words.groupby(['stop_words'])['stop'].agg(stop = 'min').reset_index()

    stop.stop = stop.stop.astype(int)

    stop = stop[stop.stop == 1]

    stop.stop_words = [w.strip() for w in stop.stop_words]

    return stop

# remove stop words from text
def tokenize(x, stop_words):

    '''
    This function removes stop words from text.

    x: text to be processed
    stop_words: dataframe of stopwords
    '''

    try:
        tokens = x.split()
        tokens = [item for item in tokens if item not in stop_words.stop_words.tolist() and len(item) > 2]
        return (" ".join(tokens))
    
    except:
        return ''


# lemmatize text
def lemma(phrase, onlyNouns = False):
    
    '''
    This function lemmatizes text.

    phrase: text to be lemmatized
    onlyNouns: boolean. Indicates if you want to include only nouns
    '''

    from nltk.stem import WordNetLemmatizer 
    from nltk.tokenize import word_tokenize
    from nltk import pos_tag

    wordnet = WordNetLemmatizer()

    tokens = []
    for token,tag in pos_tag(word_tokenize(phrase)):
        pos=tag[0].lower()
        try:
          if onlyNouns:
            if pos == 'n':
              tokens = tokens + [wordnet.lemmatize(token,pos)]
          else:
            tokens = tokens + [wordnet.lemmatize(token,pos)]
            
        except:
          tokens = tokens + [token]
    
    return " ".join(tokens)

# this function will convert text to lower case, remove puncuation, remove numbers, remove stop words, and lemmatize text
# can set parameter to False if not wanting to do that specific task. For example, if you do not want to lematize the text, set lemmatizeTxt to False
def cleanText(txt, 
              removePunct = True,
              removeNums = True,
              tokenizeTxt = True,
              stop_words = None,
              lemmatizeTxt = True,
              returnList = False,
              onlyNouns = False
              ):
              
              '''
              removePunct: boolean. Indicates if you want to remove punctuation
              removeNums: boolean. Indicates if you want to remove numbers
              tokenizeTxt: boolean. Indicates if you want to remove stop words
              stop_words: list. List of stopwords
              lemmatizeTxt: boolean. Indicates if you want to lemmatize the text
              returnList: boolean. Indicates if you want the output to be a list
              onlyNouns: boolean. Indicates if you want to include only nouns
              '''
              
              if txt is None:
                return None
              else:

                cleanedText = txt.lower()
      
                if removePunct:
                  cleanedText = remove_punct(cleanedText)

                if removeNums:
                  cleanedText = removeNumbers(cleanedText)

                if tokenizeTxt:
                  cleanedText = tokenize(cleanedText, stop_words)

                if lemmatizeTxt:
                  cleanedText = lemma(cleanedText, onlyNouns)
                
                if tokenizeTxt:
                  cleanedText = tokenize(cleanedText, stop_words)

                if returnList:
                  return list(cleanedText.split())
                  
                else:
                  return cleanedText

In [14]:
# table of stopwords
stop_words = getStopWords('stop_words.csv')

# clean text
news['title_cleaned'] = news.title.apply(cleanText 
                                         ,removePunct = True
                                         , removeNums = True
                                         , tokenizeTxt = True
                                         , stop_words = stop_words
                                         , lemmatizeTxt = True
                                         , returnList = False
                                         , onlyNouns = False)

In [17]:
news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities,set,num_of_clicks,title_cleaned
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[],train,,brand queen elizabeth prince charles prince ph...
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...",train,,worst habit belly fat
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId...",train,,cost trump aid freeze trench ukraine war
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ...",train,,nba affect mental
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...",train,269.0,rid skin tag accord dermatologist


In [20]:
# create TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(
                        max_features=10000,  # only top 10k by freq
                        lowercase=False,  # keep capitalization
                        ngram_range=(1, 1),  # include only 1-word phrases
                        min_df=10,  # note: absolute count of doc
                        max_df=0.95,  # note: % of docs
                        stop_words="english",
                    )

# train set
train = news[news['set'] == 'train']

# validation set
val = news[news['set'] == 'validation']

# fit tdif vectorizer on train set
tfidf_vectorizer.fit(train.title_cleaned)

# use the fitted tdif vectorizer to tansform the whole datset
news_tdif = tfidf_vectorizer.transform(news.title_cleaned)

# use the fitted tdif vectorizer to tansform the train datset
news_tdif_train = tfidf_vectorizer.transform(train.title_cleaned)

# use the fitted tdif vectorizer to tansform the validation datset
news_tdif_val = tfidf_vectorizer.transform(val.title_cleaned)

feature_names = tfidf_vectorizer.get_feature_names_out()

print(news_tdif.shape)

(65238, 3778)


fit a word to vec model

In [23]:
w2c = Word2Vec(train.title_cleaned.apply(lambda x: x.split()), max_vocab_size=10000, min_count=1, seed=2)

def get_w2v_vec(word):
    
    '''
    This function returns the numerical vector representation of a word

    word: string. word to be processed
    '''

    try:
        return w2c.wv[word]

    except:
        pass

calculate median topic coherence for different number of topics

In [26]:
def get_coherence(topic):

    '''
    This get the topic cohereance

    Parameters
    ----------
    topic : list
        list of top words in the topic

    Returns
    -------
    average pairwise cosine similarity

    '''
    
    t1_vec = np.array([get_w2v_vec(w) for w in topic])
    
    if len(t1_vec) == 0:
        return 0

    t1_pairwise_sim = np.array([ [cosine_similarity(x.reshape(1,-1), vec.reshape(1,-1))[0][0] for x in t1_vec] 
                                    for vec in t1_vec ])

    np.fill_diagonal(t1_pairwise_sim, 0)

    return np.mean(t1_pairwise_sim)


def coherence_nmf():

   '''
    This function get the median topic coherance

    Parameters
    ----------

    Returns
    -------
    median topic coherance

    '''
    
    median_coherence_scores = []
    for n_top in range(2,11):

        # train NMF model with n_top topics
        nmf = decomposition.NMF(n_components=n_top, random_state=42, init="nndsvd")
        W = nmf.fit_transform(news_tdif_train)
        H = nmf.components_

        # get top 10 words for each topic
        topics = np.argsort(H, axis=1)[:,:-11:-1]
        topics_topwords = [ list(feature_names[idxs]) for idxs in topics]
            
        # compute topic coherence, on top 10 words, for each topic
        coherence_scores = [get_coherence(t) for t in topics_topwords]

        # compute median coherence over all topics
        median_coherence_score = np.median(coherence_scores)
        median_coherence_scores = median_coherence_scores + [median_coherence_score]
        
    return median_coherence_scores
    

nmf_coherance_scores = coherence_nmf()

In [29]:
plotDf = pd.DataFrame({
                        'num_topics':range(2,11)
                        ,'median_topic_coherence':nmf_coherance_scores
                        })

chart = alt.Chart(plotDf)\
            .mark_bar()\
            .encode(
                    x=alt.X('num_topics:O', title = 'Number of Topics'),
                    y=alt.Y('median_topic_coherence:Q', title = 'Median Topic Coherence')
                )\
            .properties(
                width=400,
                height=300,
                title={'text':'Median Topic Coherence'
                       ,'subtitle':'NMF Model fit on Training Set'
                    }
            )\
            .configure_axis(
                            grid=False
                            )         

chart    

In [32]:
plotDf

Unnamed: 0,num_topics,median_topic_coherence
0,2,0.739063
1,3,0.865267
2,4,0.836722
3,5,0.808176
4,6,0.836722
5,7,0.847612
6,8,0.846516
7,9,0.847612
8,10,0.8567


fit NMF model

In [35]:
nmf = decomposition.NMF(n_components=3, random_state=42, init="nndsvd")
W = nmf.fit_transform(news_tdif)
H = nmf.components_

get the top words for each topic

In [38]:

topWordsPerTopic = pd.DataFrame()
for topic_idx, topic in enumerate(H):
    term_list = {
            feature_names[i]:topic[i] for i in topic.argsort()[: -8 - 1 : -1]
    }

    tempDf = pd.DataFrame({'word':list(term_list.keys())
                            ,'weight':list(term_list.values())
                            }
                            )

    tempDf['topic'] = topic_idx + 1
    
    topWordsPerTopic = pd.concat([topWordsPerTopic, tempDf])



In [41]:
topWordsPerTopic.head(10)

Unnamed: 0,word,weight,topic
0,police,2.304763,1
1,crash,2.030973,1
2,kill,1.631405,1
3,shoot,1.010518,1
4,car,0.946543,1
5,county,0.921998,1
6,home,0.911496,1
7,yearold,0.868204,1
0,trump,2.943641,2
1,impeachment,1.640568,2


In [44]:
def plotTopWordsTopicNmf(topic_idx):

  '''
    This function gets the top words for a topic

    Parameters
    ----------
    topic_idx: topic of interest

    Returns
    -------
    altair chart

    '''
    
    chart = chart = alt.Chart(topWordsPerTopic[topWordsPerTopic['topic'] == topic_idx])\
                .mark_bar()\
                .encode(
                        x=alt.X('weight:Q', title = 'Topic Weight'),
                        y=alt.Y('word:N', title = None, sort = '-x')
                    )\
                .properties(
                    width=100,
                    height=100,
                    title={'text':'Topic {}'.format(topic_idx)
                            }
                )    

    return chart

def plotNmfTopics():

   '''
    This function gets the top words for each topic

    Parameters
    ----------
    
    Returns
    -------
    list of altair charts

    '''
    
    charts = []
    for top in np.unique(topWordsPerTopic['topic']):
        charts.append(plotTopWordsTopicNmf(top))

    return charts



result = plotNmfTopics()

chart = alt.hconcat(*result)\
            .configure_axis(
                            grid=False
                            )\
            .properties(
                    title={'text':'Non-negative Matrix Factorization (NMF)'
                            ,'subtitle': 'Top Words for Each Topic'
                            ,'anchor':'middle'
                            , 'offset': 15}
                )    

chart                           

get the top articles for each topic

In [55]:
topDocsPerTopic = pd.DataFrame(W, columns=['Topic 1', 'Topic 2', 'Topic 3'])
topDocsPerTopic['Document'] = news['title']
topDocsPerTopic.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Document
0,0.0016,0.000481,0.001002,"The Brands Queen Elizabeth, Prince Charles, an..."
1,0.000441,3.9e-05,0.001096,50 Worst Habits For Belly Fat
2,0.0,0.07227,6.4e-05,The Cost of Trump's Aid Freeze in the Trenches...
3,0.000274,0.0002,0.003867,I Was An NBA Wife. Here's How It Affected My M...
4,0.000829,0.000289,0.001253,"How to Get Rid of Skin Tags, According to a De..."


In [109]:
def plotTopDocsNmf(topic_idx, top_n):

   '''
    This function plots the top documents for a given topic

    Parameters
    ----------
    topic_idx : str
        The name of the topic for which to plot the top documents
    top_n : int
        The number of top documents to plot

    Returns
    -------
    altair chart

    '''
    
    plotDf = topDocsPerTopic.sort_values(by = topic_idx, ascending = False).head(top_n)

    chart = chart = alt.Chart(plotDf)\
                .mark_bar()\
                .encode(
                        x=alt.X('{}:Q'.format(topic_idx), title = 'Document Weight'),
                        y=alt.Y('Document:N', title=None, sort = '-x', axis = alt.Axis(labelLimit=0))
                    )\
                .properties(
                    width=200,
                    height=100,
                    title={'text': topic_idx
                            ,'subtitle':'Top Documents'}
                )    

    return chart

def plotNmfDoc(columns, top_n):

    '''
    This function gets the top documents for a set of topics

    Parameters
    ----------

    Returns
    -------
    list of altair charts

    '''
    
    charts = []
    for top in columns:
        charts.append(plotTopDocsNmf(top, top_n))

    return charts


result = plotNmfDoc([c for c in topDocsPerTopic.columns if c != 'Document'], 8)

chart = alt.vconcat(*result)\
            .configure_axis(
                            grid=False
                            )\
            .properties(
                    title={'text':'Non-negative Matrix Factorization (NMF)'
                            ,'subtitle': 'Top Articles for Each Topic'
                            ,'anchor':'middle'
                            , 'offset': 15}
                )   

chart                           

sensitivity analysis - alpha_W

- Constant that multiplies the regularization terms of W. Set it to zero (default) to have no regularization on W

In [64]:

def sensitivity_nmf(rangeList):

   '''
    This function tests the LNMFDA model's sensitivity to the parameter alpha_W

    Parameters
    ----------
    rangelist : list of values to test

    Returns
    -------
    dataframe of results

    '''

    # this dataframe will hold the results
    trackDf = pd.DataFrame({
                            'param_value':[]
                            ,'median_topic_coherence_score':[]
                            })

    # test each value of parameter
    for x in rangeList:

        # train NMF model with a fixed number of topics. Set alpha_W = x
        nmf = decomposition.NMF(n_components=3, init="nndsvd", alpha_W = x)
        W = nmf.fit_transform(news_tdif_train)
        H = nmf.components_

        # get top 10 words for each topic
        topics = np.argsort(H, axis=1)[:,:-11:-1]
        topics_topwords = [ list(feature_names[idxs]) for idxs in topics]

        # compute topic coherence, on top 10 words, for each topic
        coherence_scores = [get_coherence(t) for t in topics_topwords]

        # compute median coherence over all topics
        median_coherence_score = np.median(coherence_scores)
            
        tempDf = pd.DataFrame({
                            'param_value':[x]
                            ,'median_topic_coherence_score':[median_coherence_score]
                            })

        trackDf = pd.concat([trackDf, tempDf])

    return trackDf


In [67]:
default = 0
paramList = [default, 0.05, 0.1, 0.5, 1, 5, 10, 15,20,25]
trackDf = sensitivity_nmf(paramList)
trackDf['param_value'] = trackDf['param_value'].apply(lambda x: 'Default (0)' if x == 0 else x)

  trackDf = pd.concat([trackDf, tempDf])


In [70]:
trackDf

Unnamed: 0,param_value,median_topic_coherence_score
0,Default (0),0.866241
0,0.05,0.722586
0,0.1,0.722586
0,0.5,0.722586
0,1.0,0.722586
0,5.0,0.722586
0,10.0,0.722586
0,15.0,0.722586
0,20.0,0.722586
0,25.0,0.722586


In [73]:
# this chart marks the median topic coherance score
chart = alt.Chart(trackDf)\
                  .mark_bar(color="blue",size=30)\
                    .encode(
                            y = alt.Y('median_topic_coherence_score', title = 'Median Topic Coherence Score'),
                            x = alt.X("param_value:O"
                                        , title='Parameter Value'
                                      )
                    )

# format the chart
nmf_sensitivity = chart\
                    .properties(
                                width=400,
                                height=300,
                                title={'text':'NMF: alpha_W Sensitivity'
                                       ,'subtitle': ['The parameter alpha_W is a constant'
                                                     ,'that multiplies the regularization terms of W.']
                                    }
                                )\
                                .configure_axis(
                                        grid=False
                                )

nmf_sensitivity

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=c8be285d-9d64-40ba-873b-a353fe8ae087' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>