In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as sts

from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer


plt.style.use('ggplot')
%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Load the articles DataFrame from a pickle file at a path relative to the
# notebook's working directory.
# NOTE(review): unpickling can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
df = pd.read_pickle('data/articles.pkl')

In [4]:
# Preview the first two rows to see which columns are available.
df.head(2)

Unnamed: 0,document_type,web_url,lead_paragraph,abstract,snippet,news_desk,word_count,source,section_name,subsection_name,_id,pub_date,print_page,headline,content
0,article,http://www.nytimes.com/2013/10/03/sports/footb...,You would think that in a symmetric zero-sum s...,,You would think that in a symmetric zero-sum s...,Sports,347,The New York Times,Sports,Pro Football,524d4e3a38f0d8198974001f,2013-10-03T00:00:00Z,,Week 5 Probabilities: Why Offense Is More Impo...,the original goal building model football fore...
1,article,http://www.nytimes.com/2013/10/03/us/new-immig...,House Democrats on Wednesday unveiled an immig...,House Democrats unveil immigration bill that p...,House Democrats on Wednesday unveiled an immig...,National,83,The New York Times,U.S.,,524cf71338f0d8198973ff7b,2013-10-03T00:00:00Z,21.0,New Immigration Bill Put Forward,house unveiled immigration bill provides path ...


In [32]:
# Pull the 'content' column out as a plain array, one entry per row of df.
corpus = df['content'].values

In [34]:
# Number of documents in the corpus.
corpus.shape

(1405,)

In [45]:
# Scratch arithmetic: 0.03 scaled by the natural log of 100/50.
# NOTE(review): presumably a hand-worked tf * log(N / doc_freq) weight for a
# term found in 50 of 100 documents -- confirm intent.
0.03 * np.log(100/50)

0.020794415416798356

In [43]:
# Scratch arithmetic: 0.03 scaled by the natural log of 100/1.
# NOTE(review): presumably a hand-worked tf * log(N / doc_freq) weight for a
# term found in 1 of 100 documents -- confirm intent.
0.03 * np.log(100/1)

0.13815510557964275

In [47]:
# TF-IDF over the corpus: English stop words dropped, vocabulary capped at
# 1000 terms, and terms kept only when their document frequency lies between
# 5% and 95% of the documents.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
    min_df=0.05,
    max_df=0.95,
)

# X is the dense document-term matrix, shape (m, p)
X = vectorizer.fit_transform(corpus).toarray()


array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.05099065, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.10333484, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06162352, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [28]:
def nmf_top_words_from_topics(corpus=corpus, max_features=5000, n_topics=5, n_words=10, section=None):
    """Fit TF-IDF + NMF on ``corpus`` and print each topic's top words and section counts.

    Every document is assigned to the topic with the largest weight in its row
    of W. For each topic the function prints the ``n_words`` highest-weighted
    vocabulary terms and a ``value_counts`` of the sections of the documents
    assigned to it.

    Parameters
    ----------
    corpus : iterable of str
        Documents to vectorize. The default is the module-level ``corpus`` as
        it was bound when this definition was executed, not at call time.
    max_features : int
        Vocabulary cap passed to ``TfidfVectorizer``.
    n_topics : int
        Number of NMF components.
    n_words : int
        Number of top words printed per topic.
    section : str or None
        When truthy, documents are matched positionally against the rows of
        the global ``df`` whose ``section_name`` equals ``section`` and counts
        are reported by ``subsection_name``. Otherwise they are matched
        against all rows of ``df`` and counted by ``section_name``.

    Returns
    -------
    tuple
        ``(vectorizer, nmf)``: the fitted ``TfidfVectorizer`` and ``NMF``.

    Raises
    ------
    ValueError
        If the number of documents differs from the number of selected ``df``
        rows, since the topic labels could not be aligned with them.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english',
                                 max_df=0.95, min_df=0.15)

    # X is shape (m, p)
    X = vectorizer.fit_transform(corpus).toarray()

    # Metadata rows the documents are aligned with; selected once rather than
    # re-filtering df for every topic.
    if section:
        meta = df[df.section_name == section]
        count_col = 'subsection_name'
    else:
        meta = df
        count_col = 'section_name'

    # Labels are applied to `meta` as a positional boolean mask, so a length
    # mismatch can only fail; report it clearly before the expensive fit.
    if X.shape[0] != len(meta):
        raise ValueError(
            f'corpus has {X.shape[0]} documents but the matching rows of df '
            f'number {len(meta)}; build corpus from the same rows'
        )

    # Vocabulary terms in column order of X. get_feature_names() was removed in
    # scikit-learn 1.2; fall back to it only on releases that predate
    # get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        feat_names = vectorizer.get_feature_names_out()
    else:
        feat_names = vectorizer.get_feature_names()

    # `alpha` was removed from NMF in scikit-learn 1.2. Its replacements
    # alpha_W / alpha_H are multiplied by n_features / n_samples inside the
    # objective, so divide by those to keep the same penalty strength.
    # NOTE(review): other NMF defaults (e.g. init) also changed across
    # releases, so exact topic output may still differ between versions.
    try:
        model = NMF(n_components=n_topics, random_state=1,
                    alpha=.1, l1_ratio=.5)
    except TypeError:
        n_docs, n_feats = X.shape
        model = NMF(n_components=n_topics, random_state=1,
                    alpha_W=.1 / n_feats, alpha_H=.1 / n_docs, l1_ratio=.5)
    nmf = model.fit(X)

    # W maps each document to a topic-weight vector -> shape (m, n_topics)
    W = nmf.transform(X)

    # Hard assignment: index of the heaviest topic for each document.
    labels = W.argmax(1)

    # H matrix, mapping each topic to vocabulary weights -> shape (n_topics, p)
    topics = nmf.components_

    for i in range(len(topics)):
        # Indices of the n_words largest weights, heaviest first.
        topidxs = np.argsort(topics[i])[::-1][:n_words]
        topwords = ' '.join(feat_names[idx] for idx in topidxs)
        print(f'*****Topic: {i}*****\n**Top Words:\n{topwords}\n')

        sections = meta.loc[labels == i, count_col].value_counts()
        print(f'**Sections:\n{sections}\n\n')

    return vectorizer, nmf

In [27]:
# Four topics over the current corpus; with section=None the per-topic counts
# are broken down by section_name.
vec, nmf = nmf_top_words_from_topics(
    corpus=corpus,
    max_features=5000,
    n_topics=4,
    n_words=10,
    section=None,
)

*****Topic: 0*****
**Top Words:
said government state united official country american people president law

**Sections:
World           169
U.S.             97
Opinion          80
Business Day     70
Sports           20
Arts              6
Travel            1
Name: section_name, dtype: int64


*****Topic: 1*****
**Top Words:
game season team said play run sunday second year time

**Sections:
Sports          296
Arts              6
U.S.              3
Business Day      3
Opinion           3
World             1
Name: section_name, dtype: int64


*****Topic: 2*****
**Top Words:
mr said case work party president ms like program leader

**Sections:
World           75
Arts            55
U.S.            54
Business Day    23
Opinion         20
Name: section_name, dtype: int64


*****Topic: 3*****
**Top Words:
new company like ms year work york world time city

**Sections:
Opinion         121
Business Day    113
Arts            109
U.S.             36
Sports           24
World            15
M

In [29]:
section = 'World'

# Keep only this section's article text. The function filters df by the same
# section value, so the topic labels line up row-for-row with its subsections.
corpus = df.loc[df.section_name == section, 'content'].values

vec, nmf = nmf_top_words_from_topics(
    corpus=corpus,
    max_features=5000,
    n_topics=4,
    n_words=10,
    section=section,
)

*****Topic: 0*****
**Top Words:
mr court case year crime government charge president right news

**Sections:
Europe          23
Asia Pacific    21
Middle East     13
Americas         8
Africa           5
Name: subsection_name, dtype: int64


*****Topic: 1*****
**Top Words:
attack people killed police official city group killing security government

**Sections:
Asia Pacific    29
Middle East     27
Africa          23
Europe           9
Americas         2
Name: subsection_name, dtype: int64


*****Topic: 2*****
**Top Words:
party ms election parliament government percent political european right leader

**Sections:
Europe          30
Asia Pacific    10
Africa           2
Middle East      1
Name: subsection_name, dtype: int64


*****Topic: 3*****
**Top Words:
nuclear united obama mr state nation weapon american president program

**Sections:
Middle East     33
Asia Pacific    11
Europe           3
Americas         3
Africa           1
Name: subsection_name, dtype: int64




In [30]:
section = 'Arts'

# Keep only this section's article text. The function filters df by the same
# section value, so the topic labels line up row-for-row with its subsections.
corpus = df.loc[df.section_name == section, 'content'].values

vec, nmf = nmf_top_words_from_topics(
    corpus=corpus,
    max_features=5000,
    n_topics=4,
    n_words=10,
    section=section,
)

*****Topic: 0*****
**Top Words:
mr music song dance band work ms concert program piece

**Sections:
Music           46
Dance           19
Art & Design     4
Name: subsection_name, dtype: int64


*****Topic: 1*****
**Top Words:
art museum said gallery artist work city 000 american like

**Sections:
Art & Design    28
Music            2
Video Games      1
Name: subsection_name, dtype: int64


*****Topic: 2*****
**Top Words:
mr time 10 series episode new said television ms family

**Sections:
Television      42
Art & Design     5
Video Games      1
Name: subsection_name, dtype: int64


*****Topic: 3*****
**Top Words:
opera company production performance city house act season role year

**Sections:
Music    13
Dance     1
Name: subsection_name, dtype: int64




In [31]:
section = 'Sports'

# Keep only this section's article text. The function filters df by the same
# section value, so the topic labels line up row-for-row with its subsections.
corpus = df.loc[df.section_name == section, 'content'].values

vec, nmf = nmf_top_words_from_topics(
    corpus=corpus,
    max_features=5000,
    n_topics=4,
    n_words=10,
    section=section,
)

*****Topic: 0*****
**Top Words:
said team season game player year like coach new play

**Sections:
Pro Football          30
Baseball              25
Pro Basketball        20
Soccer                17
Hockey                16
College Football      10
Auto Racing            7
Golf                   6
Tennis                 3
College Basketball     2
Cycling                1
Name: subsection_name, dtype: int64


*****Topic: 1*****
**Top Words:
yard quarterback game second point week defense sunday scored play

**Sections:
Pro Football        34
College Football    12
Soccer               6
Pro Basketball       4
Hockey               3
Golf                 2
Olympics             1
Rugby                1
Name: subsection_name, dtype: int64


*****Topic: 2*****
**Top Words:
cup race team club world new won san second year

**Sections:
Golf                    10
Soccer                   8
Auto Racing              7
Cycling                  4
International Sports     1
Rugby                    