# Commands

In [None]:
# oomstopper --no-tail nmf-cache ; killbill nmf-cache ; cd ~/twinews-logs ; jupython -o nohup-nmf-cache-$HOSTNAME.out --venv st-venv ~/Workspace/Python/Datasets/Twinews/twinews/models/nmf-cache.ipynb

# Init

In [1]:
# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA-aware libraries loaded afterwards.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
# True when `__file__` is absent from the namespace, which is the case in an interactive notebook kernel.
isNotebook = not ('__file__' in locals())

In [3]:
# Test-mode switch read by the config and caching cells: it follows `isNotebook`
# by default and can be replaced by a literal True / False to force a mode.
TEST = isNotebook # isNotebook, True, False

In [4]:
from systemtools.hayj import *
from systemtools.location import *
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from nlptools.preprocessing import *
from nlptools.basics import *
from twinews.utils import *
from twinews.models.ranking import *

In [5]:
from nltk.stem import WordNetLemmatizer 
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import LdaMulticore
from gensim.test.utils import common_corpus, common_dictionary
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import gensim

In [6]:
# Interactive runs log under tmpDir('logs'); script runs log to a file named after the host.
if isNotebook:
    logPath = tmpDir('logs') + "/nmf-cache.log"
else:
    logPath = "nmf-cache-" + getHostname() + ".log"
logger = Logger(logPath)
# Timer used to log checkpoints after each stage of the notebook.
tt = TicToc(logger=logger)
tt.tic()

--> tictoc starts...


-1

# Config

In [7]:
# Settings of the run. Entries written `a if TEST else b` take `a` in test mode.
config = {
    # Corpus used to fit the model:
    'maxDocuments': (300 if TEST else 300000), # Number of urls kept after the seeded shuffle
    'useExtraNews': (not TEST), # Boolean flag; not read by any cell visible in this notebook
    'minDF': ((1 / 500) if TEST else (1 / 2000)), # Given to filterCorpus: minimal document frequency ratio of a word
    'maxDF': 300, # Given to filterCorpus: removes the top 300 voc elements

    # Preprocessing and topic model:
    'nbTopics': (30 if TEST else 100), # Candidate values: 30, 100
    'lowercase': True, # Same value in test and full runs
    'doLemmatization': False, # Same value in test and full runs

    'maxIter': (2 if TEST else 200), # 30 for lda, 200 for nmf

    # NMF specific arguments:
    'nmfInit': 'nndsvd', # Candidate values: None, 'nndsvd'
    'nmfL1Ratio': 0, # Candidate values: 0.0, 0.5, 1.0
    'nmfAlpha': 0.1, # Candidate values: 0.0, 0.1
}

# Getting data

In [8]:
# We pick the urls of the documents used to fit the NMF model:
# seeded shuffle of all distinct urls, then truncation to `maxDocuments`.
newsCollection = getNewsCollection()
distinctUrls = list(newsCollection.distinct('url'))
urlsForModel = shuffle(distinctUrls, seed=0)[:config['maxDocuments']]

twinews news (version 1.0) initialised.


In [9]:
# We get the sentences of the selected urls (the logged sample shows, for each
# document, a list of sentences which are themselves lists of tokens):
sentences = getNewsSentences(urlsForModel, logger=logger)
# Log an abbreviated view of the nested structure:
bp(sentences, logger)

twinews news (version 1.0) initialised.
  0% [                    ]
 10% [==                  ] (11.97s left)
 20% [====                ] (9.96s left)
[
  [ [ __int_2__, Things, ..., here, ! ], [ We, hope, ..., recommend, ! ], ..., [ Available, in, ..., queen, . ], [ How, you, ..., everyone, . ] ],
  [ [ Everyone, has, ..., stories, . ], [ From, Nov., ..., talent, . ], ..., [ Christ, Church, ..., storytellers, . ], [ The, evening, ..., dancing, . ] ],
  ...,
  [ [ Reasonable, people, ..., Khadr, . ], [ How, much, ..., worth, ? ], ..., [ The, Harper, ..., then, . ], [ So, is, ..., now, . ] ],
  [ [ White, Nationalists, ..., accounts, . ], [ Written, By, ..., accounts, . ], ..., [ Since, Airbnb, ..., discrimination, . ], [ The, company, ..., commitment, . ] ]
]


In [10]:
# We merge the sentences of each document into a single token list (in place):
for docIndex, docSentences in enumerate(sentences):
    sentences[docIndex] = flattenLists(docSentences)
# `docs` is the same list object as `sentences`, now holding one token list per document.
docs = sentences
bp(docs, logger)

[ [ __int_2__, Things, ..., everyone, . ], [ Everyone, has, ..., dancing, . ], ..., [ Reasonable, people, ..., now, . ], [ White, Nationalists, ..., commitment, . ] ]


In [11]:
# Lower case:
if config['lowercase']:
    docIndexes = list(range(len(docs)))
    for docIndex in pb(docIndexes, logger=logger, message="Lower casing"):
        tokens = docs[docIndex]
        # Slice assignment rewrites the tokens inside the existing list object.
        tokens[:] = [token.lower() for token in tokens]
bp(docs, logger)

Lower casing   0% [                    ]
Lower casing  10% [==                  ] (0.09s left)
Lower casing  20% [====                ] (0.079s left)
[ [ __int_2__, things, ..., everyone, . ], [ everyone, has, ..., dancing, . ], ..., [ reasonable, people, ..., now, . ], [ white, nationalists, ..., commitment, . ] ]


In [12]:
# Lemmatization (only when enabled in the config):
if config['doLemmatization']:
    lemmatizer = WordNetLemmatizer()
    pbar = ProgressBar(len(docs), logger=logger, message="Lemmatization")
    for tokens in docs:
        # Each token list is rewritten in place, then the progress bar moves by one document.
        tokens[:] = [lemmatizer.lemmatize(token) for token in tokens]
        pbar.tic()
bp(docs, logger)

[ [ __int_2__, things, ..., everyone, . ], [ everyone, has, ..., dancing, . ], ..., [ reasonable, people, ..., now, . ], [ white, nationalists, ..., commitment, . ] ]


In [13]:
# Filtering the corpus on document frequency:
docs = filterCorpus(
    docs,
    minDF=config['minDF'],
    maxDF=config['maxDF'],
    removeEmptyDocs=False,
    allowEmptyDocs=False,
    logger=logger,
)
# Every document must still hold at least one token after the filtering.
assert all(len(doc) > 0 for doc in docs)
bp(docs, logger)

Voc removed because of maxDF (300 elements):
{ ", ', (, ), ,, -, ., ..., :, ;, ..., without, work, working, world, would, year, years, yet, you, your }
1.48% of voc will be removed.
[ [ room, comfortable, ..., describe, everyone ], [ everyone, unique, ..., storytelling, dancing ], ..., [ reasonable, debate, ..., harper, trudeau ], [ white, nationalists, ..., breach, commitment ] ]


In [14]:
# Log a timing checkpoint once the corpus is ready:
tt.tic("Data preprocessed")

--> tic: 15m 17.32s | message: Data preprocessed


917.32

# Inferring topic vectors

In [15]:
# The documents are already token lists, hence the identity tokenizer and preprocessor.
vectorizer = TfidfVectorizer\
(
    sublinear_tf=True,
    tokenizer=lambda x: x,
    preprocessor=lambda x: x,
    # `lowercase` is not set: it doesn't work because we erased preprocessor
)
vectorsForModel = vectorizer.fit_transform(docs)
# One row is expected per selected url. The check uses the real number of urls,
# which is lower than config['maxDocuments'] when the collection holds fewer urls.
assert vectorsForModel.shape[0] == len(urlsForModel)
# NOTE(review): `alpha` is the regularization argument of older scikit-learn releases;
# newer ones expose alpha_W / alpha_H instead. Confirm against the installed version.
model = NMF\
(
    n_components=config['nbTopics'],
    random_state=0,
    alpha=config['nmfAlpha'],
    l1_ratio=config['nmfL1Ratio'],
    init=config['nmfInit'],
    max_iter=config['maxIter'],
)
model.fit(vectorsForModel)
# Recent scikit-learn releases replaced get_feature_names() by get_feature_names_out():
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Each topic becomes a {word: weight} dict of its 100 heaviest words, heaviest first.
topics = []
for topic in model.components_:
    wordProb = sortBy(list(zip(feature_names, topic)), desc=True, index=1)[:100]
    topics.append({word: prob for word, prob in wordProb})

In [16]:
# Log a timing checkpoint once the model is fitted and the topics are extracted:
tt.tic("Model fitted and topic vectors infered")

--> tic: 1.36s | message: Model fitted and topic vectors infered


1.36

# Showing topics

In [17]:
def printTopics(topics, maxWords=10, logger=None):
    """
    Log one line per topic: its position in `topics` followed by its first words.

    :param topics: sequence of dicts whose keys are words; the key order is kept as is
    :param maxWords: maximum number of words shown for each topic
    :param logger: forwarded to `log`
    """
    for i, topic in enumerate(topics):
        # `maxWords` drives the truncation (it was previously ignored for a fixed 10).
        firstWords = list(topic.keys())[:maxWords]
        log(str(i) + ": " + " ".join(firstWords), logger)

In [18]:
def printTopicsOf(vector, topics, logger=None):
    """
    Log the strongest topics of one topic vector.

    Only the components with a score above 0.001 are considered, and at most
    the 3 highest ones are kept.

    :param vector: iterable of scores, one per topic, in the same order as `topics`
    :param topics: sequence of topics as accepted by `printTopics`
    :param logger: forwarded to `log` and `printTopics`
    """
    scoredTopics = [(i, score) for i, score in enumerate(vector) if score > 0.001]
    topTopics = sortBy(scoredTopics, desc=True, index=1)[:3]
    log("Top topics number are: " + " ".join(str(e[0]) for e in topTopics), logger)
    currentTopics = [topics[e[0]] for e in topTopics]
    # The lines logged below are numbered by rank (0, 1, 2), not by topic number.
    printTopics(currentTopics, logger=logger)

In [19]:
# Log the first words of every fitted topic:
printTopics(topics, logger=logger)

0: ! yes someone kids thought talk church students feel tell
1: tax republicans senate bill republican gop democrats vote r sen.
2: technology companies management platform businesses services customer strategy cloud industry
3: ! blue duke __netloc__ miami game tickets night win front
4: turkey turkish syrian military kurdish russia syria erdogan foreign ankara
5: court county defendants attorney lawyer lawsuit drug justice legal filed
6: game points scored games team coach hockey season player win
7: mccarthy intelligence clinton documents email russian nsa kaspersky lauren emails
8: medical health study disease researchers activity brain mental hypothesis rhythms
9: hospital schools care martin programs medical housing history students development
10: water lake gallons dupage communities chicago county michigan systems exports
11: franken sexual sorry women al canada broadcaster tweeden experienced abuse
12: chamber startup entrepreneurs area commerce incubator bugg fund chattanoog

# Inferring and caching data

In [None]:
from twinews.models.genericutils import getGenericCache, genericFields
# Test runs use a separate cache name.
cacheName = "nmf-test" if TEST else "nmf"
cache = getGenericCache(cacheName)
# Name of the news field read for each document by the caching loop.
field = genericFields['nmf']

In [None]:
# Distinct `_id` values of the news collection, i.e. the documents to infer and cache:
ids = list(newsCollection.distinct("_id"))

In [None]:
# Test runs only go through the first 1000 ids.
ids = ids[:1000] if TEST else ids

In [None]:
# Lemmatizer used by the caching loop when config['doLemmatization'] is enabled:
lemmatizer = WordNetLemmatizer()

In [None]:
# For every news document: rebuild its token list with the lowercasing and
# lemmatization options of the config, infer its topic vector with the fitted
# model, and store the vector in the cache under the hash of its sentences.
for currentId in pb(ids, logger=logger, printRatio=0.01, verbose=not TEST):
    row = newsCollection.findOne({"_id": currentId}, projection={field: True})
    # A dedicated name avoids overwriting the notebook-level `sentences` variable:
    docSentences = row[field]
    # The cache key is computed on the sentences as stored, before any preprocessing:
    theHash = objectToHash(docSentences)
    # We flatten sentences:
    doc = flattenLists(docSentences)
    # We lowercase:
    if config['lowercase']:
        doc = [token.lower() for token in doc]
    # We lemmatize:
    if config['doLemmatization']:
        doc = [lemmatizer.lemmatize(token) for token in doc]
    # We vectorize it as a one-document corpus (a plain list of token lists,
    # the same input shape that was given to fit_transform):
    vectors = vectorizer.transform([doc])
    # We get topics:
    topicRepr = model.transform(vectors)[0]
    # We print the doc:
    if TEST:
        bp(doc, logger)
        log(theHash, logger)
        printTopicsOf(topicRepr, topics, logger=logger)
    # We cache it:
    cache[theHash] = topicRepr