# WordLemPos Data 

In [1]:
#imports
import pandas as pd
import numpy as np
from collections import Counter
import re
import csv

import gensim
from gensim.parsing.preprocessing import STOPWORDS

import warnings
warnings.filterwarnings('ignore')
# Import stopwords with nltk.
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /home/gorkem/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Using the WordLemPos data, we will try to find the topics of the dataset. As a first step, we look at one month of data for one country and check whether the results are meaningful.
We use the October 2016 US data for this purpose. If the results seem meaningful, we will repeat the analysis on one year of data and expand gradually from there.

In [2]:
# Load the tab-separated WordLemPos dump for the month/country under study.
# QUOTE_NONE treats quote characters as ordinary text; the file has no header row.
data_folder = '../sample_data/'
fileName = '16-10-us.txt'
wordLemPos_big = pd.read_csv(
    data_folder + fileName,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    encoding='ISO-8859-1',
    header=None,
)

In [3]:
# Leave the freshly loaded frame untouched and do all further work on a copy
wordLemPos = wordLemPos_big.copy()
# Number of rows in the full data set
numRows = wordLemPos.shape[0]

In [4]:
# Inspect the dimensions and the first 15 rows of the raw data
display(wordLemPos.shape)
wordLemPos.iloc[:15]

(17564427, 5)

Unnamed: 0,0,1,2,3,4
0,14637197,4739839025,@@14637197,,fo
1,14637197,4739839026,<p>,,
2,14637197,4739839027,NEW,new,np1
3,14637197,4739839028,YORK,york,np1
4,14637197,4739839029,(,,(
5,14637197,4739839030,AP,ap,np1
6,14637197,4739839031,),,)
7,14637197,4739839032,--,,jj_nn1
8,14637197,4739839033,Donald,donald,np1
9,14637197,4739839034,Trump,trump,nn1


The useful part of this data is the lemma of the word at each position. Using the lemmas lets us skip the lemmatization and stemming steps of preprocessing. Therefore, we keep only the textID and lemma columns. We also drop the rows whose lemma is NA, since those are special characters or numbers and have no root word.

In [5]:
# Name the columns, keep only the document id and the lemma, and drop the rows
# without a lemma (special characters and numbers have none).
# The steps are chained so that a new frame is produced instead of mutating a
# column subset in place, which is what raises SettingWithCopyWarning.
column_names = {0: 'textID', 1: 'ID(seq)', 2: 'word', 3: 'lemma', 4: 'PoS'}
wordLemPos = (
    wordLemPos
    .rename(columns=column_names)
    .loc[:, ['textID', 'lemma']]
    .dropna()
)
# Show the resulting dimensions as well as a preview of the rows
display(wordLemPos.shape)
display(wordLemPos.head(10))

Unnamed: 0,textID,lemma
2,14637197,new
3,14637197,york
5,14637197,ap
8,14637197,donald
9,14637197,trump
10,14637197,'s
11,14637197,five-day
12,14637197,feud
13,14637197,with
14,14637197,a


In the WordLemPos data the words of each news article are already preprocessed, lemmatized and stemmed. The first thing we need to do is get rid of the stop words.

In [6]:
#wordLemPos = wordLemPos[:numRows]
#display(wordLemPos.shape)
#display(wordLemPos.head())

To remove stop words we apply several conditions: the word must not be in the nltk or gensim.parsing stop-word lists, it must not be in our list of unnecessary words, and it must be longer than 3 characters.

In [7]:
# Lemmas that turned out to be frequent but carry no signal for topic
# modelling; they are excluded in addition to the generic stop-word lists.
unnecessary_words = [
    'new', 'good', 'high', 'big', 'with', 'into', 'under',
    'really', 'already', 'still', 'early', 'while', 'although',
    'most', 'every', 'which',
    'year', 'like', 'time', 'that', 'given', 'would',
]

In [8]:
# Drop stop words and very short lemmas.
# A lemma is kept only when it is longer than 3 characters and appears in none of:
# the nltk English stop words, the gensim STOPWORDS set, or `unnecessary_words`.
# The three vocabularies are merged into one set so each row costs a single
# hash lookup instead of a scan through two Python lists.
excluded_lemmas = (
    set(stop)
    | set(gensim.parsing.preprocessing.STOPWORDS)
    | set(unnecessary_words)
)
lemma = wordLemPos['lemma']
keep_row = (lemma.str.len() > 3) & ~lemma.isin(excluded_lemmas)
# Boolean indexing builds a new frame, so nothing is edited in place
wordLemPos = wordLemPos[keep_row]
display(wordLemPos.shape)
display(wordLemPos.head(10))

(6135938, 2)

Unnamed: 0,textID,lemma
3,14637197,york
8,14637197,donald
9,14637197,trump
11,14637197,five-day
12,14637197,feud
16,14637197,beauty
17,14637197,queen
21,14637197,late
22,14637197,example
25,14637197,insistence


After filtering the stop words, we make the words ready for LDA.
For each unique textID we group its words into a list.

In [9]:
# Build one list of lemmas per article.
# A single groupby pass replaces filtering the whole frame once per article.
# sort=False keeps the groups in order of first appearance, which is the same
# order `unique()` returns, so docs[i] still belongs to unique_textID[i].
unique_textID = wordLemPos.textID.unique()
docs = (
    wordLemPos
    .groupby('textID', sort=False)['lemma']
    .apply(list)
    .tolist()
)

In [10]:
# Number of articles, i.e. one token list per unique textID
len(docs)

26028

Now we create bag of words. Assign a number key for each word. 

In [11]:
# Map every distinct lemma to an integer id, then preview the first 11 entries
dictionary = gensim.corpora.Dictionary(docs)
for position, (token_id, token) in enumerate(dictionary.iteritems()):
    print(token_id, token)
    if position >= 10:
        break

0 30-day
1 airing
2 article
3 beauty
4 brag
5 brash
6 businessman
7 campaign
8 celebrity
9 cheer
10 combat


Another good approach is to filter out words that are rare across the articles, which eases the later steps and the computation. If a word appears in very few documents, it is unlikely to help identify the topic of an article, so we remove such words as well. We also remove words that appear in more than half of the documents.

In [12]:
# Prune the vocabulary (the dictionary is modified in place):
#   no_below -> drop tokens that occur in fewer than `docThreshold` documents
#   no_above -> drop tokens that occur in more than 50% of the documents
#   keep_n   -> after both filters, keep at most the 100000 most frequent tokens
docThreshold = 15
dictionary.filter_extremes(no_below=docThreshold, no_above=0.5, keep_n=100000)

Now we create the doc2bow representation: for each document we count how many times each dictionary word occurs and store the counts.

In [13]:
# Convert every article into its bag-of-words form:
# a list of (token id, number of occurrences) pairs
bow_corpus = [dictionary.doc2bow(tokens) for tokens in docs]

We also need the bag-of-words results for our further analysis, so we will save them for later use.

In [14]:
# Show how often each of the first 15 dictionary words occurs in article 10.
# Slicing (rather than indexing with range(15)) avoids an IndexError when the
# article contains fewer than 15 distinct dictionary words.
bow_doc_10 = bow_corpus[10]
for token_id, occurrences in bow_doc_10[:15]:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                               dictionary[token_id], occurrences))

Word 11 ("come") appears 3 time.
Word 15 ("cost") appears 2 time.
Word 19 ("example") appears 1 time.
Word 29 ("local") appears 1 time.
Word 32 ("need") appears 1 time.
Word 37 ("people") appears 1 time.
Word 38 ("period") appears 1 time.
Word 39 ("political") appears 1 time.
Word 45 ("service") appears 3 time.
Word 56 ("unfairly") appears 1 time.
Word 68 ("basically") appears 1 time.
Word 78 ("change") appears 1 time.
Word 108 ("gain") appears 1 time.
Word 115 ("happen") appears 1 time.
Word 123 ("instead") appears 1 time.


In [15]:
# Wrap the bag-of-words corpus in a DataFrame (one row per document).
# NOTE(review): nothing is written to disk in this cell — the to_csv call below
# is commented out and refers to `topics`, which is only created further down.
bow_corp = pd.DataFrame(bow_corpus)#, columns=['topic_id','words'])
#topics.to_csv('results/topics'+fileName)

In [16]:
# TF-IDF weighting of the bag-of-words corpus
from pprint import pprint
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
# Pretty-print the TF-IDF scores of the first article only
for weighted_doc in corpus_tfidf[:1]:
    pprint(weighted_doc)

[(0, 0.13875160249185722),
 (1, 0.17715031430766318),
 (2, 0.27721028525058744),
 (3, 0.12131747497512263),
 (4, 0.12347796102459174),
 (5, 0.1891044232255576),
 (6, 0.13041958050997313),
 (7, 0.062166725248688805),
 (8, 0.11560367908079414),
 (9, 0.12807161883869791),
 (10, 0.11848732068045718),
 (11, 0.025903339805993027),
 (12, 0.2098881712441338),
 (13, 0.07310700483804931),
 (14, 0.04553721662579709),
 (15, 0.0701098956552853),
 (16, 0.06377944504270776),
 (17, 0.08063285966758814),
 (18, 0.26822299077423606),
 (19, 0.07583102922564613),
 (20, 0.15614468491195388),
 (21, 0.20544178170293612),
 (22, 0.10073535252794036),
 (23, 0.12570775136353204),
 (24, 0.16872380372463114),
 (25, 0.038929328098851024),
 (26, 0.05031869614151054),
 (27, 0.17819931852133633),
 (28, 0.050998930179784505),
 (29, 0.06073026832316018),
 (30, 0.06649222693253704),
 (31, 0.05082413449427594),
 (32, 0.1494953527134458),
 (33, 0.04375062855431459),
 (34, 0.08233282954234428),
 (35, 0.05537289918246951),
 (

As the last step, we create the LDA model to find the topics. We specify 7 topics for now and do 4 passes over the data.

In [17]:
# Train the LDA topic model on the bag-of-words corpus and the dictionary built above.
#   num_topics   -> number of topics to extract
#   passes       -> number of passes over the whole corpus during training
#   workers      -> number of worker processes used by LdaMulticore
#   random_state -> fixed seed so a re-run starts from the same initialisation
#                   (NOTE(review): multicore scheduling may still cause small differences)
NUM_TOPICS = 7
LDA_SEED = 42
lda_model = gensim.models.LdaMulticore(
    bow_corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=4,
    workers=2,
    random_state=LDA_SEED,
)

In [18]:
# Show every topic that was found together with its highest-weighted words
for topic_id, top_words in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(topic_id, top_words))

Topic: 0 
Words: 0.008*"school" + 0.008*"city" + 0.007*"state" + 0.006*"student" + 0.006*"work" + 0.005*"need" + 0.004*"people" + 0.004*"community" + 0.004*"county" + 0.004*"service"
Topic: 1 
Words: 0.018*"police" + 0.009*"county" + 0.008*"officer" + 0.007*"charge" + 0.006*"report" + 0.006*"case" + 0.006*"court" + 0.006*"tell" + 0.005*"department" + 0.005*"according"
Topic: 2 
Words: 0.019*"company" + 0.012*"share" + 0.009*"million" + 0.008*"market" + 0.008*"stock" + 0.008*"price" + 0.007*"business" + 0.007*"percent" + 0.007*"report" + 0.007*"quarter"
Topic: 3 
Words: 0.023*"trump" + 0.014*"clinton" + 0.009*"election" + 0.009*"state" + 0.008*"campaign" + 0.007*"republican" + 0.006*"people" + 0.006*"vote" + 0.006*"voter" + 0.005*"president"
Topic: 4 
Words: 0.007*"people" + 0.005*"family" + 0.005*"know" + 0.005*"come" + 0.004*"life" + 0.004*"want" + 0.004*"work" + 0.004*"home" + 0.003*"think" + 0.003*"love"
Topic: 5 
Words: 0.025*"game" + 0.014*"play" + 0.013*"team" + 0.011*"season" + 

In [20]:
# Store the topics (id plus weighted word list) as CSV, named after the input file
topic_rows = lda_model.print_topics(-1)
topics = pd.DataFrame(topic_rows, columns=['topic_id','words'])
topics.to_csv('../results/topics'+fileName)

As we can see from the most common words of the topics above, we can come up with a good label for each news topic. For example:
- Topic 0: education
- Topic 1: crime
- Topic 2: business / economy
- Topic 3: politics / election
- Topic 4: social / life
- Topic 5: sports
- Topic 6: world/ daily / ?

Looks like we can find some good topics. We will proceed with this approach.