In [26]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import numpy as np
from sklearn.model_selection import train_test_split

### 1. Load Data

In [15]:
def load_data(relevant_path,irrelevant_path):
    """
    Load the relevant and irrelevant news JSON files into one DataFrame.

    Parameters
    ----------
    relevant_path : path of the JSON file read first.
    irrelevant_path : path of the JSON file read second.

    Returns
    -------
    pandas.DataFrame with the columns 'content', 'label' and 'index'.
    Rows from `relevant_path` come first, followed by rows from
    `irrelevant_path`.

    Raises
    ------
    KeyError if the combined data has no 'content' or 'label' column.
    """
    data_relevant = pd.read_json(relevant_path)
    data_irrelevant = pd.read_json(irrelevant_path)
    # Stack the two sources and keep only the columns used downstream.
    data_text = pd.concat([data_relevant,data_irrelevant],axis=0)[['content','label']]
    # concat keeps each source's own row index, so values in 'index' can
    # repeat between the two sources.
    data_text['index'] = data_text.index
    return data_text

In [16]:
# Read both cleaned news files into one DataFrame (relevant rows first, then irrelevant).
documents = load_data("../../data/clean/relevant_news.json","../../data/clean/irrelevant_news.json")

In [14]:
import nltk
# Download the WordNet corpus into the local nltk_data directory.
# NOTE(review): no visible cell uses WordNet (preprocess only tokenizes) - confirm this download is still needed.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/tharun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2. Pre-processing
We convert each document's content into a list of tokens.

In [33]:
def preprocess(text):
    """
    Tokenize a piece of text.

    The text is handed to gensim's ``simple_preprocess`` and the tokens it
    yields are returned, in order, as a new list.
    """
    return [token for token in gensim.utils.simple_preprocess(text)]

In [18]:
# Tokenize the 'content' column with preprocess; the result is a Series of token lists.
processed_docs = documents['content'].map(preprocess)
# Preview the first ten tokenized documents.
processed_docs[:10]

In [21]:
# Sanity check: number of tokenized documents.
len(processed_docs)

20000

### 3. Create Test-Train Split for LDA

In [29]:
## Create test train split
def test_train_split(dataframe,size):
    """
    Split documents and positionally assigned labels into train and test sets.

    Labels are assigned by position: the first `size` documents get the
    label [0] and all remaining documents get the label [1].
    NOTE(review): this assumes `dataframe` is ordered with one class first
    and the other class after it - confirm against the 'label' column.

    Parameters
    ----------
    dataframe : sequence of documents to split (e.g. a Series of token lists).
    size : number of leading documents that receive the label [0].

    Returns
    -------
    X_train, X_test, Y_train, Y_test as returned by sklearn's
    train_test_split (80/20 split, shuffled, random_state=42).

    Raises
    ------
    ValueError if `size` is negative or larger than len(dataframe).
    """
    total = len(dataframe)
    if not 0 <= size <= total:
        raise ValueError(
            "size must be between 0 and len(dataframe) ({}), got {}".format(total, size)
        )
    # Build one label per document from the arguments instead of relying on
    # a hard-coded 10000/10000 layout and a global variable.
    label = [[0] for _ in range(size)]
    label.extend([1] for _ in range(total - size))
    return train_test_split(dataframe, label, test_size=0.2, random_state=42, shuffle = True)

In [30]:
# Split the tokenized documents and their labels into train and test sets (test_size=0.2 is set inside test_train_split).
X_train, X_test, Y_train, Y_test = test_train_split(processed_docs,10000)

### 4. Bag of words (BoW)
Generate bag of words feature set for LDA

In [31]:
# Build the token <-> integer id mapping from the tokenized documents.
# NOTE(review): this uses all of processed_docs, including documents that end up in X_test - confirm that is intended.
dictionary = gensim.corpora.Dictionary(processed_docs)

#### Sample terms in dictionary

In [34]:
# Print the first 11 (id, token) pairs of the dictionary as a quick sanity check.
for shown, (token_id, token) in enumerate(dictionary.iteritems(), start=1):
    print(token_id, token)
    if shown > 10:
        break

0 account
1 add
2 adviser
3 alias
4 angry
5 asian
6 association
7 banana
8 bellwether
9 benjamin
10 black


### Filter extreme (very rare and very common) tokens from the dictionary

In [35]:
# Prune the dictionary in place: drop tokens that appear in fewer than 15 documents
# or in more than 50% of documents, then keep at most the 100000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

#### Generate train and test corpus for BoW

In [36]:
# Convert every tokenized document into its bag-of-words form via `dictionary`,
# once for the training documents and once for the test documents.
bow_corpus = [dictionary.doc2bow(doc) for doc in X_train]
bow_test_corpus = [dictionary.doc2bow(doc) for doc in X_test]

#### Sample BoW document

In [38]:
# First document of the training BoW corpus, as (token_id, count) pairs.
# NOTE(review): the variable name says 4310 but index 0 is read - confirm which document was intended.
bow_doc_4310 = bow_corpus[0]

# Slice instead of indexing with range(10), so a document with fewer than
# ten distinct tokens no longer raises IndexError.
for token_id, token_count in bow_doc_4310[:10]:
    print("Word {} (\"{}\") appears {} time.".format(token_id,
                                                     dictionary[token_id],
                                                     token_count))

Word 18 ("camp") appears 2 time.
Word 27 ("city") appears 2 time.
Word 106 ("press") appears 2 time.
Word 113 ("request") appears 1 time.
Word 218 ("days") appears 1 time.
Word 226 ("drive") appears 3 time.
Word 257 ("game") appears 2 time.
Word 349 ("start") appears 1 time.
Word 462 ("circle") appears 1 time.
Word 601 ("field") appears 2 time.


### 5. TF-IDF
Generate TF-IDF scores from bow_corpus

In [39]:
from gensim import corpora, models

# Fit the TF-IDF model on the training BoW corpus.
tfidf = models.TfidfModel(bow_corpus)

In [45]:
# NOTE(review): pprint is imported here but not used in any visible cell.
from pprint import pprint
# Apply the fitted TF-IDF model to the training BoW corpus.
corpus_tfidf = tfidf[bow_corpus]

### 6. LDA model using bag of words, 5 topics

In [46]:
# Train a 5-topic LDA model on the training BoW corpus (2 passes, 2 worker processes).
# NOTE(review): no random_state is passed, so the topics may differ between runs - consider fixing a seed.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=5, id2word=dictionary, passes=2, workers=2)

In [47]:
# Print every topic of the model (-1 requests all topics) with its top weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.006*"pm" + 0.006*"people" + 0.004*"police" + 0.004*"nt" + 0.003*"st" + 0.003*"city" + 0.003*"tell" + 0.003*"family" + 0.003*"day" + 0.002*"house"
Topic: 1 
Words: 0.008*"game" + 0.006*"play" + 0.006*"team" + 0.006*"nt" + 0.005*"season" + 0.003*"company" + 0.003*"start" + 0.003*"win" + 0.002*"coach" + 0.002*"market"
Topic: 2 
Words: 0.009*"company" + 0.005*"service" + 0.004*"report" + 0.003*"market" + 0.003*"project" + 0.003*"share" + 0.003*"people" + 0.003*"bank" + 0.003*"city" + 0.003*"plan"
Topic: 3 
Words: 0.011*"quarter" + 0.010*"company" + 0.007*"share" + 0.006*"business" + 0.006*"market" + 0.005*"net" + 0.005*"continue" + 0.005*"statements" + 0.005*"growth" + 0.004*"increase"
Topic: 4 
Words: 0.012*"market" + 0.005*"trump" + 0.005*"president" + 0.005*"table" + 0.004*"government" + 0.004*"people" + 0.004*"report" + 0.003*"minister" + 0.003*"party" + 0.003*"police"


### LDA model using bag of words, 10 topics

In [48]:
# Train a 10-topic LDA model on the training BoW corpus.
# This rebinds `lda_model`, so the 5-topic model is no longer reachable under that name.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)
# Print every topic of the new model with its top weighted words.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.009*"market" + 0.007*"business" + 0.007*"service" + 0.007*"company" + 0.006*"growth" + 0.005*"quarter" + 0.005*"technology" + 0.004*"global" + 0.004*"continue" + 0.004*"data"
Topic: 1 
Words: 0.016*"quarter" + 0.010*"net" + 0.009*"share" + 0.008*"company" + 0.007*"cash" + 0.007*"income" + 0.006*"operate" + 0.006*"continue" + 0.006*"increase" + 0.006*"financial"
Topic: 2 
Words: 0.017*"company" + 0.008*"market" + 0.007*"share" + 0.006*"statements" + 0.005*"forwardlooking" + 0.005*"quarter" + 0.004*"price" + 0.004*"business" + 0.004*"offer" + 0.004*"release"
Topic: 3 
Words: 0.028*"market" + 0.017*"table" + 0.007*"share" + 0.007*"analysis" + 0.006*"type" + 0.005*"company" + 0.005*"historic" + 0.005*"city" + 0.004*"sales" + 0.004*"pm"
Topic: 4 
Words: 0.007*"trump" + 0.006*"people" + 0.006*"nt" + 0.005*"president" + 0.003*"tell" + 0.003*"house" + 0.002*"campaign" + 0.002*"court" + 0.002*"party" + 0.002*"time"
Topic: 5 
Words: 0.004*"company" + 0.004*"people" + 0.004*"he

In [56]:
# Persist the BoW LDA model to disk.
# NOTE(review): `lda_model` is rebound by the 10-topic run, so that is presumably the model saved here - confirm.
lda_model.save("../../models/lda_bow")

### 7. LDA model using TF-IDF, 5 topics

In [49]:
# Train a 5-topic LDA model on the TF-IDF weighted training corpus (4 passes, 2 worker processes).
# NOTE(review): no random_state is passed, so the topics may differ between runs - consider fixing a seed.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=5, id2word=dictionary, passes=4, workers=2)

In [50]:
# Print the top-word string of every TF-IDF topic, separated by blank lines.
# The topic id (`idx`) is unpacked but not printed here.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('',topic)
    print("\n")

 0.004*"company" + 0.004*"market" + 0.003*"quarter" + 0.003*"share" + 0.002*"statements" + 0.002*"forwardlooking" + 0.002*"business" + 0.002*"growth" + 0.002*"financial" + 0.002*"products"


 0.003*"game" + 0.003*"play" + 0.002*"season" + 0.002*"zacks" + 0.002*"pm" + 0.002*"nt" + 0.002*"team" + 0.002*"film" + 0.002*"coach" + 0.002*"win"


 0.003*"trump" + 0.002*"government" + 0.002*"president" + 0.002*"minister" + 0.002*"people" + 0.001*"house" + 0.001*"party" + 0.001*"city" + 0.001*"china" + 0.001*"percent"


 0.009*"police" + 0.003*"arrest" + 0.003*"russian" + 0.003*"officer" + 0.002*"court" + 0.002*"india" + 0.002*"suspect" + 0.002*"charge" + 0.002*"moscow" + 0.002*"delhi"


 0.005*"icra" + 0.003*"bond" + 0.003*"mln" + 0.002*"certificate" + 0.002*"bjp" + 0.002*"issue" + 0.002*"deposit" + 0.002*"dec" + 0.002*"interbank" + 0.002*"bln"




### LDA model using TF-IDF, 10 topics

In [51]:
# Train a 10-topic LDA model on the TF-IDF weighted training corpus.
# This rebinds `lda_model_tfidf`, so the 5-topic TF-IDF model is no longer reachable under that name.
# NOTE(review): workers=10 here versus workers=2 in the other LdaMulticore calls - confirm this is deliberate.
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=4, workers=10)
# Print every topic with its id and top weighted words, separated by blank lines.
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))
    print('\n')

Topic: 0 Word: 0.003*"police" + 0.002*"pm" + 0.001*"school" + 0.001*"city" + 0.001*"flight" + 0.001*"boeing" + 0.001*"county" + 0.001*"nt" + 0.001*"company" + 0.001*"friday"


Topic: 1 Word: 0.002*"trump" + 0.002*"game" + 0.002*"percent" + 0.002*"china" + 0.001*"tariff" + 0.001*"play" + 0.001*"trade" + 0.001*"india" + 0.001*"nt" + 0.001*"season"


Topic: 2 Word: 0.003*"trump" + 0.002*"nt" + 0.001*"president" + 0.001*"quarter" + 0.001*"house" + 0.001*"people" + 0.001*"company" + 0.001*"police" + 0.001*"bank" + 0.001*"report"


Topic: 3 Word: 0.004*"company" + 0.003*"statements" + 0.003*"market" + 0.003*"forwardlooking" + 0.003*"quarter" + 0.002*"share" + 0.002*"financial" + 0.002*"business" + 0.002*"products" + 0.002*"service"


Topic: 4 Word: 0.002*"minister" + 0.001*"court" + 0.001*"police" + 0.001*"government" + 0.001*"people" + 0.001*"trump" + 0.001*"nt" + 0.001*"game" + 0.001*"pm" + 0.001*"president"


Topic: 5 Word: 0.002*"game" + 0.002*"nt" + 0.002*"season" + 0.002*"play" + 0.001

In [57]:
# Persist the TF-IDF LDA model to disk.
# NOTE(review): `lda_model_tfidf` is rebound by the 10-topic run, so that is presumably the model saved here - confirm.
lda_model_tfidf.save("../../models/lda_tfidf")

### 8. Results for sample document

In [58]:
# Topic distribution of training document 4310, listed from highest to lowest score.
doc_topic_scores = lda_model[bow_corpus[4310]]
ranked_topics = sorted(doc_topic_scores, key=lambda pair: -pair[1])
for topic_id, topic_score in ranked_topics:
    # Show the score together with the topic's five highest-weighted words.
    print("\nScore: {}\t \nTopic: {}".format(topic_score, lda_model.print_topic(topic_id, 5)))


Score: 0.49756285548210144	 
Topic: 0.007*"trump" + 0.006*"people" + 0.006*"nt" + 0.005*"president" + 0.003*"tell"

Score: 0.4707499146461487	 
Topic: 0.009*"police" + 0.006*"team" + 0.004*"county" + 0.004*"city" + 0.003*"play"

Score: 0.029671287164092064	 
Topic: 0.007*"people" + 0.005*"report" + 0.005*"company" + 0.004*"share" + 0.004*"president"


### 9. Generate LDA features for train and test

In [61]:
# Accumulator for the per-document topic-score rows of the training set.
features_train = []

In [64]:
# Build one dense topic-score row per training document.
# The list is reset here so that re-running this cell never appends duplicate rows.
features_train = []
# Iterate over the corpus itself and size each row from the model, instead of
# hard-coding the number of documents (16000) and topics (10).
for bow_doc in bow_corpus:
    row = np.zeros(lda_model.num_topics)
    # The model returns (topic_id, score) pairs; topics it does not return keep a score of 0.
    for index, score in lda_model[bow_doc]:
        row[index] = score
    features_train.append(row)

In [67]:
# Accumulator for the per-document topic-score rows of the test set.
features_test = []

In [68]:
# Build one dense topic-score row per test document.
# The list is reset here so that re-running this cell never appends duplicate rows.
features_test = []
# Cover every test document (the loop previously stopped after the first 10)
# and size each row from the model instead of hard-coding 10 topics.
for bow_doc in bow_test_corpus:
    row = np.zeros(lda_model.num_topics)
    # The model returns (topic_id, score) pairs; topics it does not return keep a score of 0.
    for index, score in lda_model[bow_doc]:
        row[index] = score
    features_test.append(row)

In [69]:
# Sanity check: number of training labels.
len(Y_train)

16000

### 10. Dump features onto file

In [66]:
# Flatten the list of single-element label lists into a 1-D array.
y = np.array(Y_train).reshape(-1)
# Fail loudly if features and labels are misaligned; concat would otherwise pad the shorter side with NaN.
if len(features_train) != len(y):
    raise ValueError(
        "features_train has {} rows but Y_train has {} labels".format(len(features_train), len(y))
    )
# Put the topic-score columns and the 'label' column side by side.
features_train = pd.concat([pd.DataFrame(features_train),pd.Series(y).rename('label')],axis=1)
# Write the complete training feature set (previously only the first 5 rows were written).
features_train.to_csv("../../data/feature/lda_features_train.csv")
# Preview the first rows as the cell output.
features_train.head(5)

In [71]:
# Flatten the list of single-element label lists into a 1-D array.
y = np.array(Y_test).reshape(-1)
# Fail loudly if features and labels are misaligned; concat would otherwise pad the shorter side with NaN.
if len(features_test) != len(y):
    raise ValueError(
        "features_test has {} rows but Y_test has {} labels".format(len(features_test), len(y))
    )
# Put the topic-score columns and the 'label' column side by side.
features_test = pd.concat([pd.DataFrame(features_test),pd.Series(y).rename('label')],axis=1)
# Write the complete test feature set (previously only the first 5 rows were written).
features_test.to_csv("../../data/feature/lda_features_test.csv")