In [None]:
import os, sys, pickle

import guidedlda
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

In [None]:
# Identifiers of the available count-vector variants (kept as strings).
counts = ['1', '2', '3']

# Policy names; `curr_policy_idx` indexes into this list.
policies = ['aadhar', 'farmers', 'demon', 'gst']

In [None]:
# Seed keywords per policy. Each list holds every distinct keyword exactly once,
# in order of first appearance; case variants (e.g. 'GST' / 'gst') are kept as
# separate entries on purpose.
keywords_aadhar = [
    'Aadhaar', 'UIDAI', 'unique identity number', 'UID',
    'unique Aadhaar number', 'Unique Identification Authority',
    'Adhar', 'Aadhar', 'Adhaar', 'Adharcard', 'Aadharcard',
    'Aadhaarcard', 'Aadhar Card',
]

keywords_farmers = [
    'loan waiver', 'loan waivers', 'farmer loan', 'farmer suicide', 'farmer suicides',
    'pest infestation', 'farmer loans', 'drought', 'farmer', 'farmers', 'crop insurance',
    'Swaminathan Commission', 'National Commission on Farmer', 'kisan', 'agriculture',
    'monsoon failure', 'crop failure', 'fertilizers', 'Seeds Corporation', 'crop loss',
    'crop losses', 'unseasonal rains', 'irrigation facilities', 'debt traps',
    'agricultural',
]

keywords_demon = [
    'Rs 1,000 notes', 'Rs 500 notes', 'lower denomination', 'Rs 500 and Rs 1,000 notes',
    'demonetisation', 'denomination note', 'cash withdrawal', 'swipe machine', 'unaccounted money',
    'withdrawal limit', 'black money', 'long queue', 'cashless transaction', 'cashless economy',
    'demonitis', 'demonitiz', 'pos machine', 'fake currency', 'digital payment',
    'digital transaction', 'cash transaction', 'cash crunch', 'currency switch',
    'demonetised note', 'note ban', 'ATMs', 'now-defunct currency',
    'demonetis', 'demonetiz',
]

keywords_gst = [
    'GST', 'Goods and Services Tax', 'Goods & Services Tax', 'excise duty',
    'good and service tax', 'tax reform', 'goods and services tax', 'gst', 'taxpayers',
]

keywords_tech = [
    'privacy', 'cashless', 'technology', 'technological', 'innovation', 'software',
    'engineering', 'high technology', 'technical', 'tech',
]

# Lookup from policy name to its keyword list.
policies_dict = {
    "aadhar": keywords_aadhar,
    "demon": keywords_demon,
    "farmers": keywords_farmers,
    "gst": keywords_gst,
    "tech": keywords_tech,
}


In [None]:
# Run configuration.
# folder_name: directory holding the pickled count vectors / vectorizers.
# curr_policy_idx: position of the analysed policy inside `policies`.
# curr_vector_type: count variant; it becomes part of the pickle file names.
folder_name = './guided-lda-count-vectors'
curr_policy_idx, curr_vector_type = 3, 2
print('{} {}'.format(policies[curr_policy_idx], curr_vector_type))

### Loading the Document-Term Matrix & Vocabulary

In [None]:
def load_vector(count, policy):
    """Load the pickled matrix stored for a policy and count variant.

    Reads ``<folder_name>/<policy>_X_<count>_count.pickle``, where
    ``folder_name`` is the module-level configuration value.

    Args:
        count: Count-variant identifier; converted with ``str`` for the file name.
        policy: Policy name used as the file-name prefix.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = folder_name + '/' + policy + '_X_' + str(count) + '_count.pickle'
    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as handle:
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        return pickle.load(handle)

In [None]:
def load_vectorizer(count, policy):
    """Load the pickled vectorizer stored for a policy and count variant.

    Reads ``<folder_name>/<policy>_vectorizer_<count>_count.pickle``, where
    ``folder_name`` is the module-level configuration value.

    Args:
        count: Count-variant identifier; converted with ``str`` for the file name.
        policy: Policy name used as the file-name prefix.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = folder_name + '/' + policy + '_vectorizer_' + str(count) + '_count.pickle'
    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as handle:
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        return pickle.load(handle)

In [None]:
# Load the vectorizer and the matching matrix for the selected policy / count variant.
# The matrix is converted with .toarray(); later cells call X[i, :].argsort() on it.
vectorizer = load_vectorizer(curr_vector_type, policies[curr_policy_idx])
X = load_vector(curr_vector_type, policies[curr_policy_idx]).toarray()

In [None]:
# Presumably a fitted CountVectorizer, whose vocabulary_ maps term -> column index.
vocab = vectorizer.vocabulary_
# NOTE: despite its name, `word2id` is the inverse lookup (vocab value -> vocab key);
# the cells below use it to turn column indices of X back into words.
word2id = {index: term for term, index in vocab.items()}

### Normal LDA

In [None]:
# Baseline: two-topic model fitted without any seed words.
model_normal = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=7, refresh=20)
doc_topic_normal = model_normal.fit_transform(X)

# For the first 20 documents, print the highest-scoring topic next to the
# five vocabulary entries with the largest values in that row of X.
for doc_idx in range(20):
    top_columns = X[doc_idx, :].argsort()[::-1][:5]
    top_terms = ', '.join(word2id[col] for col in top_columns)
    print("top topic: {} Document: {}".format(doc_topic_normal[doc_idx].argmax(), top_terms))

### Guided LDA

In [None]:
# GuidedLDA expects `seed_topics` to map *word ids* (column indices of X) to
# topic ids. Keying it by the keyword strings themselves never matches any
# column, which silently turns the guided run into an unguided one.
# Each keyword is therefore looked up in the vectorizer vocabulary (`vocab`,
# term -> column index), both as written and lower-cased; keywords that are
# absent from the vocabulary (e.g. multi-word phrases it does not contain)
# are skipped.
seed_topics = {}
for keyword in policies_dict[policies[curr_policy_idx].lower()]:
    for term in (keyword, keyword.lower()):
        if term in vocab:
            # All policy keywords seed topic 0.
            seed_topics[int(vocab[term])] = 0

In [None]:
# Seeded model: same hyper-parameters as the baseline, fitted with the
# policy seed words at a seed confidence of 0.15.
model_guided = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=7, refresh=20)
model_guided.fit(X, seed_topics=seed_topics, seed_confidence=0.15)
doc_topic_guided = model_guided.transform(X)

# For the first 20 documents, print the highest-scoring topic next to the
# five vocabulary entries with the largest values in that row of X.
for doc_idx in range(20):
    top_columns = X[doc_idx, :].argsort()[::-1][:5]
    top_terms = ', '.join(word2id[col] for col in top_columns)
    print("top topic: {} Document: {}".format(doc_topic_guided[doc_idx].argmax(), top_terms))

### Comparison of Guided LDA vs. Normal LDA

In [None]:
def load_policy_data(policy):
    """Load the pickled, labelled data for a policy.

    Reads ``./actual-tags/<policy>_df.pickle`` relative to the working directory.

    Args:
        policy: Policy name used as the file-name prefix.

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = './actual-tags/' + policy + '_df.pickle'
    # The context manager closes the handle even if unpickling raises.
    with open(path, 'rb') as handle:
        # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
        return pickle.load(handle)

In [None]:
# Reference labels for the selected policy, plus the highest-scoring topic per
# document from each model.
# NOTE(review): these topic ids are later compared to the labels directly —
# confirm the label encoding lines up with the topic numbering.
actual_labels = load_policy_data(policies[curr_policy_idx])['label']
y_pred_normal = list(doc_topic_normal.argmax(axis=1))
y_pred_guided = list(doc_topic_guided.argmax(axis=1))

In [None]:
# Accuracy of each model's predicted topic ids against the actual labels.
# `accuracy_score` is imported in the notebook's top import cell.
accuracy_normal = accuracy_score(actual_labels, y_pred_normal)
accuracy_guided = accuracy_score(actual_labels, y_pred_guided)
print('Normal: {}, Guided: {}'.format(accuracy_normal, accuracy_guided))

In [None]:
# Echo the name of the policy this run was configured for.
print(policies[curr_policy_idx])