In [1]:
from preprocess.arpit_v2 import *
from preprocess.preprocess_v2 import *
from preprocess.preprocess_v2 import preprocess
import os
import inspect
import time
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import pickle
from models.LDA_multi_level import lda_model_multi_level
from models.LDA_single_level import lda_model_single_level
from newsplease import NewsPlease
print('------------------------------------------------------')
print('- Imports Done')

------------------------------------------------------
- Imports Done


## LDA on gold_standard.csv - Text 

### Extracting texts

In [2]:
# One way to get the articles corresponding to the gold standard:

def load_obj(month, idx, base_dir="data/texts"):
    """Load one pickled article object from ``<base_dir>/<MM>/<IIIII>.pkl``.

    Args:
        month: Month number (int or str); zero-padded on the left to 2 characters.
        idx: Article id (int or str); zero-padded on the left to 5 characters.
        base_dir: Directory that holds the per-month folders. Defaults to
            ``data/texts``, the location that was previously hard-coded.

    Returns:
        Whatever object was stored in the pickle file.

    Raises:
        FileNotFoundError: If there is no pickle for the given month/id.
    """
    month = str(month).zfill(2)
    idx = str(idx).zfill(5)
    path = os.path.join(base_dir, month, "{}.pkl".format(idx))
    # NOTE: unpickling can execute arbitrary code; only read trusted files here.
    with open(path, "rb") as f:
        return pickle.load(f)
    
def load_dict(month, base_dir="data/metadata/matching"):
    """Load the pickled per-month object stored at ``<base_dir>/<MM>.pkl``.

    Args:
        month: Month number (int or str); zero-padded on the left to 2 characters.
        base_dir: Directory that holds the per-month pickle files. Defaults to
            ``data/metadata/matching``, the location that was previously
            hard-coded.

    Returns:
        Whatever object was stored in the pickle file (presumably a dict,
        going by the function name — verify against the data).

    Raises:
        FileNotFoundError: If there is no pickle for the given month.
    """
    month = str(month).zfill(2)
    path = os.path.join(base_dir, "{}.pkl".format(month))
    # NOTE: unpickling can execute arbitrary code; only read trusted files here.
    with open(path, "rb") as f:
        return pickle.load(f)
    
gs = pd.read_csv('data/gold-standard/gold_standard.csv')

# Row position in the CSV -> article object loaded for that row's month/id pair.
gs_articles = {
    row_pos: load_obj(month, article_id)
    for row_pos, (month, article_id) in enumerate(zip(gs['month'], gs['ids']))
}

In [3]:
# Sanity check: number of articles loaded from the gold-standard rows.
print('Total articles: ', len(gs_articles))

Total articles:  208


In [4]:
# Gather the `.text` attribute of every loaded article, in insertion order.
gs_text = np.array([article.text for article in gs_articles.values()])

In [5]:
# Alias under the name that is passed to preprocess() further down.
raw_docs = gs_text

In [6]:
# Display the container type of the documents handed to preprocessing.
type(raw_docs)

numpy.ndarray

In [7]:
# DATA
# Note that raw_docs is a numpy array (built from each article's `.text`).
# NOTE(review): the example element below does not look like a news article; it
# appears to be left over from the 'short_description.pkl' dataset referenced in
# the commented-out loader underneath. Confirm and replace with a real sample.
# Example element is: 
# 'Logical Disk Free Space is low, Description: The disk C: on computer sjcphxstg02.strykercorp.com is running out of disk space. The values that exceeded the thre'
# data_file_string = 'short_description.pkl'

# data_file_string = 'gold_standard.csv'
# data_file = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),'data',data_file_string)
# raw_docs = pickle.load(open(data_file,'rb'))

print('- Imported Data')

- Imported Data


In [8]:
# PRE-PROCESSING
# Step name -> [flag] or [flag, options]; the keys mirror preprocess_functions.
# NOTE(review): the flag looks like an on/off switch and the dict order looks
# like the execution order (going by the variable name) — confirm in preprocess().
preprocess_steps_and_order = {
    'make_lowercase': [True],
    'punctuation_removal': [True],
    'whitespace_removal': [True],
    'store_alphanumeric': [False],
    'pos_removal_nltk': [
        True,
        {
            # Penn Treebank tags, grouped by word class.
            'pos_removal_nltk_list': [
                'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verbs
                'PRP', 'PRP$',                            # pronouns
                'RB', 'RBR', 'RBS', 'RP',                 # adverbs, particle
                'JJ', 'JJR', 'JJS',                       # adjectives
                'CC', 'DT', 'EX', 'IN',                   # conjunction, determiner, existential "there", preposition
                'WDT', 'WP', 'WP$', 'WRB',                # wh-words
            ],
        },
    ],
    'tokenization_nltk': [False],
    'lemmatization_tokenization_spacy': [True],
    'stopwords_removal_nltk': [True],
    'stopwords_removal_spacy': [False],
    'make_bigrams_gensim': [
        True,
        {'make_bigrams_gensim': True, 'bigrams_min_count': 10, 'bigrams_threshold': 10},
    ],
    'make_trigrams_gensim': [
        True,
        {'make_trigrams_gensim': True, 'trigrams_min_count': 10, 'trigrams_threshold': 10},
    ],
    'min_max_length_removal': [
        False,
        {'min_max_length_removal': False, 'mmlr_min_len': 3, 'mmlr_max_len': 50, 'mmlr_deacc': False},
    ],
}

# Lookup table from step name (same keys as preprocess_steps_and_order) to the
# callable that implements the step. The callables are not defined in this
# notebook; presumably they come from the star imports of the preprocess package.
preprocess_functions = {
	'make_lowercase': make_lowercase,
	'punctuation_removal': punctuation_removal,
	'whitespace_removal': whitespace_removal,
	'store_alphanumeric': store_alphanumeric,
	'pos_removal_nltk': pos_removal_nltk,
	'tokenization_nltk': tokenization_nltk,
	'lemmatization_tokenization_spacy': lemmatization_tokenization_spacy,
	'stopwords_removal_nltk': stopwords_removal_nltk,
	'stopwords_removal_spacy': stopwords_removal_spacy,
	'make_bigrams_gensim': make_bigrams_gensim,
	'make_trigrams_gensim': make_trigrams_gensim,
	'min_max_length_removal': min_max_length_removal
	}

In [9]:
# MODELS
# Registry of the imported training entry points, keyed by model name.
# NOTE(review): no visible cell reads models_dict; the training cell calls
# lda_model_single_level directly.
models_dict = {
	'LDA_single_level': lda_model_single_level,
	'LDA_multi_level': lda_model_multi_level,
}

In [10]:
# SPECIFICATIONS
# Run-level settings read by the preprocessing and training cells.
specifications = {
    # 'model':'LDA_single_level', # Can be LDA_multi_level
    'level': 2,
    # Candidate topic counts tried at each level.
    'num_topics_list_level_1': list(range(5, 41, 5)),  # 5, 10, ..., 40
    'num_topics_list_level_2': [3, 5, 8, 11],
    'num_topics_list_level_3': list(range(1, 6)),      # 1 .. 5
    'coherence': 'c_v',
    'need_best_topic': True,
    'model_selection_metric': 'coherence',  # or 'perplexity'
    'debug': True,
    'sample_to_print': 1,
    # 'pos_removal_spacy_list':['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE'],
}

In [11]:
print('*****************************************************')
print('- Starting preprocessing')
# Produces the dictionary, corpus and processed document list that the
# training cell passes on to the LDA model.
dictionary, corpus, doc_list = preprocess(
    raw_docs=raw_docs,
    preprocess_functions=preprocess_functions,
    preprocess_steps_and_order=preprocess_steps_and_order,
    debug=specifications['debug'],
)

*****************************************************
- Starting preprocessing
--
data_sample out of  208
['6.15 am: Hardoi: Shelter home where only 2 out registered 21 women were found during an inspection on June 7 was sealed and manager Arti Agarwal was arrested yesterday. It was found that fake names were registered in the shelter home to gain donations from the administration.'
 'Recent rains in Kerala and the Malnad region of Karnataka have left a trail a destruction in India’s major spice-growing regions.\nCoupled with landslips, the rains have adversely affected plantation crops such as ginger, black pepper, cardamom coffee, nutmeg, rubber and tea. Though government agencies are yet to ascertain the quantum of loss in the plantation sector, farming sources said that the loss incurred by the sector is about ₹2,100 crore in Karnataka and ₹800 crore in Kerala.\nFarmers cultivating short term crop such as ginger were relieved when they received good pre-monsoon showers in April. Bu


       ##### POS Removal Done! Time Taken -  3.592607259750366
--
data_sample out of  208
['6 15 shelter home 2 21 women inspection june 7 manager arti agarwal yesterday names shelter home to donations administration', 'rains kerala region karnataka trail destruction spice regions landslips rains plantation crops ginger pepper cardamom coffee rubber tea government agencies to quantum loss plantation sector farming sources loss sector 100 crore karnataka ₹800 crore kerala farmers term crop ginger pre monsoon showers april ginger farms kerala karnataka submerging floods june hopes rhizomes rot disease rains ginger regions mysuru coorg chikkamagaluru hassan districts karnataka wayanad palakkad pathanamthitta districts kerala june fields lakh to ginger 10 acres land i can quarter expense ” manjunath farmer kushalnagar coorg rains ginger rhizomes 5 000 hectares karnataka 2 500 hectares kerala mohanan president ginger growers’ association impact rains pepper vines parts two states will prod


       ##### Tri-Grams made using Gensim! Time Taken -  0.1571059226989746
~~~ pre-processing done in  9.728078126907349
 
- Creating dictionary and corpus


In [20]:
print('*****************************************************')
print('- Starting model training')
# Train one model per candidate topic count from the level-1 list; the result
# is kept in lda_dict and inspected by the cells that follow.
lda_dict = lda_model_single_level(
    dictionary=dictionary,
    corpus=corpus,
    doc_list=doc_list,
    num_topics_list_level_1=specifications['num_topics_list_level_1'],
    coherence=specifications['coherence'],
    debug=specifications['debug'],
    need_best_topic=specifications['need_best_topic'],
    model_selection_metric=specifications['model_selection_metric'],
)

*****************************************************
- Starting model training
 
Sample data point:  ['shelter', 'woman', 'inspection', 'june', 'manager', 'arti', 'agarwal', 'yesterday', 'name', 'shelter', 'donation', 'administration']
 
	### Running LDA for number of topic - 5
	LDA Done for 5 topic! Time Taken is 1.8078358173370361
	Evaluating model for number of topic - 5
Coherence - 0.3474074434689314, Perplexity - -7.994466043093748
---
	### Running LDA for number of topic - 10
	LDA Done for 10 topic! Time Taken is 1.5824739933013916
	Evaluating model for number of topic - 10
Coherence - 0.46112388375036006, Perplexity - -8.112409870481969
---
	### Running LDA for number of topic - 15
	LDA Done for 15 topic! Time Taken is 1.6985406875610352
	Evaluating model for number of topic - 15
Coherence - 0.40834543502461085, Perplexity - -9.683901367367854
---
	### Running LDA for number of topic - 20
	LDA Done for 20 topic! Time Taken is 1.8599929809570312
	Evaluating model for number of t

In [22]:
pyLDAvis.enable_notebook()
# Report the selected topic count together with its coherence and perplexity.
for summary_key in ('best_topic', 'coherence_score', 'perplexity_score'):
    print(lda_dict[summary_key])
visualization = pyLDAvis.gensim.prepare(
    lda_dict['best_lda_model'],
    lda_dict['corpus'],
    lda_dict['dictionary'],
)
# pyLDAvis.save_html(visualization, 'lda_10.html')
visualization

10
0.46112388375036006
-8.112409870481969


In [34]:
def get_topics(lda_model):
    """Print every topic returned by ``lda_model.print_topics()``.

    Each topic is printed under a ``Topic  <id>`` heading, followed by its
    formatted description string and a spacer line.

    Args:
        lda_model: Object whose ``print_topics()`` returns a sequence of
            ``(topic_id, description)`` pairs (e.g. a gensim LDA model).

    Returns:
        None. Everything is written to stdout.
    """
    topics = lda_model.print_topics()
    print("~~~ Topics are:")
    # Label with the id carried in each pair, not the list position: gensim
    # returns only a subset of topics for models larger than its default
    # limit, so the position is not guaranteed to equal the topic id.
    for topic_id, description in topics:
        print('Topic ', topic_id)
        print(description)
        print(' ')

In [35]:
# List the topics of the model stored under 'best_lda_model' by the training cell.
get_topics(lda_dict['best_lda_model'])

~~~ Topics are:
Topic  0
0.050*"mine" + 0.042*"encroachment" + 0.041*"meghalaya" + 0.040*"resident" + 0.039*"miner" + 0.039*"tragedy" + 0.039*"diver" + 0.038*"debt" + 0.037*"effort" + 0.037*"airlift"
 
Topic  1
0.082*"land" + 0.074*"acquisition" + 0.073*"government" + 0.067*"court" + 0.022*"march" + 0.016*"cm" + 0.013*"protest" + 0.012*"gujarat" + 0.010*"tamil" + 0.010*"demand"
 
Topic  2
0.099*"india" + 0.045*"coal" + 0.043*"miner" + 0.041*"leopard" + 0.040*"body" + 0.040*"highway" + 0.039*"monk" + 0.037*"forest" + 0.037*"effort" + 0.037*"serviceman"
 
Topic  3
0.030*"crisis" + 0.019*"water" + 0.017*"dam" + 0.015*"govt" + 0.014*"cauvery" + 0.013*"protest" + 0.012*"farm" + 0.010*"aiadmk" + 0.010*"construction" + 0.009*"session"
 
Topic  4
0.062*"violence" + 0.053*"slaughter" + 0.045*"team" + 0.045*"onion" + 0.018*"cow" + 0.015*"day" + 0.015*"bulandshahr" + 0.012*"man" + 0.010*"death" + 0.009*"bjp"
 
Topic  5
0.076*"maharashtra" + 0.073*"issue" + 0.066*"ride" + 0.021*"centre" + 0.011*"p

In [18]:
# Disabled alternative: multi-level LDA training via lda_model_multi_level,
# using the level-1 and level-2 topic-count lists from `specifications`.
# print('*****************************************************')
# print('- Starting model training')
# lda_level_1, lda_level_2 = lda_model_multi_level(
# 					level = specifications['level'],
# 					dictionary = dictionary,
# 					corpus = corpus,
# 					doc_list = doc_list,
# 					coherence = specifications['coherence'],
# 					debug = specifications['debug'],
# 					need_best_topic = specifications['need_best_topic'],
# 					model_selection_metric = specifications['model_selection_metric'],
# 					num_topics_list_level_1 = specifications['num_topics_list_level_1'], 
# 					num_topics_list_level_2 = specifications['num_topics_list_level_2'], 
# 					)

In [None]:
'''
Each bubble on the left-hand side plot represents a topic. 
The larger the bubble, the more prevalent that topic is.

A good topic model will have fairly big, non-overlapping bubbles scattered 
throughout the chart instead of being clustered in one quadrant.

A model with too many topics will typically have many overlapping,
small-sized bubbles clustered in one region of the chart.

If you move the cursor over one of the bubbles, the words and bars
on the right-hand side will update. These words are the salient keywords that form the selected topic.
'''