In [1]:
from preprocess.arpit_v2 import *
from preprocess.preprocess_v2 import *
from preprocess.preprocess_v2 import preprocess
import os
import inspect
import time
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import pickle
from models.LDA_multi_level import lda_model_multi_level
from models.LDA_single_level import lda_model_single_level
# Banner printed once every import above has completed.
print('------------------------------------------------------', '- Imports Done', sep='\n')

------------------------------------------------------
- Imports Done


## LDA on the `title` column of gold_standard.csv

In [2]:
# Load the labelled dataset from ./data, relative to the notebook's working directory.
# The path is anchored on os.getcwd() rather than on
# inspect.getfile(inspect.currentframe()): inside a notebook the "file" of a cell
# is a synthetic name (or a temporary file under newer ipykernel versions), so
# its dirname is not guaranteed to be the project folder.
data_file_string = 'gold_standard.csv'
data_file = os.path.join(os.getcwd(), 'data', data_file_string)
gold_standard = pd.read_csv(data_file)

# Quick look at how the rows are distributed over the label and the month columns.
print(gold_standard['class'].value_counts())
print('---')
print(gold_standard['month'].value_counts())

positive    208
Name: class, dtype: int64
---
12    79
10    46
11    43
9     20
8     20
Name: month, dtype: int64


In [3]:
# Preview the first rows to check which columns were loaded.
gold_standard.head()

Unnamed: 0,ids,month,class,title
0,2055,8,positive,News Alert! Maratha Reservation: Bicycles set ...
1,5909,8,positive,"Post-rains, plantation sector tots up losses"
2,2525,8,positive,Can't free Rajiv Gandhi's killers: Indian Govt...
3,3175,8,positive,Organic farming policy on the anvil
4,1164,8,positive,MP: 3 detained after 10 dead cows found in aba...


In [4]:
# Take the 'title' column as the raw documents; this array is what gets passed to preprocess().
raw_docs = gold_standard['title'].values

In [5]:
# Sanity check: display the container type of raw_docs.
type(raw_docs)

numpy.ndarray

In [6]:
# DATA
# raw_docs (taken from gold_standard['title']) is a numpy array of title strings.
# Example element (from the preview of the dataframe):
# 'Organic farming policy on the anvil'
# NOTE(review): the example previously shown here was a disk-space alert text, which
# presumably belonged to the 'short_description.pkl' dataset mentioned below rather
# than to gold_standard.csv - confirm before restoring it.
#
# Disabled alternative loader, kept for reference only (nothing below runs):
# data_file_string = 'short_description.pkl'

# data_file_string = 'gold_standard.csv'
# data_file = os.path.join(os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),'data',data_file_string)
# raw_docs = pickle.load(open(data_file,'rb'))

print('- Imported Data')

- Imported Data


In [7]:
# PRE-PROCESSING
# Ordered table of pre-processing steps. Each entry maps a step name to a list
# whose first element is the on/off flag for that step; some steps carry a
# second element holding that step's options.
# NOTE(review): how preprocess() interprets the flag and the options is not
# visible in this notebook - confirm against preprocess.preprocess_v2.
preprocess_steps_and_order = dict([
    ('make_lowercase', [True]),
    ('punctuation_removal', [True]),
    ('whitespace_removal', [True]),
    ('store_alphanumeric', [False]),
    ('pos_removal_nltk', [True]),
    ('tokenization_nltk', [False]),
    ('lemmatization_tokenization_spacy', [True]),
    ('stopwords_removal_nltk', [True]),
    ('stopwords_removal_spacy', [False]),
    ('make_bigrams_gensim', [
        True,
        {
            'make_bigrams_gensim': True,
            'bigrams_min_count': 10,
            'bigrams_threshold': 10,
        },
    ]),
    ('make_trigrams_gensim', [
        True,
        {
            'make_trigrams_gensim': True,
            'trigrams_min_count': 10,
            'trigrams_threshold': 10,
        },
    ]),
    ('min_max_length_removal', [
        False,
        {
            'min_max_length_removal': False,
            'mmlr_min_len': 3,
            'mmlr_max_len': 50,
            'mmlr_deacc': False,
        },
    ]),
])

# Lookup table from each step name used in preprocess_steps_and_order to the
# callable registered for it. The callables are not defined in this notebook;
# presumably they are provided by the star-imports in the first cell.
preprocess_functions = dict(
    make_lowercase=make_lowercase,
    punctuation_removal=punctuation_removal,
    whitespace_removal=whitespace_removal,
    store_alphanumeric=store_alphanumeric,
    pos_removal_nltk=pos_removal_nltk,
    tokenization_nltk=tokenization_nltk,
    lemmatization_tokenization_spacy=lemmatization_tokenization_spacy,
    stopwords_removal_nltk=stopwords_removal_nltk,
    stopwords_removal_spacy=stopwords_removal_spacy,
    make_bigrams_gensim=make_bigrams_gensim,
    make_trigrams_gensim=make_trigrams_gensim,
    min_max_length_removal=min_max_length_removal,
)

In [8]:
# MODELS
# Registry of the available training entry points, keyed by model name.
models_dict = dict(
    LDA_single_level=lda_model_single_level,
    LDA_multi_level=lda_model_multi_level,
)

In [30]:
# SPECIFICATIONS
# Run settings read by the preprocessing and training cells.
specifications = {
    # "model": "LDA_single_level",  # Can be LDA_multi_level
    "level": 2,
    # Candidate topic counts, one list per level.
    "num_topics_list_level_1": [6],
    "num_topics_list_level_2": [3, 5, 8, 11],
    "num_topics_list_level_3": [1, 2, 3, 4, 5],
    "coherence": "c_v",
    "need_best_topic": True,
    "model_selection_metric": "coherence",  # or 'perplexity'
    "debug": False,
}

In [31]:
# Run the configured pre-processing over the raw titles. The call returns the
# dictionary, corpus and doc_list that the training cell consumes.
print('*****************************************************', '- Starting preprocessing', sep='\n')
dictionary, corpus, doc_list = preprocess(
    raw_docs=raw_docs,
    preprocess_functions=preprocess_functions,
    preprocess_steps_and_order=preprocess_steps_and_order,
    debug=specifications['debug'],
)

*****************************************************
- Starting preprocessing

       ##### Lowercasing Done! Time Taken -  0.00023698806762695312

       ##### Punctuation removed! Time Taken -  0.0010907649993896484

       ##### Whitespace removed! Time Taken -  0.0003609657287597656

       ##### POS Removal Done! Time Taken -  0.15134119987487793

       ##### Lemmatization and Tokenization Done using Spacy! Time Taken -  1.1902620792388916

       ##### Stopwords Removed using NLTK! Time Taken -  0.0011000633239746094

       ##### Bi-Grams made using Gensim! Time Taken -  0.005740165710449219

       ##### Tri-Grams made using Gensim! Time Taken -  0.005240917205810547
~~~ pre-processing done in  1.3567628860473633
 
- Creating dictionary and corpus


In [32]:
# Train the single-level LDA. Data inputs are passed explicitly; the remaining
# keyword arguments are copied one-for-one from `specifications`.
# The returned dict is read by later cells through the keys 'best_topic',
# 'coherence_score', 'perplexity_score', 'best_lda_model', 'corpus' and 'dictionary'.
print('*****************************************************', '- Starting model training', sep='\n')
lda_dict = lda_model_single_level(
    dictionary=dictionary,
    corpus=corpus,
    doc_list=doc_list,
    **{
        spec_key: specifications[spec_key]
        for spec_key in (
            'num_topics_list_level_1',
            'coherence',
            'debug',
            'need_best_topic',
            'model_selection_metric',
        )
    }
)

*****************************************************
- Starting model training
 
Sample data point:  ['news', 'alert', 'maratha', 'reservation', 'bicycle', 'protester', 'pune', '’s', 'kothrud']
 
	### Running LDA for number of topic - 6
	LDA Done for 6 topic! Time Taken is 0.2998011112213135
	Evaluating model for number of topic - 6
Coherence - 0.6140946021346061, Perplexity - -7.271138267923187
---
- Done training model on all topics in 0.6394569873809814 sec!
Done Single-Level LDA


In [33]:
# Show the selected topic count and its scores, then build the pyLDAvis view
# of the selected model.
pyLDAvis.enable_notebook()
for summary_key in ('best_topic', 'coherence_score', 'perplexity_score'):
    print(lda_dict[summary_key])
visualization = pyLDAvis.gensim.prepare(
    lda_dict['best_lda_model'],
    lda_dict['corpus'],
    lda_dict['dictionary'],
)
# pyLDAvis.save_html(visualization, 'lda_10.html')
# Leave the object as the last expression so the notebook renders it.
visualization

6
0.6140946021346061
-7.271138267923187


In [34]:
def get_topics(lda_model):
    """Print every topic of a fitted topic model, labelled with its own topic id.

    Parameters
    ----------
    lda_model : object
        Any model exposing ``print_topics(num_topics=...)`` that returns a
        sequence of ``(topic_id, description)`` pairs (e.g. a gensim LDA model).

    Returns
    -------
    None
        The topics are written to stdout.
    """
    # num_topics=-1 requests all topics. The gensim default of 20 returns only
    # a subset for larger models, in which case a running counter would no
    # longer match the real topic ids.
    topics = lda_model.print_topics(num_topics=-1)
    print("~~~ Topics are:")
    for topic_id, description in topics:
        # Label with the id reported by the model, not the position in the list.
        print('Topic ', topic_id)
        print(description)
        print(' ')

In [35]:
# Print the topics of the model stored under lda_dict['best_lda_model'].
get_topics(lda_dict['best_lda_model'])

~~~ Topics are:
Topic  0
0.050*"mine" + 0.042*"encroachment" + 0.041*"meghalaya" + 0.040*"resident" + 0.039*"miner" + 0.039*"tragedy" + 0.039*"diver" + 0.038*"debt" + 0.037*"effort" + 0.037*"airlift"
 
Topic  1
0.082*"land" + 0.074*"acquisition" + 0.073*"government" + 0.067*"court" + 0.022*"march" + 0.016*"cm" + 0.013*"protest" + 0.012*"gujarat" + 0.010*"tamil" + 0.010*"demand"
 
Topic  2
0.099*"india" + 0.045*"coal" + 0.043*"miner" + 0.041*"leopard" + 0.040*"body" + 0.040*"highway" + 0.039*"monk" + 0.037*"forest" + 0.037*"effort" + 0.037*"serviceman"
 
Topic  3
0.030*"crisis" + 0.019*"water" + 0.017*"dam" + 0.015*"govt" + 0.014*"cauvery" + 0.013*"protest" + 0.012*"farm" + 0.010*"aiadmk" + 0.010*"construction" + 0.009*"session"
 
Topic  4
0.062*"violence" + 0.053*"slaughter" + 0.045*"team" + 0.045*"onion" + 0.018*"cow" + 0.015*"day" + 0.015*"bulandshahr" + 0.012*"man" + 0.010*"death" + 0.009*"bjp"
 
Topic  5
0.076*"maharashtra" + 0.073*"issue" + 0.066*"ride" + 0.021*"centre" + 0.011*"p

In [18]:
# NOTE(review): disabled multi-level variant of the training cell. Every line
# below is commented out, so this cell does nothing when run; it is kept only
# as a reference for how lda_model_multi_level would be called.
# print('*****************************************************')
# print('- Starting model training')
# lda_level_1, lda_level_2 = lda_model_multi_level(
# 					level = specifications['level'],
# 					dictionary = dictionary,
# 					corpus = corpus,
# 					doc_list = doc_list,
# 					coherence = specifications['coherence'],
# 					debug = specifications['debug'],
# 					need_best_topic = specifications['need_best_topic'],
# 					model_selection_metric = specifications['model_selection_metric'],
# 					num_topics_list_level_1 = specifications['num_topics_list_level_1'], 
# 					num_topics_list_level_2 = specifications['num_topics_list_level_2'], 
# 					)