In [1]:
from preprocess.arpit_v2 import *
from preprocess.preprocess_v2 import *
from preprocess.preprocess_v2 import preprocess
import os
import inspect
import time
import pyLDAvis
import pyLDAvis.gensim
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import pickle
from models.LDA_multi_level import lda_model_multi_level
from models.LDA_single_level import lda_model_single_level
# Separator line plus a confirmation message, emitted once every import above has succeeded.
print('------------------------------------------------------')
print('- Imports Done')

------------------------------------------------------
- Imports Done


In [2]:
# DATA
# Note that raw docs is a numpy array.
# Example element is:
# 'Logical Disk Free Space is low, Description: The disk C: on computer sjcphxstg02.strykercorp.com is running out of disk space. The values that exceeded the thre'
# data_file_string = 'short_description.pkl'
data_file_string = 'data.pkl'

# The pickle is looked up in a 'data' folder next to the file that the
# currently executing frame reports.
# NOTE(review): inside a notebook the frame's file name is whatever the kernel
# assigns to the cell, so this may not resolve to the notebook's folder on
# every Jupyter/ipykernel version -- confirm, or switch to an explicit DATA_DIR.
data_file = os.path.join(
    os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))),
    'data',
    data_file_string,
)

# SECURITY: unpickling executes arbitrary code embedded in the file; only load
# pickles that come from a trusted source.
# The context manager guarantees the file handle is closed even if
# unpickling raises (the previous bare open() call leaked the handle).
with open(data_file, 'rb') as data_handle:
    raw_docs = pickle.load(data_handle)
print('- Imported Data')

- Imported Data


In [3]:
# PRE-PROCESSING
# One entry per available pre-processing step, listed in pipeline order.
# Each value is a list: element 0 is the on/off switch for the step and, when
# present, element 1 is a dict of step-specific options.
# NOTE(review): the option dicts repeat the step's own flag under the step
# name; how `preprocess` consumes that duplicate is not visible here -- confirm.
preprocess_steps_and_order = dict([
    ('make_lowercase', [True]),
    ('punctuation_removal', [True]),
    ('whitespace_removal', [True]),
    ('store_alphanumeric', [False]),
    ('pos_removal_nltk', [True]),
    ('tokenization_nltk', [False]),
    ('lemmatization_tokenization_spacy', [True]),
    ('stopwords_removal_nltk', [True]),
    ('stopwords_removal_spacy', [False]),
    ('make_bigrams_gensim', [
        True,
        dict(make_bigrams_gensim=True, bigrams_min_count=10, bigrams_threshold=10),
    ]),
    ('make_trigrams_gensim', [
        True,
        dict(make_trigrams_gensim=True, trigrams_min_count=10, trigrams_threshold=10),
    ]),
    ('min_max_length_removal', [
        False,
        dict(min_max_length_removal=False, mmlr_min_len=3, mmlr_max_len=50, mmlr_deacc=False),
    ]),
])

# Lookup table from step name to the callable implementing it. The keys are
# the same step names used in `preprocess_steps_and_order`; the callables come
# from the star imports at the top of the notebook.
preprocess_functions = dict(
    make_lowercase=make_lowercase,
    punctuation_removal=punctuation_removal,
    whitespace_removal=whitespace_removal,
    store_alphanumeric=store_alphanumeric,
    pos_removal_nltk=pos_removal_nltk,
    tokenization_nltk=tokenization_nltk,
    lemmatization_tokenization_spacy=lemmatization_tokenization_spacy,
    stopwords_removal_nltk=stopwords_removal_nltk,
    stopwords_removal_spacy=stopwords_removal_spacy,
    make_bigrams_gensim=make_bigrams_gensim,
    make_trigrams_gensim=make_trigrams_gensim,
    min_max_length_removal=min_max_length_removal,
)

In [4]:
# MODELS
# Registry of the available topic-model entry points, keyed by model name.
models_dict = dict(
    LDA_single_level=lda_model_single_level,
    LDA_multi_level=lda_model_multi_level,
)

In [5]:
# SPECIFICATIONS
# Run settings read by the training cells further down.
#   level                    -> forwarded to lda_model_multi_level
#   num_topics_list_level_N  -> candidate topic counts per level (level 3 is
#                               not referenced by any cell visible here)
#   coherence / model_selection_metric / need_best_topic / debug
#                            -> forwarded unchanged to the LDA helpers
specifications = dict(
    # 'model':'LDA_single_level', # Can be LDA_multi_level
    level=2,
    num_topics_list_level_1=[5],
    num_topics_list_level_2=[3],
    num_topics_list_level_3=[1, 2, 3, 4, 5],
    coherence='c_v',
    need_best_topic=True,
    model_selection_metric='coherence',  # or 'perplexity'
    debug=False,
)

In [6]:
# Announce the stage, then run the configured pre-processing pipeline on the
# raw documents. The call yields the dictionary, corpus and document list
# consumed by the training cells below.
print('*****************************************************', '- Starting preprocessing', sep='\n')
dictionary, corpus, doc_list = preprocess(
    raw_docs=raw_docs,
    preprocess_functions=preprocess_functions,
    preprocess_steps_and_order=preprocess_steps_and_order,
    debug=False,
)

*****************************************************
- Starting preprocessing

       ##### Lowercasing Done! Time Taken -  0.01073598861694336

       ##### Punctuation removed! Time Taken -  0.11254620552062988

       ##### Whitespace removed! Time Taken -  0.0592961311340332

       ##### POS Removal Done! Time Taken -  26.301908016204834

       ##### Lemmatization and Tokenization Done using Spacy! Time Taken -  48.417868852615356

       ##### Stopwords Removed using NLTK! Time Taken -  0.20621681213378906

       ##### Bi-Grams made using Gensim! Time Taken -  1.130192756652832

       ##### Tri-Grams made using Gensim! Time Taken -  1.0745000839233398
~~~ pre-processing done in  77.33268690109253
 
- Creating dictionary and corpus


In [7]:
# Announce the stage, then fit the single-level LDA model. The pre-processing
# outputs are passed directly; every remaining option is read from
# `specifications` under the key of the same name.
print('*****************************************************', '- Starting model training', sep='\n')
lda_dict = lda_model_single_level(
    dictionary=dictionary,
    corpus=corpus,
    doc_list=doc_list,
    **{
        option: specifications[option]
        for option in (
            'num_topics_list_level_1',
            'coherence',
            'debug',
            'need_best_topic',
            'model_selection_metric',
        )
    },
)

*****************************************************
- Starting model training
 
Sample data point:  ['dataset', 'transaction', 'credit', 'card', 'september', 'cardholder', 'dataset', 'transaction', 'day', 'fraud', 'transaction', 'dataset', 'class', 'fraud', 'account', 'transaction', 'input', 'variable', 'result', 'transformation', 'confidentiality', 'issue', 'feature', 'background', 'datum', 'feature', 'v28', 'component', 'pca', 'feature', 'pca', 'transaction', 'transaction', 'dataset', 'feature', 'amount', 'transaction', 'amount', 'feature', 'dependant', 'cost', 'feature', 'class', 'response', 'variable', 'value', 'case', 'fraud', 'class', 'imbalance', 'ratio', 'accuracy', 'area', 'precision', 'recall', 'curve', 'confusion', 'matrix', 'accuracy', 'classification', 'dataset', 'research', 'collaboration', 'worldline', 'machine', 'group', 'ac', 'libre', 'bruxelle', 'datum_mining', 'fraud', 'detection', 'detail', 'project', 'topic', 'http', 'mlg', 'brufence', 'http', 'mlg', 'dal', 'pozz

In [13]:
# Print the summary values stored for the single-level run, then build the
# pyLDAvis view of the model under 'best_lda_model', save it to 'lda.html'
# and display it inline as the cell's result.
pyLDAvis.enable_notebook()
for summary_key in ('best_topic', 'coherence_score', 'perplexity_score'):
    print(lda_dict[summary_key])
visualization = pyLDAvis.gensim.prepare(
    lda_dict['best_lda_model'],
    lda_dict['corpus'],
    lda_dict['dictionary'],
)
pyLDAvis.save_html(visualization, 'lda.html')
visualization

5
0.47154971390590406
-8.58700882191759


In [7]:
# Announce the stage, then fit the multi-level LDA model. The call returns one
# result object for level 1 and one for level 2. The pre-processing outputs
# are passed directly; every other option is read from `specifications`.
print('*****************************************************', '- Starting model training', sep='\n')
lda_level_1, lda_level_2 = lda_model_multi_level(
    level=specifications['level'],
    dictionary=dictionary,
    corpus=corpus,
    doc_list=doc_list,
    **{
        option: specifications[option]
        for option in (
            'coherence',
            'debug',
            'need_best_topic',
            'model_selection_metric',
            'num_topics_list_level_1',
            'num_topics_list_level_2',
        )
    },
)

*****************************************************
- Starting model training
 
Sample data point:  ['dataset', 'transaction', 'credit', 'card', 'september', 'cardholder', 'dataset', 'transaction', 'day', 'fraud', 'transaction', 'dataset', 'class', 'fraud', 'account', 'transaction', 'input', 'variable', 'result', 'transformation', 'confidentiality', 'issue', 'feature', 'background', 'datum', 'feature', 'v28', 'component', 'pca', 'feature', 'pca', 'transaction', 'transaction', 'dataset', 'feature', 'amount', 'transaction', 'amount', 'feature', 'dependant', 'cost', 'feature', 'class', 'response', 'variable', 'value', 'case', 'fraud', 'class', 'imbalance', 'ratio', 'accuracy', 'area', 'precision', 'recall', 'curve', 'confusion', 'matrix', 'accuracy', 'classification', 'dataset', 'research', 'collaboration', 'worldline', 'machine', 'group', 'ac', 'libre', 'bruxelle', 'datum_mining', 'fraud', 'detection', 'detail', 'project', 'topic', 'http', 'mlg', 'brufence', 'http', 'mlg', 'dal', 'pozz

In [10]:
# Show which result fields the level-1 output carries.
lda_level_1.keys()

dict_keys(['best_lda_model', 'best_topic', 'coherence_score', 'perplexity_score', 'corpus', 'dictionary', 'doc_list', 'all_models'])

In [23]:
# Display the topics (weighted top words) of the model stored under 'best_lda_model' for level 1.
lda_level_1['best_lda_model'].print_topics()

[(0,
  '0.011*"state" + 0.009*"country" + 0.007*"city" + 0.007*"type" + 0.006*"price" + 0.006*"value" + 0.006*"service" + 0.006*"code" + 0.006*"location" + 0.005*"variable"'),
 (1,
  '0.048*"serverruntime_i0_lphost06_type" + 0.022*"com_bea_name_serverruntime" + 0.014*"i0_lphost06_type_jdbcdatasourceruntime" + 0.013*"jdbcdatasourceruntime" + 0.013*"jdbcconnectionpoolruntime" + 0.010*"jdbcdatasourceruntime_com_bea_name" + 0.010*"wine" + 0.008*"review" + 0.008*"road" + 0.008*"i0_lphost06_type_jdbcconnectionpoolruntime"'),
 (2,
  '0.014*"player" + 0.013*"movie" + 0.011*"team" + 0.009*"game" + 0.009*"student" + 0.008*"baseball" + 0.007*"statistic" + 0.007*"score" + 0.007*"school" + 0.007*"result"'),
 (3,
  '0.014*"university" + 0.011*"image" + 0.007*"state_university" + 0.004*"kumar" + 0.003*"matrix" + 0.003*"de" + 0.003*"cell" + 0.002*"defense" + 0.002*"damage" + 0.002*"pokemon"'),
 (4,
  '0.009*"user" + 0.007*"text" + 0.007*"language" + 0.007*"word" + 0.006*"corpus" + 0.006*"attribution_c

In [11]:
# Show the keys of the level-2 output (integers in the recorded run).
# NOTE(review): presumably one entry per level-1 topic -- confirm against lda_model_multi_level.
lda_level_2.keys()

dict_keys([0, 1, 2, 3, 4])

In [24]:
# Display the topics of the model stored under 'best_lda_model' for level-2 entry 0.
lda_level_2[0]['best_lda_model'].print_topics()

[(0,
  '0.010*"price" + 0.007*"company" + 0.006*"crime" + 0.005*"state" + 0.005*"type" + 0.005*"value" + 0.005*"vehicle" + 0.005*"record" + 0.005*"city" + 0.004*"location"'),
 (1,
  '0.011*"state" + 0.011*"country" + 0.007*"survey" + 0.006*"health" + 0.006*"population" + 0.006*"people" + 0.005*"school" + 0.005*"rate" + 0.005*"education" + 0.004*"government"'),
 (2,
  '0.008*"city" + 0.007*"csv" + 0.005*"customer" + 0.004*"station" + 0.004*"location" + 0.004*"day" + 0.004*"system" + 0.004*"-PRON-" + 0.004*"service" + 0.004*"value"')]

In [25]:
# Display the topics of the model stored under 'best_lda_model' for level-2 entry 1.
lda_level_2[1]['best_lda_model'].print_topics()

[(0,
  '0.035*"serverruntime_i0_lphost06_type" + 0.016*"com_bea_name_serverruntime" + 0.011*"i0_lphost06_type_jdbcdatasourceruntime" + 0.010*"jdbcdatasourceruntime" + 0.010*"jdbcconnectionpoolruntime" + 0.008*"jdbcdatasourceruntime_com_bea_name" + 0.006*"i0_lphost06_type_jdbcconnectionpoolruntime" + 0.005*"jdbcconnectionpoolruntime_com_bea_name" + 0.003*"type_server_com_bea" + 0.003*"road"'),
 (1,
  '0.007*"wine" + 0.004*"gb" + 0.003*"aircraft" + 0.003*"output" + 0.003*"ru" + 0.003*"review" + 0.002*"plane" + 0.002*"crash" + 0.002*"quality" + 0.002*"de"'),
 (2,
  '0.005*"value" + 0.004*"airport" + 0.004*"state" + 0.003*"stop" + 0.003*"da" + 0.003*"airline" + 0.003*"🇧_🇷_🇬_🇧" + 0.003*"congressperson" + 0.002*"group" + 0.002*"bird"')]

In [26]:
# Display the topics of the model stored under 'best_lda_model' for level-2 entry 2.
lda_level_2[2]['best_lda_model'].print_topics()

[(0,
  '0.008*"movie" + 0.006*"horse" + 0.004*"dt" + 0.003*"place" + 0.002*"simadi" + 0.002*"dolartoday" + 0.002*"imdb" + 0.002*"employer" + 0.002*"film" + 0.002*"box_office"'),
 (1,
  '0.006*"model" + 0.006*"network" + 0.006*"pre_model_pre_model" + 0.005*"layer" + 0.005*"feature" + 0.004*"architecture" + 0.004*"accuracy" + 0.004*"representation" + 0.003*"depth" + 0.003*"imagenet"'),
 (2,
  '0.020*"player" + 0.014*"team" + 0.013*"game" + 0.009*"match" + 0.007*"point" + 0.005*"integer" + 0.005*"com" + 0.005*"movie" + 0.005*"csv" + 0.005*"result"')]