In [1]:
import pandas as pd
import pickle

from gensim import corpora, models, matutils
from gensim.models.coherencemodel import CoherenceModel
import scipy.sparse

# `os` and `sys` are imported here because nothing earlier in this cell brings
# them into scope; without them the lines below raise NameError on a fresh kernel.
import os
import sys

# Put the directory two levels above the current working directory on the import
# path (presumably the repository root) so the `skills` package imported below
# can be resolved. The membership test keeps re-runs from adding duplicates.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from skills.data_preprocess.data.inputs import *
from skills.data_preprocess.data_transform import get_syl_list_2018, get_syl_list_2014, run_data_transform_pipeline

import logging

import pyLDAvis
import pyLDAvis.gensim_models

import warnings
warnings.filterwarnings('ignore')

LDA for 2018 Syllabus Data

In [2]:
# Fetch the 2018 syllabus entries through the project helper imported from
# skills.data_preprocess.data_transform. The return type is not visible here;
# the value is handed straight to pd.DataFrame afterwards.
syl_18 = get_syl_list_2018()

In [3]:
# Wrap the 2018 syllabus entries in a DataFrame and label its (only) column
# 'description'; a frame with any other number of columns raises here.
syl_18_df = pd.DataFrame(syl_18).set_axis(['description'], axis=1)

In [4]:
# Run the project's transform pipeline on the 2018 descriptions. Judging by later
# use, the first result is a document-term DataFrame (it is previewed with .head()
# and transposed) and the second is a fitted vectorizer exposing `vocabulary_`.
# NOTE(review): the meaning of the positional `False` flag is not visible here — confirm.
data_dtm_18, cv_18 = run_data_transform_pipeline(syl_18_df, False)

In [5]:
# Preview the first rows of the 2018 document-term matrix (one column per term).
data_dtm_18.head()

Unnamed: 0,Unnamed: 1,introduction,access,access cell,access content,access control,access introduction,access links,access packages,access wireless,...,yacc,yacc intermediate,yacc top,yarn,yarn mapreduce,z,z buffer,z nature,zone,zone indexes
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


EXPLORATORY DATA ANALYSIS

In [6]:
# Flip the document-term matrix so every column is one document, then record
# each document's 30 highest-count terms as (term, count) pairs.
data_18 = data_dtm_18.T
ranked_by_doc = (
    (doc_id, data_18[doc_id].sort_values(ascending=False).iloc[:30])
    for doc_id in data_18.columns
)
top_dict_18 = {
    doc_id: list(zip(ranked.index, ranked.values))
    for doc_id, ranked in ranked_by_doc
}
print(top_dict_18)

{0: [('lattices', 4), ('boolean', 3), ('groups', 3), ('graphs', 3), ('predicate calculus', 2), ('propositional predicate', 2), ('calculus', 2), ('predicate', 2), ('boolean algebra', 2), ('lattices boolean', 2), ('permutations', 2), ('theory', 2), ('subgroups', 2), ('propositional', 2), ('properties', 2), ('trees', 2), ('graph', 2), ('basic', 2), ('algebra', 2), ('ordering permutations', 1), ('hamiltonian graphs', 1), ('hamiltonian', 1), ('function principle', 1), ('generating', 1), ('groups subgroups', 1), ('groups monoids', 1), ('generating function', 1), ('graph theory', 1), ('groups ', 1), ('graphs algorithms', 1)], 1: [('state', 4), ('logic', 3), ('circuits', 3), ('design', 2), ('adder', 2), ('gates', 2), ('sequential circuits', 2), ('sequential', 2), ('registers', 2), ('counters', 2), ('decomposition', 1), ('decoder encoder', 1), ('decoder', 1), ('ring', 1), ('factoring functional', 1), ('product sums', 1), ('encoder', 1), ('products product', 1), ('products', 1), ('encoder code',

In [7]:
# For every 2018 document, print its id, its top terms on one comma-separated
# line, and a separator.
for doc_id, ranked_terms in top_dict_18.items():
    term_line = ', '.join(term for term, freq in ranked_terms[:30])
    print(doc_id, term_line, '---', sep='\n')

0
lattices, boolean, groups, graphs, predicate calculus, propositional predicate, calculus, predicate, boolean algebra, lattices boolean, permutations, theory, subgroups, propositional, properties, trees, graph, basic, algebra, ordering permutations, hamiltonian graphs, hamiltonian, function principle, generating, groups subgroups, groups monoids, generating function, graph theory, groups , graphs algorithms
---
1
state, logic, circuits, design, adder, gates, sequential circuits, sequential, registers, counters, decomposition, decoder encoder, decoder, ring, factoring functional, product sums, encoder, products product, products, encoder code, nmos, ring johnson, ripple, ripple counters, product, unsigned bcd, unsigned, processor bit, processor, forms k
---
2
memory, representation, cache, point, hardware, cache coherence, protocol, point representation, write, organization, data dependencies, interrupts, micro, control, multiplier, memories, dependencies, programmed, micro programmed,

In [8]:
# Build the gensim inputs for the 2018 model from the document-term matrix.
# The matrix is transposed to term x document because Sparse2Corpus treats each
# column as one document by default.
tdm_18 = data_dtm_18.transpose()

sparse_counts18 = scipy.sparse.csr_matrix(tdm_18)
corpus18 = matutils.Sparse2Corpus(sparse_counts18)

# Invert the vectorizer vocabulary (term -> column index) into the
# id -> term mapping that LdaModel takes as `id2word`.
id2word18 = {idx: term for term, idx in cv_18.vocabulary_.items()}

# NOTE(review): texts18 is a single pseudo-document that lists every vocabulary
# term once; it is not the tokenized syllabi. The Dictionary built from it is
# later paired with corpus18, which presumes it assigns the same ids as
# cv_18.vocabulary_ — TODO confirm, and consider building texts from the real
# documents before relying on coherence scores.
texts18 = [list(cv_18.vocabulary_.keys())]
dictionary18 = corpora.Dictionary(texts18)

In [None]:
# Send INFO-level log records to LdaSyllabus18.log, formatted as time:level:message.
# force=True removes handlers already attached to the root logger, so re-running
# this cell (or switching log files later) actually takes effect.
logging.basicConfig(filename='LdaSyllabus18.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO, force=True)

In [10]:
# Fit a 30-topic LDA model on the 2018 corpus; random_state is fixed so that
# reruns use the same seed.
lda7 = models.LdaModel(
    corpus=corpus18,
    id2word=id2word18,
    num_topics=30,
    passes=60,
    iterations=9000,
    chunksize=20000,
    random_state=42,
)
# Display all 30 topics, each with its 30 highest-weighted terms.
lda7.print_topics(num_topics=30, num_words=30)

[(0,
  '0.015*"system" + 0.010*"control" + 0.010*"process" + 0.008*"operating system" + 0.008*"operating" + 0.007*"multicore" + 0.007*"organization" + 0.007*"planning" + 0.007*"scheduling" + 0.005*"nature" + 0.005*"types" + 0.005*"" + 0.005*"structure" + 0.005*"memory" + 0.003*"managerial" + 0.003*"span" + 0.003*"nature purpose" + 0.003*"purpose" + 0.003*"leadership" + 0.003*"planning process" + 0.003*"authority" + 0.003*"development" + 0.003*"business" + 0.003*"theory" + 0.003*"basic" + 0.003*"computers" + 0.003*"performance" + 0.003*"core" + 0.003*"supercomputers" + 0.003*"software"'),
 (1,
  '0.000*"peephole optimization" + 0.000*"penalties regularization" + 0.000*"penalties" + 0.000*"perceptron types" + 0.000*"partitioning" + 0.000*"performance increase" + 0.000*"performance metrics" + 0.000*"performance mpi" + 0.000*"performance software" + 0.000*"periodic" + 0.000*"perceptron" + 0.000*"performance" + 0.000*"partitioning markov" + 0.000*"peephole" + 0.000*"pca ica" + 0.000*"pca" +

In [11]:
# Score the 2018 model with the c_v coherence measure and print the value.
# NOTE(review): texts18 holds one pseudo-document listing each vocabulary term
# once rather than the tokenized syllabi, so the co-occurrence statistics behind
# this score may not reflect the real corpus — confirm before comparing models.
coherencemodel = CoherenceModel(model=lda7, texts=texts18, dictionary=dictionary18, coherence='c_v')
print(coherencemodel.get_coherence())

0.5630152591433188


In [12]:
# Persist the trained 2018 model. The context manager closes the file even when
# pickling raises, which the previous open()/close() pair did not guarantee.
# The 'models/' directory must already exist relative to the working directory.
with open('models/lda_final_18.pickle', 'wb') as f:
    pickle.dump(lda7, f)

In [13]:
# Reload the saved 2018 model; the context manager closes the file even when
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that this project wrote itself.
with open('models/lda_final_18.pickle', 'rb') as f:
    lda_final_18 = pickle.load(f)

TOPIC VISUALIZATION 

In [15]:
# Topic Visualization for syllabus 18 model
# enable_notebook() switches pyLDAvis to inline rendering; prepare() combines the
# reloaded model with the 2018 corpus and dictionary, and the bare `viz18` on the
# last line makes the result the cell's displayed output.
pyLDAvis.enable_notebook()
viz18 = pyLDAvis.gensim_models.prepare(lda_final_18, corpus18, dictionary18)
viz18


LDA for 2014 Syllabus Data

In [16]:
# Fetch the 2014 syllabus entries through the project helper imported from
# skills.data_preprocess.data_transform. The return type is not visible here;
# the value is handed straight to pd.DataFrame afterwards.
syl_14 = get_syl_list_2014()

In [38]:
# Wrap the 2014 syllabus entries in a DataFrame and label its (only) column
# 'description'; a frame with any other number of columns raises here.
syl_14_df = pd.DataFrame(syl_14).set_axis(['description'], axis=1)

In [39]:
# Run the project's transform pipeline on the 2014 descriptions. Judging by later
# use, the first result is a document-term DataFrame (it is transposed below) and
# the second is a fitted vectorizer exposing `vocabulary_`.
# NOTE(review): the meaning of the positional `False` flag is not visible here — confirm.
data_dtm_14, cv_14 = run_data_transform_pipeline(syl_14_df, False)

In [41]:
# Flip the document-term matrix so every column is one document, then record
# each document's 30 highest-count terms as (term, count) pairs.
data_14 = data_dtm_14.T
ranked_by_doc = (
    (doc_id, data_14[doc_id].sort_values(ascending=False).iloc[:30])
    for doc_id in data_14.columns
)
top_dict_14 = {
    doc_id: list(zip(ranked.index, ranked.values))
    for doc_id, ranked in ranked_by_doc
}
print(top_dict_14)

{0: [('lattices', 4), ('groups', 3), ('theory', 3), ('graphs', 3), ('boolean', 3), ('boolean algebra', 2), ('trees', 2), ('predicate calculus', 2), ('predicate', 2), ('permutations', 2), ('basic', 2), ('graph', 2), ('ordering', 2), ('calculus', 2), ('properties', 2), ('lattices boolean', 2), ('propositional', 2), ('propositional predicate', 2), ('subgroups', 2), ('algebra', 2), ('definitions', 1), ('calculus well', 1), ('eulerian', 1), ('calculus elementary', 1), ('inference', 1), ('degree regular', 1), ('definitions degree', 1), ('degree', 1), ('permutations combinations', 1), ('permutations lexicographical', 1)], 1: [('memory', 5), ('numbers', 4), ('basic', 3), ('operations', 3), ('multiplication', 3), ('introduction', 2), ('introduction basic', 2), ('performance', 2), ('concepts', 2), ('signed', 2), ('fast', 2), ('design', 2), ('modes', 1), ('memory operations', 1), ('hierarchy', 1), ('memory locations', 1), ('hierarchy accessing', 1), ('virtual memory', 1), ('operations characters'

In [42]:
# For every 2014 document, print its id, its top terms on one comma-separated
# line, and a separator.
for doc_id, ranked_terms in top_dict_14.items():
    term_line = ', '.join(term for term, freq in ranked_terms[:30])
    print(doc_id, term_line, '---', sep='\n')

0
lattices, groups, theory, graphs, boolean, boolean algebra, trees, predicate calculus, predicate, permutations, basic, graph, ordering, calculus, properties, lattices boolean, propositional, propositional predicate, subgroups, algebra, definitions, calculus well, eulerian, calculus elementary, inference, degree regular, definitions degree, degree, permutations combinations, permutations lexicographical
---
1
memory, numbers, basic, operations, multiplication, introduction, introduction basic, performance, concepts, signed, fast, design, modes, memory operations, hierarchy, memory locations, hierarchy accessing, virtual memory, operations characters, modes addition, memory hierarchy, operations addressing, operations introduction, memory common, memory access, , measuring, functional, arithmetic, functional units
---
2
circuits, adder, logic, gates, arithmetic, verilog, sequential, synchronous sequential, logic gates, registers, synchronous, combinational, combinational circuits, fan,

In [43]:
# Build the gensim inputs for the 2014 model from the document-term matrix.
# The matrix is transposed to term x document because Sparse2Corpus treats each
# column as one document by default.
tdm_14 = data_dtm_14.transpose()

sparse_counts14 = scipy.sparse.csr_matrix(tdm_14)
corpus14 = matutils.Sparse2Corpus(sparse_counts14)

# Invert the vectorizer vocabulary (term -> column index) into the
# id -> term mapping that LdaModel takes as `id2word`.
id2word14 = {idx: term for term, idx in cv_14.vocabulary_.items()}

# NOTE(review): texts14 is a single pseudo-document that lists every vocabulary
# term once; it is not the tokenized syllabi. The Dictionary built from it is
# later paired with corpus14, which presumes it assigns the same ids as
# cv_14.vocabulary_ — TODO confirm, and consider building texts from the real
# documents before relying on coherence scores.
texts14 = [list(cv_14.vocabulary_.keys())]
dictionary14 = corpora.Dictionary(texts14)

In [48]:
# Send INFO-level log records to LdaSyllabus14.log, formatted as time:level:message.
# force=True removes the handlers already attached to the root logger, which is
# what lets this call replace the logging configuration set up earlier in the notebook.
logging.basicConfig(filename='LdaSyllabus14.log',
                    format="%(asctime)s:%(levelname)s:%(message)s",
                    level=logging.INFO, force=True)

In [50]:
# Fit a 30-topic LDA model on the 2014 corpus; random_state is fixed so that
# reruns use the same seed.
lda1 = models.LdaModel(
    corpus=corpus14,
    id2word=id2word14,
    num_topics=30,
    passes=60,
    iterations=9000,
    chunksize=20000,
    random_state=42,
)
# Display all 30 topics, each with its 30 highest-weighted terms.
lda1.print_topics(num_topics=30, num_words=30)

[(0,
  '0.019*"multimedia" + 0.016*"synchronization" + 0.016*"compression" + 0.006*"video" + 0.006*"principles" + 0.006*"ip" + 0.006*"demand" + 0.003*"data" + 0.003*"applications" + 0.003*"quality" + 0.003*"" + 0.003*"model" + 0.003*"dimensions" + 0.003*"management" + 0.003*"schemes video" + 0.003*"specification methods" + 0.003*"qos ip" + 0.003*"image" + 0.003*"model synchronization" + 0.003*"issues synchronization" + 0.003*"environment multimedia" + 0.003*"protocols prioritized" + 0.003*"application networking" + 0.003*"specification" + 0.003*"data transmission" + 0.003*"environment" + 0.003*"schemes" + 0.003*"applications application" + 0.003*"protocols" + 0.003*"application"'),
 (1,
  '0.011*"project" + 0.011*"students" + 0.009*"marketing" + 0.009*"market" + 0.007*"course" + 0.005*"research" + 0.005*"gained" + 0.005*"opportunity" + 0.005*"projects" + 0.005*"" + 0.005*"new" + 0.005*"work" + 0.005*"expected" + 0.005*"value" + 0.005*"knowledge" + 0.005*"customer" + 0.005*"implementati

In [51]:
# Score the 2014 model with the c_v coherence measure and print the value.
# NOTE(review): texts14 holds one pseudo-document listing each vocabulary term
# once rather than the tokenized syllabi, so the co-occurrence statistics behind
# this score may not reflect the real corpus — confirm before comparing models.
coherencemodel = CoherenceModel(model=lda1, texts=texts14, dictionary=dictionary14, coherence='c_v')
print(coherencemodel.get_coherence())

0.5818785665361321


In [52]:
# Persist the trained 2014 model. The context manager closes the file even when
# pickling raises, which the previous open()/close() pair did not guarantee.
# The 'models/' directory must already exist relative to the working directory.
with open('models/lda_final_14.pickle', 'wb') as f:
    pickle.dump(lda1, f)

In [53]:
# Reload the saved 2014 model; the context manager closes the file even when
# unpickling raises.
# NOTE: pickle.load can execute arbitrary code from the file — only load
# pickles that this project wrote itself.
with open('models/lda_final_14.pickle', 'rb') as f:
    lda_final_14 = pickle.load(f)

In [54]:
# Topic Visualization for syllabus 14 model
# enable_notebook() switches pyLDAvis to inline rendering; prepare() combines the
# reloaded model with the 2014 corpus and dictionary, and the bare `viz14` on the
# last line makes the result the cell's displayed output.
pyLDAvis.enable_notebook()
viz14 = pyLDAvis.gensim_models.prepare(lda_final_14, corpus14, dictionary14)
viz14