In [1]:
import pandas as pd

In [2]:
#the following two lines are required for me to use gensim (maybe because I'm using anaconda?)
#if you don't have problems using gensim as-is then ignore them
import smart_open
smart_open.open = smart_open.smart_open

In [3]:
from gensim.corpora import Dictionary
from gensim.models import LdaModel

In [4]:
#Read in data by subject dataframe from pickle file
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\subject_df.pkl"
subject_df = pd.read_pickle(path)

In [5]:
#Changed this option so that I could see the full descriptions
#Otherwise they're abbreviated, since they're so long
pd.set_option('display.max_colwidth', None)

In [6]:
subject_df.head(2)

Unnamed: 0,Abbreviation,Subject,Description
0,ACC,Accounting,"[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,ADC,African American and Diaspora Studies,"[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [7]:
#convert the description column into a giant list of lists of words by subject
words = subject_df["Description"].tolist()
words

[['accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'accounting',
  'activity',
  'activity',
  'activity',
  'advanced',
  'allocation',
  'also',
  'analysis',
  'analyzing',
  'application',
  'applied',
  'applied',
  'approved',
  'area',
  'area',
  'assessment',
  'asset',
  'asset',
  'asset',
  'assignment',
  'assurance',
  'assurance',
  'assurance',
  'attitude',
  'audit',
  'auditing',
  'auditing',
  'automated',
  'balance',
  'balance',
  'balance',
  'based',
  'based',
  'based',
  'begin',
  'budget',
  'budgeting',
  'budgeting',
  'business',
  'business',
  'business',
  'business',
  'business',
  'business',
  'capital',
  'capital',
  'cash',
  'cash',
  'change',
  'collection',
  'company',
  

In [8]:
#create a flattened version of the list for making dataframe columns and sort it
all_words = [item for sublist in words for item in sublist]
all_words = sorted(all_words)
all_words

['a.d.',
 'a.d.',
 'a.d.',
 'abating',
 'abbey',
 'abduction',
 'aberration',
 'aberration',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'ability',
 'abiotic',
 'abiotic',
 'abjection',
 'able',
 'abnormal',
 'abolition',
 'abolition',
 'abolition',
 'abolitionism',
 'abolitionist',
 'abortion',
 'abortion',
 'abortion',
 'abortion',
 'abortion',
 'abound',
 'abound',
 'abraham',
 'abraham',
 'abraham',
 'abrahamic',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'abroad',
 'absolutism',
 'absolutism',
 'absorption',
 'absorption',
 'abstract',
 'abstract',
 'abstract',
 'abstract',
 'absurd',
 'absurd',
 'abuse',
 'abuse',
 'abuse',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'academic',
 'acad

### used this as reference: https://www.tutorialspoint.com/gensim/gensim_creating_a_bag_of_words_corpus.htm
### also used the tutorial from the gensim docs: https://radimrehurek.com/gensim/auto_examples/tutorials/run_lda.html

In [9]:
#create a gensim dictionary of the words
dct = Dictionary(words)

In [10]:
#each unique word/token has been assigned an id number
dct.token2id.items()

dict_items([('accounting', 0), ('activity', 1), ('advanced', 2), ('allocation', 3), ('also', 4), ('analysis', 5), ('analyzing', 6), ('application', 7), ('applied', 8), ('approved', 9), ('area', 10), ('assessment', 11), ('asset', 12), ('assignment', 13), ('assurance', 14), ('attitude', 15), ('audit', 16), ('auditing', 17), ('automated', 18), ('balance', 19), ('based', 20), ('begin', 21), ('budget', 22), ('budgeting', 23), ('business', 24), ('capital', 25), ('cash', 26), ('change', 27), ('collection', 28), ('company', 29), ('compensation', 30), ('compliance', 31), ('component', 32), ('concept', 33), ('conceptual', 34), ('context', 35), ('control', 36), ('corporate', 37), ('corporation', 38), ('cost', 39), ('costing', 40), ('country', 41), ('cover', 42), ('coverage', 43), ('covered', 44), ('criminal', 45), ('current', 46), ('cycle', 47), ('data', 48), ('decision', 49), ('department', 50), ('depth', 51), ('derivative', 52), ('design', 53), ('designated', 54), ('determination', 55), ('devel

In [11]:
#the dfs method shows the number of subjects that each word appears in, by word id number
dct.dfs

{0: 4,
 1: 30,
 2: 22,
 3: 4,
 4: 34,
 5: 39,
 6: 16,
 7: 34,
 8: 20,
 9: 14,
 10: 29,
 11: 23,
 12: 2,
 13: 18,
 14: 1,
 15: 6,
 16: 1,
 17: 1,
 18: 1,
 19: 7,
 20: 38,
 21: 8,
 22: 4,
 23: 3,
 24: 15,
 25: 5,
 26: 1,
 27: 38,
 28: 10,
 29: 8,
 30: 1,
 31: 1,
 32: 18,
 33: 36,
 34: 12,
 35: 33,
 36: 14,
 37: 4,
 38: 3,
 39: 4,
 40: 1,
 41: 13,
 42: 13,
 43: 7,
 44: 14,
 45: 6,
 46: 37,
 47: 7,
 48: 26,
 49: 15,
 50: 12,
 51: 24,
 52: 2,
 53: 26,
 54: 1,
 55: 4,
 56: 39,
 57: 49,
 58: 34,
 59: 43,
 60: 2,
 61: 8,
 62: 5,
 63: 1,
 64: 25,
 65: 3,
 66: 2,
 67: 17,
 68: 14,
 69: 32,
 70: 19,
 71: 45,
 72: 27,
 73: 7,
 74: 41,
 75: 5,
 76: 1,
 77: 6,
 78: 45,
 79: 26,
 80: 10,
 81: 1,
 82: 27,
 83: 1,
 84: 7,
 85: 4,
 86: 26,
 87: 20,
 88: 4,
 89: 10,
 90: 4,
 91: 1,
 92: 12,
 93: 4,
 94: 17,
 95: 32,
 96: 8,
 97: 16,
 98: 12,
 99: 23,
 100: 5,
 101: 6,
 102: 14,
 103: 23,
 104: 24,
 105: 50,
 106: 1,
 107: 2,
 108: 10,
 109: 40,
 110: 1,
 111: 11,
 112: 24,
 113: 18,
 114: 27,
 115: 19,
 

In [12]:
#create a bag of words corpus
corpus = [dct.doc2bow(subject, allow_update=True) for subject in words]

In [13]:
#in the bow corpus, each subject now has a list of the id numbers for the words it contains and their frequency
#here is an example (i picked this row since it's short)
corpus[1]

[(71, 1),
 (211, 1),
 (212, 1),
 (213, 1),
 (214, 1),
 (215, 1),
 (216, 1),
 (217, 1),
 (218, 1)]

In [14]:
#and here are the words associated with the id nos above
words[1]

['african',
 'american',
 'capstone',
 'culture',
 'diaspora',
 'experience',
 'intedisciplinary',
 'minor',
 'varied']

In [15]:
corpus[0]

[(0, 21),
 (1, 3),
 (2, 1),
 (3, 1),
 (4, 1),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 2),
 (11, 1),
 (12, 3),
 (13, 1),
 (14, 3),
 (15, 1),
 (16, 1),
 (17, 2),
 (18, 1),
 (19, 3),
 (20, 3),
 (21, 1),
 (22, 1),
 (23, 2),
 (24, 6),
 (25, 2),
 (26, 2),
 (27, 1),
 (28, 1),
 (29, 1),
 (30, 1),
 (31, 1),
 (32, 2),
 (33, 2),
 (34, 1),
 (35, 1),
 (36, 5),
 (37, 2),
 (38, 2),
 (39, 4),
 (40, 3),
 (41, 1),
 (42, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (46, 2),
 (47, 1),
 (48, 1),
 (49, 9),
 (50, 1),
 (51, 3),
 (52, 1),
 (53, 1),
 (54, 1),
 (55, 1),
 (56, 1),
 (57, 1),
 (58, 1),
 (59, 1),
 (60, 1),
 (61, 1),
 (62, 5),
 (63, 1),
 (64, 1),
 (65, 1),
 (66, 1),
 (67, 2),
 (68, 1),
 (69, 1),
 (70, 2),
 (71, 1),
 (72, 1),
 (73, 1),
 (74, 1),
 (75, 7),
 (76, 1),
 (77, 2),
 (78, 3),
 (79, 1),
 (80, 2),
 (81, 1),
 (82, 1),
 (83, 1),
 (84, 1),
 (85, 1),
 (86, 1),
 (87, 1),
 (88, 1),
 (89, 2),
 (90, 1),
 (91, 1),
 (92, 1),
 (93, 5),
 (94, 1),
 (95, 2),
 (96, 1),
 (97, 3),
 (98, 1),
 (99, 1),
 (100, 2)

In [16]:
words[0]

['accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'accounting',
 'activity',
 'activity',
 'activity',
 'advanced',
 'allocation',
 'also',
 'analysis',
 'analyzing',
 'application',
 'applied',
 'applied',
 'approved',
 'area',
 'area',
 'assessment',
 'asset',
 'asset',
 'asset',
 'assignment',
 'assurance',
 'assurance',
 'assurance',
 'attitude',
 'audit',
 'auditing',
 'auditing',
 'automated',
 'balance',
 'balance',
 'balance',
 'based',
 'based',
 'based',
 'begin',
 'budget',
 'budgeting',
 'budgeting',
 'business',
 'business',
 'business',
 'business',
 'business',
 'business',
 'capital',
 'capital',
 'cash',
 'cash',
 'change',
 'collection',
 'company',
 'compensation',
 'compliance',
 'component',
 'component',
 'concept',
 

In [17]:
#print number of words/tokens in dictionary and number of documents (subjects)
print('Number of unique tokens: %d' % len(dct))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 6036
Number of documents: 69


In [18]:
#create new df from subject_df to put word matrix in
word_matrix = subject_df.copy()
#word_matrix.drop(columns = ["Subject"], inplace=True)
word_matrix.drop(columns = ["Description"], inplace=True)
word_matrix.head(3)

Unnamed: 0,Abbreviation,Subject
0,ACC,Accounting
1,ADC,African American and Diaspora Studies
2,ANT,Anthropology


In [19]:
#create a new column in the df for each unique word and initialize the frequency to 0
for word in all_words:
    word_matrix[word] = 0

In [20]:
word_matrix.head()

Unnamed: 0,Abbreviation,Subject,a.d.,abating,abbey,abduction,aberration,ability,abiotic,abjection,...,zealander,zen,zero,zhang,zhu,zionism,zizek,zone,zongsan,zoo
0,ACC,Accounting,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ADC,African American and Diaspora Studies,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ANT,Anthropology,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ARB,Arabic,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ART,Art,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
#i could make the column names the id nos of the words, but it's more readable if i use the words themselves
#so i will create a new list with the id nos replaced by the words
id_words = [[(dct[id], count) for id, count in line] for line in corpus]

In [22]:
id_words[1]

[('experience', 1),
 ('african', 1),
 ('american', 1),
 ('capstone', 1),
 ('culture', 1),
 ('diaspora', 1),
 ('intedisciplinary', 1),
 ('minor', 1),
 ('varied', 1)]

In [23]:
id_words[0]

[('accounting', 21),
 ('activity', 3),
 ('advanced', 1),
 ('allocation', 1),
 ('also', 1),
 ('analysis', 1),
 ('analyzing', 1),
 ('application', 1),
 ('applied', 2),
 ('approved', 1),
 ('area', 2),
 ('assessment', 1),
 ('asset', 3),
 ('assignment', 1),
 ('assurance', 3),
 ('attitude', 1),
 ('audit', 1),
 ('auditing', 2),
 ('automated', 1),
 ('balance', 3),
 ('based', 3),
 ('begin', 1),
 ('budget', 1),
 ('budgeting', 2),
 ('business', 6),
 ('capital', 2),
 ('cash', 2),
 ('change', 1),
 ('collection', 1),
 ('company', 1),
 ('compensation', 1),
 ('compliance', 1),
 ('component', 2),
 ('concept', 2),
 ('conceptual', 1),
 ('context', 1),
 ('control', 5),
 ('corporate', 2),
 ('corporation', 2),
 ('cost', 4),
 ('costing', 3),
 ('country', 1),
 ('cover', 1),
 ('coverage', 1),
 ('covered', 1),
 ('criminal', 1),
 ('current', 2),
 ('cycle', 1),
 ('data', 1),
 ('decision', 9),
 ('department', 1),
 ('depth', 3),
 ('derivative', 1),
 ('design', 1),
 ('designated', 1),
 ('determination', 1),
 ('devel

In [24]:
#go through id_words and change the frequency for each word in the df for each subject
i = 0
for subject in id_words:
    for word, count in subject:
        word_matrix.at[i, word] = count
    i += 1

In [25]:
#this is kind of hard to see since there are so many 0s, but it did work
word_matrix

Unnamed: 0,Abbreviation,Subject,a.d.,abating,abbey,abduction,aberration,ability,abiotic,abjection,...,zealander,zen,zero,zhang,zhu,zionism,zizek,zone,zongsan,zoo
0,ACC,Accounting,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ADC,African American and Diaspora Studies,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ANT,Anthropology,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ARB,Arabic,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ART,Art,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64,STAT,Statistics,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
65,SUS,Sustainability Science,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
66,TFA,Summer Undergraduate Research,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
67,THA,Theatre Arts,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
#if you check specific cells, you can see that the frequencies are correct
word_matrix.iloc[1]["experience"]

1

In [27]:
word_matrix.iloc[0]["accounting"]

21

In [25]:
#save word matrix to disk as pickle file
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\word_matrix.pkl"
word_matrix.to_pickle(path)

In [26]:
#save word matrix as csv file for readability
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\word_matrix.csv"
word_matrix.to_csv(path, index=False)

In [53]:
#attempt to run LDA on the data
#LDA: Latent Dirichlet Allocation

#Set training paramters
num_topics = 7
#since there's only 69 docs/subjects, I'll set the chunksize to 100
chunksize = 100
passes = 20
iterations = 400
eval_every = None

#Make an index to word dictionary
id2word = dct.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [54]:
#print the topics that lda produced
model.print_topics()

[(0,
  '0.024*"music" + 0.014*"performance" + 0.009*"skill" + 0.009*"musical" + 0.007*"development" + 0.006*"teaching" + 0.006*"leadership" + 0.006*"style" + 0.006*"technique" + 0.005*"emphasis"'),
 (1,
  '0.013*"writing" + 0.008*"work" + 0.006*"social" + 0.005*"text" + 0.005*"historical" + 0.005*"history" + 0.005*"american" + 0.005*"political" + 0.005*"culture" + 0.004*"seminar"'),
 (2,
  '0.010*"reading" + 0.007*"theory" + 0.007*"introduction" + 0.006*"classroom" + 0.006*"curriculum" + 0.006*"teacher" + 0.006*"writing" + 0.006*"economic" + 0.006*"development" + 0.005*"teaching"'),
 (3,
  '0.012*"religious" + 0.008*"religion" + 0.008*"christian" + 0.007*"practice" + 0.007*"tradition" + 0.006*"historical" + 0.006*"text" + 0.006*"philosophical" + 0.006*"nature" + 0.006*"christianity"'),
 (4,
  '0.012*"research" + 0.009*"social" + 0.007*"school" + 0.007*"application" + 0.006*"system" + 0.006*"design" + 0.006*"experience" + 0.006*"health" + 0.006*"project" + 0.006*"human"'),
 (5,
  '0.008

In [55]:
#assign topics to documents in corpus
lda_corpus = model[corpus]

In [56]:
lda_corpus[0]

[(2, 0.999519)]

In [57]:
#create df of topics by subject
topic_df = pd.DataFrame(list(zip(lda_corpus,words)),columns=['Topic','Description'])
topic_df.head(2)

Unnamed: 0,Topic,Description
0,"[(2, 0.999519)]","[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,"[(4, 0.14700171), (6, 0.8413743)]","[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [58]:
#add in other useful columns
topic_df.insert(0, "Abbreviation", subject_df["Abbreviation"])
topic_df.insert(1, "Subject", subject_df["Subject"])
topic_df.head(2)

Unnamed: 0,Abbreviation,Subject,Topic,Description
0,ACC,Accounting,"[(2, 0.999519)]","[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,ADC,African American and Diaspora Studies,"[(4, 0.14700171), (6, 0.8413743)]","[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [59]:
topics = topic_df.explode("Topic")
topics.head(2)

Unnamed: 0,Abbreviation,Subject,Topic,Description
0,ACC,Accounting,"(2, 0.999519)","[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,ADC,African American and Diaspora Studies,"(4, 0.14700171)","[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [60]:
topics[["Topic", "Probability"]] = pd.DataFrame(topics.Topic.tolist(), index=topics.index)
topics.head(2)

Unnamed: 0,Abbreviation,Subject,Topic,Description,Probability
0,ACC,Accounting,2,"[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]",0.999519
1,ADC,African American and Diaspora Studies,4,"[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]",0.147002


In [61]:
topics = topics[["Topic", "Probability", "Abbreviation", "Subject", "Description"]]
topics.head(2)

Unnamed: 0,Topic,Probability,Abbreviation,Subject,Description
0,2,0.999519,ACC,Accounting,"[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,4,0.147002,ADC,African American and Diaspora Studies,"[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [62]:
topics = topics.sort_values(by=["Topic", "Probability"], ascending=[True, False])
topics.head(2)

Unnamed: 0,Topic,Probability,Abbreviation,Subject,Description
49,0,0.999496,MSL,Military Science Leadership,"[ability, ability, ability, adaptive, adaptive, advantage, also, american, analysis, approach, arm, army, army, army, army, army, army, army, army, army, army, army, aspect, aspect, ass, ass, assessing, assessing, assessment, assessment, awareness, awareness, basic, basic, basic, basis, basis, battle, battle, battlefield, becoming, begin, building, building, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadet, cadre, cadre, challenge, challenge, challenging, challenging, character, civilian, coe, coe, colonial, communicate, communication, communication, communication, competency, complete, complex, complex, complex, complex, comprehension, contemporary, contemporary, context, continue, continued, continued, corp, courtesy, creative, critical, critical, critical, critical, current, current, custom, decision, decision, demand, demand, develop, ...]"
51,0,0.849173,MUS,Music,"[ability, academic, academic, accompanying, achievement, acquaint, acquisition, acquisition, action, active, active, actual, addition, address, administer, administering, administrative, adolescent, adult, advanced, advanced, advanced, advanced, aesthetic, aftermath, alexander, alexander, alexander, alphabet, alphabet, also, also, although, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analysis, analyze, analyzed, anatomy, announced, antiquity, application, application, application, application, application, application, application, application, applied, applied, applied, appreciate, appreciation, approach, approach, appropriate, appropriate, appropriate, appropriate, appropriate, appropriate, appropriate, appropriate, approval, approved, architectural, area, area, area, area, area, area, art, art, art, art, art, article, articulated, artist, artist, artistic, artistic, artistic, artistic, artistic, artistic, artistic, artistic, ...]"


In [63]:
topics_no_desc = topics.copy()
topics_no_desc.drop(columns=["Description"], inplace=True)
topics_no_desc.head(2)

Unnamed: 0,Topic,Probability,Abbreviation,Subject
49,0,0.999496,MSL,Military Science Leadership
51,0,0.849173,MUS,Music


In [64]:
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\lda_results.xlsx"
topics_no_desc.to_excel(path, index=False)

In [28]:
#Run LDA again with more topics
num_topics = 10
#since there's only 69 docs/subjects, I'll set the chunksize to 100
chunksize = 100
passes = 20
iterations = 400
eval_every = None

#Make an index to word dictionary
id2word = dct.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

In [29]:
model.print_topics()

[(0,
  '0.011*"reading" + 0.010*"work" + 0.009*"language" + 0.008*"literature" + 0.007*"introduction" + 0.007*"cultural" + 0.007*"skill" + 0.007*"culture" + 0.006*"german" + 0.006*"writing"'),
 (1,
  '0.017*"music" + 0.011*"performance" + 0.010*"laboratory" + 0.009*"technique" + 0.008*"emphasis" + 0.006*"musical" + 0.006*"skill" + 0.005*"field" + 0.005*"development" + 0.005*"work"'),
 (2,
  '0.014*"political" + 0.009*"policy" + 0.009*"medium" + 0.007*"public" + 0.007*"theory" + 0.007*"examination" + 0.007*"social" + 0.006*"chinese" + 0.006*"communication" + 0.006*"state"'),
 (3,
  '0.014*"school" + 0.010*"religious" + 0.010*"practice" + 0.009*"health" + 0.007*"human" + 0.007*"christian" + 0.007*"emphasis" + 0.006*"issue" + 0.006*"religion" + 0.006*"focus"'),
 (4,
  '0.013*"social" + 0.008*"history" + 0.007*"art" + 0.006*"development" + 0.006*"political" + 0.005*"cultural" + 0.005*"emphasis" + 0.005*"world" + 0.005*"experience" + 0.005*"project"'),
 (5,
  '0.010*"work" + 0.008*"research

In [30]:
#assign topics to documents in corpus
lda_corpus = model[corpus]

In [31]:
topic_df = pd.DataFrame(list(zip(lda_corpus,words)),columns=['Topic','Description'])
topic_df.head(2)

Unnamed: 0,Topic,Description
0,"[(8, 0.9993118)]","[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,"[(9, 0.97101665)]","[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [32]:
topic_df.insert(0, "Abbreviation", subject_df["Abbreviation"])
topic_df.insert(1, "Subject", subject_df["Subject"])
topic_df.head(2)

Unnamed: 0,Abbreviation,Subject,Topic,Description
0,ACC,Accounting,"[(8, 0.9993118)]","[accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, accounting, activity, activity, activity, advanced, allocation, also, analysis, analyzing, application, applied, applied, approved, area, area, assessment, asset, asset, asset, assignment, assurance, assurance, assurance, attitude, audit, auditing, auditing, automated, balance, balance, balance, based, based, based, begin, budget, budgeting, budgeting, business, business, business, business, business, business, capital, capital, cash, cash, change, collection, company, compensation, compliance, component, component, concept, concept, conceptual, context, control, control, control, control, control, corporate, corporate, corporation, corporation, cost, cost, cost, cost, costing, costing, costing, country, cover, coverage, covered, criminal, ...]"
1,ADC,African American and Diaspora Studies,"[(9, 0.97101665)]","[african, american, capstone, culture, diaspora, experience, intedisciplinary, minor, varied]"


In [33]:
topics = topic_df.explode("Topic")
topics[["Topic", "Probability"]] = pd.DataFrame(topics.Topic.tolist(), index=topics.index)
topics = topics[["Topic", "Probability", "Abbreviation", "Subject", "Description"]]
topics = topics.sort_values(by=["Topic", "Probability"], ascending=[True, False])
topics.drop(columns=["Description"], inplace=True)

In [34]:
path = r"C:\Users\Mandy\Documents\Data science projects\Furman catalog\lda_results2.xlsx"
topics.to_excel(path, index=False)