In [None]:
#do the imports
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load the cleaned dataframe from disk.
df = pd.read_csv("NewScrapedData.csv")

# Configure the TF-IDF vectorizer:
#   min_df -> drop any word that does not appear in at least this many rows
#   max_df -> drop any word that appears in more than this proportion of documents
tfidf = TfidfVectorizer(
    min_df=5,
    max_df=0.75,
)

# Tokenize the "New Description" column and compute its TF-IDF weights in one pass.
doc_term_matrix = tfidf.fit_transform(df["New Description"])

# Show the sparse matrix summary as the cell output.
doc_term_matrix

<428x303 sparse matrix of type '<class 'numpy.float64'>'
	with 4122 stored elements in Compressed Sparse Row format>

In [14]:
# Number of topics the model should discover.
num_topics = 20

# Build the LDA model; the fixed random_state keeps reruns repeatable.
LDA = LatentDirichletAllocation(
    n_components=num_topics,
    random_state=12345,
)

# Fit on the document-term matrix and get each document's topic distribution.
doc_topic_matrix = LDA.fit_transform(doc_term_matrix)

# Columns are simply numbered "Topic 1" .. "Topic <num_topics>".
col_names = [
    f'Topic {topic_number}'
    for topic_number in range(1, num_topics + 1)
]

# Wrap the matrix in a DataFrame: one row per document, one column per topic.
doc_topic_df = pd.DataFrame(
    doc_topic_matrix,
    columns=col_names,
)

doc_topic_df.head(50)



Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.01227,0.570626,0.01227,0.01227,0.208509,0.01227,0.01227,0.01227,0.01227,0.01227
1,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.784057,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365,0.011365
2,0.011843,0.011843,0.637981,0.011843,0.148844,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843,0.011843
3,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417,0.730773,0.01417,0.01417,0.01417,0.01417,0.01417,0.01417
4,0.732022,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104,0.014104
5,0.011809,0.246087,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809,0.541353,0.011809,0.011809,0.011809,0.011809,0.011809,0.011809
6,0.012221,0.012221,0.357691,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.012221,0.422334
7,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.009697,0.6566,0.009697,0.009697,0.168854,0.009697
8,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.010295,0.804392,0.010295
9,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.01259,0.760782,0.01259,0.01259,0.01259,0.01259


In [21]:
# Number of highest-weighted words to list for each topic.
top_x_words = 20

# 1-based number of the single topic whose top words are also printed as a
# quick sanity check (previously hard-coded as `topic == 4`).
topic_to_print = 5

# Look the vocabulary up once; it does not change between topics or words,
# so there is no reason to rebuild it on every inner-loop iteration.
feature_names = tfidf.get_feature_names_out()

# Never ask for more words than the vocabulary actually holds, otherwise
# indexing past the end of the sorted array would raise an IndexError.
words_per_topic = min(top_x_words, len(feature_names))

# Collect one column of "word (weight)" strings per topic and build the
# DataFrame in a single step instead of growing it column by column.
topic_columns = {}
for topic, words in enumerate(LDA.components_):
    # Indices of this topic's heaviest words, in descending weight order.
    sorted_words = words.argsort()[::-1][:words_per_topic]
    top_words = feature_names[sorted_words]
    # Weights are the raw values from LDA.components_ (not normalised).
    top_weights = words[sorted_words]

    if topic + 1 == topic_to_print:
        print(f'\nTopic {topic + 1:02d}')
        for word in top_words:
            print(f'{word}')

    topic_columns[f'Topic {topic + 1}'] = [
        f'{word} ({word_weight:.3f})'
        for word, word_weight in zip(top_words, top_weights)
    ]

# One column per topic, one row per rank (row 0 = heaviest word).
topic_dist_df = pd.DataFrame(topic_columns)

topic_dist_df


Topic 05
team
compete
competitive
play
offer
competes
woman
collegiate
california
join
level
travel
competition
player
new
league
game
national
welcome
throughout


Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,Topic 11,Topic 12,Topic 13,Topic 14,Topic 15,Topic 16,Topic 17,Topic 18,Topic 19,Topic 20
0,support (3.825),fraternity (4.686),professional (6.285),phi (2.266),team (6.415),purpose (2.509),home (3.746),practice (3.841),honor (3.248),world (2.402),sport (3.452),dance (5.625),design (2.832),student (5.213),student (4.474),co (2.394),event (3.659),association (2.620),sorority (3.812),student (2.702)
1,program (2.746),life (3.785),student (4.804),delta (1.862),compete (3.246),teach (2.182),social (2.427),experience (3.007),society (2.989),language (1.463),member (2.668),game (3.399),building (2.612),social (4.917),opportunity (2.663),ed (2.095),quarter (2.691),resource (2.384),leadership (2.376),project (2.537)
2,student (1.879),campus (3.023),study (4.412),well (1.711),competitive (3.226),learn (2.077),provide (2.308),skill (2.933),workshop (2.486),hands (1.428),family (2.365),coast (3.217),community (2.007),promote (4.396),service (2.540),educate (2.058),center (2.390),management (1.429),sisterhood (2.258),information (2.015)
3,major (1.703),sigma (2.892),industry (4.144),value (1.601),play (3.173),high (1.804),academic (2.202),inclusive (2.840),engineering (2.382),discussion (1.295),casual (2.290),love (3.005),build (1.996),provide (3.695),networking (2.458),develop (2.025),cultural (2.372),student (1.265),phi (2.208),give (2.008)
4,community (1.698),college (2.646),development (3.667),art (1.347),offer (2.903),increase (1.421),luis (1.912),environment (2.728),leader (1.814),future (1.245),team (2.271),tournament (2.656),social (1.509),professional (3.691),provide (2.389),focus (1.879),educational (2.300),business (1.247),kappa (2.173),volunteer (1.933)
5,department (1.657),men (2.415),within (3.396),pre (1.086),competes (2.601),within (1.359),obispo (1.912),student (2.401),speaker (1.786),business (1.232),competitive (2.146),share (2.314),connect (1.150),opportunity (3.530),job (2.254),professional (1.780),register (2.188),theta (1.075),advocate (1.972),organization (1.693)
6,year (1.419),pi (2.092),design (2.880),research (0.973),woman (2.582),come (1.349),san (1.912),welcome (2.179),student (1.655),based (1.231),campus (1.971),support (2.059),resource (0.927),science (3.498),health (2.006),fraternity (1.665),profile (1.916),workshop (0.939),service (1.892),on (1.685)
7,country (1.415),brotherhood (2.029),communication (2.800),culture (0.862),collegiate (2.491),primary (1.309),cultural (1.910),level (2.169),hold (1.493),real (1.034),community (1.846),christ (1.985),student (0.806),community (3.194),connect (1.821),unite (1.664),continue (1.822),promotes (0.881),alpha (1.616),affiliate (1.564)
8,free (1.407),american (1.845),application (2.780),dedicate (0.725),california (2.466),need (1.223),away (1.841),event (2.141),provide (1.447),help (0.940),promote (1.774),educational (1.972),celebrate (0.782),field (3.062),meet (1.748),individual (1.564),host (1.794),weekly (0.843),idea (1.525),opportunity (1.554)
9,international (1.386),find (1.779),society (2.424),gain (0.668),join (2.411),go (1.154),interested (1.537),necessary (1.984),chi (1.432),foster (0.938),week (1.591),central (1.869),want (0.741),food (2.932),major (1.380),team (1.559),experience (1.694),chapter (0.842),scholarship (1.465),professional (1.514)
