ENVIRONMENT

In [47]:
import pandas as pd
import matplotlib.pyplot as plt

In [48]:
from bertopic import BERTopic
import tomotopy as tp

In [49]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
def visualize_lda(mdl):
    """Build the pyLDAvis "prepared data" object for a trained topic model.

    Parameters
    ----------
    mdl
        Trained topic model. The function reads ``mdl.k``, ``mdl.docs``,
        ``mdl.used_vocabs`` and ``mdl.used_vocab_freq`` and calls
        ``mdl.get_topic_word_dist(k)`` for every topic id ``k`` in
        ``range(mdl.k)``. Each document must offer ``get_topic_dist()`` and
        a ``words`` sequence.

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``. Previously the function
    computed this object and then dropped it, so callers always got ``None``.
    """
    # Imported locally: the notebook's import cells bring in neither numpy nor
    # pyLDAvis, so the function would otherwise fail with a NameError.
    import numpy as np
    import pyLDAvis

    # One row per topic, one column per vocabulary entry.
    topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])

    # One row per document; re-normalise so every row sums to 1.
    doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
    doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)

    doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
    vocab = list(mdl.used_vocabs)
    term_frequency = mdl.used_vocab_freq

    prepared_data = pyLDAvis.prepare(
        topic_term_dists,
        doc_topic_dists,
        doc_lengths,
        vocab,
        term_frequency,
        start_index=0,  # tomotopy starts topic ids with 0, pyLDAvis with 1
        sort_topics=False,  # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching!
    )
    return prepared_data

DATASET

In [2]:
# Load the four input tables. In every file the first CSV column becomes the
# row index, and low_memory=False makes pandas read each file in one pass.
abstract_bertopic_df = pd.read_csv(
    './data/abstract-bertopic.csv',
    index_col=0,
    low_memory=False,
)
review_bertopic_df = pd.read_csv(
    './data/review-bertopic.csv',
    index_col=0,
    low_memory=False,
)
abstract_lda_df = pd.read_csv(
    './data/abstract_lda.csv',
    index_col=0,
    low_memory=False,
)
review_lda_df = pd.read_csv(
    './data/review_lda.csv',
    index_col=0,
    low_memory=False,
)

In [3]:
# Restore the previously fitted topic models from disk: two BERTopic models
# and two tomotopy LDA models, one of each for abstracts and for reviews.
# NOTE(review): the .pkl extension suggests pickle-based files — only load
# model files that come from a trusted source.
abstract_bertopic_model = BERTopic.load(
    './model/topic_model_unreduced.pkl'
)
review_bertopic_model = BERTopic.load(
    './model/topic_model_review_unreduced.pkl'
)
abstract_lda_model = tp.LDAModel.load(
    './model/lda-abstract-best.pkl'
)
review_lda_model = tp.LDAModel.load(
    './model/lda-review-best.pkl'
)

In [5]:
# Run each BERTopic model's approximate_distribution over the pre-processed
# text column of its table. Later cells index the result with [0] to build the
# per-document topic table.
abstract_bertopic_topics = abstract_bertopic_model.approximate_distribution(
    abstract_bertopic_df['Abstract_Processed']
)
review_bertopic_topics = review_bertopic_model.approximate_distribution(
    review_bertopic_df['Full Review_Processed']
)

100%|██████████| 99/99 [01:17<00:00,  1.28it/s]
100%|██████████| 100/100 [03:14<00:00,  1.94s/it]


In [10]:
# Collect one topic distribution per document stored inside each LDA model,
# in the order the model iterates over its documents.
abstract_lda_topics = [
    document.get_topic_dist()
    for document in abstract_lda_model.docs
]
review_lda_topics = [
    document.get_topic_dist()
    for document in review_lda_model.docs
]

VISUALIZATION

In [52]:
# NOTE(review): this cell was saved as an unfinished attribute access
# (`abstract_lda_model.`), which is a SyntaxError. Until the intended call is
# filled in, just display the model object so the cell runs.
abstract_lda_model

TOPIC AGGREGATION

In [35]:
# Element 0 of the approximate_distribution result becomes a table with one
# row per abstract; each column label is turned into a descriptive topic name.
abstract_bertopic_topic_representations = pd.DataFrame(
    abstract_bertopic_topics[0]
).rename(columns=lambda topic: f'abstract_bertopic_topic_{topic}')
abstract_bertopic_topic_representations

Unnamed: 0,abstract_bertopic_topic_0,abstract_bertopic_topic_1,abstract_bertopic_topic_2,abstract_bertopic_topic_3,abstract_bertopic_topic_4,abstract_bertopic_topic_5,abstract_bertopic_topic_6,abstract_bertopic_topic_7,abstract_bertopic_topic_8,abstract_bertopic_topic_9,abstract_bertopic_topic_10,abstract_bertopic_topic_11
0,0.071108,0.318762,0.056557,0.000000,0.063262,0.000000,0.383023,0.017472,0.053807,0.000000,0.036009,0.000000
1,0.071108,0.318762,0.056557,0.000000,0.063262,0.000000,0.383023,0.017472,0.053807,0.000000,0.036009,0.000000
2,0.071108,0.318762,0.056557,0.000000,0.063262,0.000000,0.383023,0.017472,0.053807,0.000000,0.036009,0.000000
3,0.071108,0.318762,0.056557,0.000000,0.063262,0.000000,0.383023,0.017472,0.053807,0.000000,0.036009,0.000000
4,0.071108,0.318762,0.056557,0.000000,0.063262,0.000000,0.383023,0.017472,0.053807,0.000000,0.036009,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
98983,0.086730,0.000000,0.074623,0.149357,0.154452,0.182688,0.000000,0.000000,0.000000,0.196957,0.000000,0.155193
98984,0.086730,0.000000,0.074623,0.149357,0.154452,0.182688,0.000000,0.000000,0.000000,0.196957,0.000000,0.155193
98985,0.086730,0.000000,0.074623,0.149357,0.154452,0.182688,0.000000,0.000000,0.000000,0.196957,0.000000,0.155193
98986,0.086730,0.000000,0.074623,0.149357,0.154452,0.182688,0.000000,0.000000,0.000000,0.196957,0.000000,0.155193


In [36]:
# Element 0 of the approximate_distribution result becomes a table with one
# row per review; each column label is turned into a descriptive topic name.
review_bertopic_topic_representations = pd.DataFrame(
    review_bertopic_topics[0]
).rename(columns=lambda topic: f'review_bertopic_topic_{topic}')
review_bertopic_topic_representations

Unnamed: 0,review_bertopic_topic_0,review_bertopic_topic_1,review_bertopic_topic_2,review_bertopic_topic_3,review_bertopic_topic_4,review_bertopic_topic_5,review_bertopic_topic_6,review_bertopic_topic_7,review_bertopic_topic_8,review_bertopic_topic_9,review_bertopic_topic_10
0,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.402698,0.000000,0.000000,0.000000,0.000000,0.000000,0.246107,0.000000,0.000000,0.182754,0.168442
2,0.751222,0.000000,0.023090,0.008269,0.023078,0.023090,0.064026,0.042293,0.000000,0.036224,0.028709
3,0.783680,0.000000,0.000000,0.000000,0.000000,0.000000,0.076833,0.087383,0.000000,0.034376,0.017729
4,0.372212,0.063482,0.066340,0.071169,0.066306,0.066340,0.104592,0.000000,0.063482,0.062446,0.063629
...,...,...,...,...,...,...,...,...,...,...,...
99040,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
99041,0.619002,0.000000,0.057645,0.061060,0.057616,0.057645,0.069685,0.000000,0.000000,0.037801,0.039545
99042,0.566328,0.000000,0.000000,0.000000,0.000000,0.000000,0.121254,0.188081,0.000000,0.095085,0.029251
99043,0.103074,0.000000,0.000000,0.000000,0.000000,0.000000,0.310483,0.000000,0.000000,0.298625,0.287818


In [38]:
# Stack the per-document LDA topic distributions into a table (one row per
# document) and give each column a descriptive topic name.
abstract_lda_topic_representations = pd.DataFrame(
    abstract_lda_topics
).rename(columns=lambda topic: f'abstract_lda_topic_{topic}')
abstract_lda_topic_representations

Unnamed: 0,abstract_lda_topic_0,abstract_lda_topic_1,abstract_lda_topic_2,abstract_lda_topic_3,abstract_lda_topic_4,abstract_lda_topic_5,abstract_lda_topic_6,abstract_lda_topic_7,abstract_lda_topic_8,abstract_lda_topic_9,...,abstract_lda_topic_39,abstract_lda_topic_40,abstract_lda_topic_41,abstract_lda_topic_42,abstract_lda_topic_43,abstract_lda_topic_44,abstract_lda_topic_45,abstract_lda_topic_46,abstract_lda_topic_47,abstract_lda_topic_48
0,0.000166,0.000153,0.000103,0.000058,0.000051,0.000068,0.000084,0.000049,0.000144,0.074198,...,0.000118,0.000058,0.000084,0.000126,0.000115,0.000067,0.000113,0.000036,0.000054,0.000053
1,0.000166,0.000153,0.000103,0.000058,0.000051,0.000068,0.000084,0.000049,0.000144,0.000077,...,0.000118,0.000058,0.000084,0.000126,0.000115,0.000067,0.000113,0.000036,0.000054,0.000053
2,0.000166,0.000153,0.000103,0.000058,0.000051,0.000068,0.000084,0.000049,0.000144,0.000077,...,0.000118,0.000058,0.000084,0.000126,0.000115,0.000067,0.000113,0.000036,0.000054,0.000053
3,0.000166,0.000153,0.000103,0.000058,0.000051,0.000068,0.000084,0.000049,0.000144,0.000077,...,0.000118,0.000058,0.000084,0.000126,0.000115,0.000067,0.000113,0.000036,0.000054,0.000053
4,0.000166,0.000153,0.000103,0.000058,0.000051,0.000068,0.000084,0.000049,0.000144,0.000077,...,0.000118,0.000058,0.000084,0.000126,0.000115,0.000067,0.000113,0.000036,0.000054,0.000053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98983,0.000132,0.000122,0.000082,0.000046,0.000040,0.000054,0.000067,0.000039,0.000115,0.000062,...,0.000094,0.937229,0.000067,0.000101,0.000092,0.000053,0.000090,0.000029,0.000043,0.000042
98984,0.000132,0.000122,0.008526,0.118249,0.000040,0.000054,0.000067,0.000039,0.000115,0.000062,...,0.000094,0.852798,0.000067,0.000101,0.000092,0.000053,0.000090,0.000029,0.000043,0.000042
98985,0.000132,0.000122,0.000082,0.000046,0.000040,0.000054,0.000067,0.000039,0.008558,0.000062,...,0.000094,0.971001,0.000067,0.000101,0.000092,0.000053,0.000090,0.000029,0.000043,0.000042
98986,0.000132,0.000122,0.000082,0.000046,0.000040,0.000054,0.000067,0.000039,0.000115,0.000062,...,0.000094,0.971001,0.000067,0.000101,0.000092,0.000053,0.000090,0.000029,0.000043,0.000042


In [39]:
# Stack the per-document LDA topic distributions into a table (one row per
# document) and give each column a descriptive topic name.
review_lda_topic_representations = pd.DataFrame(
    review_lda_topics
).rename(columns=lambda topic: f'review_lda_topic_{topic}')
review_lda_topic_representations

Unnamed: 0,review_lda_topic_0,review_lda_topic_1,review_lda_topic_2,review_lda_topic_3,review_lda_topic_4,review_lda_topic_5,review_lda_topic_6,review_lda_topic_7,review_lda_topic_8,review_lda_topic_9,review_lda_topic_10,review_lda_topic_11,review_lda_topic_12,review_lda_topic_13
0,0.552294,0.003195,0.001997,0.000178,0.002764,0.004384,0.002543,0.003189,0.075867,0.094083,0.253402,0.000758,0.003390,0.001956
1,0.341064,0.101453,0.000405,0.000036,0.124276,0.060456,0.082993,0.170183,0.107021,0.000744,0.000966,0.000154,0.009851,0.000396
2,0.142388,0.009163,0.000132,0.000012,0.110590,0.000289,0.085212,0.253850,0.195984,0.052462,0.062979,0.000050,0.000224,0.086665
3,0.227314,0.029301,0.000232,0.000021,0.134447,0.000510,0.126531,0.431676,0.021977,0.013577,0.000555,0.000088,0.003024,0.010747
4,0.045792,0.000317,0.094317,0.000018,0.441739,0.000435,0.000252,0.282674,0.003040,0.105688,0.000473,0.000075,0.000336,0.024844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99040,0.133346,0.000840,0.000525,0.000047,0.018558,0.001153,0.000669,0.000839,0.210153,0.000965,0.126074,0.000199,0.066274,0.440358
99041,0.067702,0.000237,0.018566,0.000013,0.000205,0.135948,0.000188,0.000236,0.029061,0.018690,0.042212,0.000056,0.070574,0.616311
99042,0.447885,0.000630,0.000394,0.000035,0.022843,0.000865,0.000502,0.000629,0.211185,0.000724,0.000941,0.000150,0.000669,0.312549
99043,0.301566,0.000845,0.000528,0.000047,0.000731,0.084871,0.000673,0.000844,0.002131,0.000971,0.001261,0.000201,0.006876,0.598455


In [41]:
# pd.concat(axis=1) aligns rows by index label. abstract_bertopic_df keeps the
# index stored in its CSV (index_col=0), while the topic tables carry a fresh
# 0..n-1 index, so unless the stored index is also 0..n-1 the rows would be
# misaligned and padded with NaN. Re-label the topic tables with the base
# table's index so rows are joined by position; set_axis raises if the row
# counts differ.
# NOTE(review): assumes the LDA model's documents are in the same order as the
# rows of abstract_bertopic_df — TODO confirm.
abstract_df = pd.concat(
    [
        abstract_bertopic_df,
        abstract_bertopic_topic_representations.set_axis(abstract_bertopic_df.index, axis=0),
        abstract_lda_topic_representations.set_axis(abstract_bertopic_df.index, axis=0),
    ],
    axis=1,
)

In [42]:
# pd.concat(axis=1) aligns rows by index label. review_bertopic_df keeps the
# index stored in its CSV (index_col=0), while the topic tables carry a fresh
# 0..n-1 index, so unless the stored index is also 0..n-1 the rows would be
# misaligned and padded with NaN. Re-label the topic tables with the base
# table's index so rows are joined by position; set_axis raises if the row
# counts differ.
# NOTE(review): assumes the LDA model's documents are in the same order as the
# rows of review_bertopic_df — TODO confirm.
review_df = pd.concat(
    [
        review_bertopic_df,
        review_bertopic_topic_representations.set_axis(review_bertopic_df.index, axis=0),
        review_lda_topic_representations.set_axis(review_bertopic_df.index, axis=0),
    ],
    axis=1,
)

In [45]:
# Show the column labels of the combined review table (original columns
# followed by the appended topic columns).
review_df.columns

Index(['Paper Title', 'Paper Link', 'Accept', 'Keywords', 'TL;DR', 'Abstract',
       'Data', 'Review Title', 'Review Signature', 'Review Date',
       'Review Meta', 'Review Subtitles', 'Review Contents', 'Code',
       'Community Implementations', 'Original Pdf', 'Reviewed Version (pdf)',
       'Code Of Ethics', 'Supplementary Material', 'One-sentence Summary',
       'One Line Summary', 'Acknowledgement Of Code Of Ethics',
       'Anonymous Url', 'No Acknowledgement Section', 'Submission Guidelines',
       'Please Choose The Closest Area That Your Submission Falls Into',
       'Full Review', 'embedding', 'Full Review_Processed', 'fine_topic',
       'fine_probs', 'review_bertopic_topic_0', 'review_bertopic_topic_1',
       'review_bertopic_topic_2', 'review_bertopic_topic_3',
       'review_bertopic_topic_4', 'review_bertopic_topic_5',
       'review_bertopic_topic_6', 'review_bertopic_topic_7',
       'review_bertopic_topic_8', 'review_bertopic_topic_9',
       'review_bertopic_

In [50]:
# Create an unfitted decision-tree classifier with default settings.
# NOTE(review): no random_state is passed — consider fixing one if the fitted
# tree needs to be reproducible across runs.
clf = DecisionTreeClassifier()

In [None]:
# NOTE(review): fit() is called here without any training data; presumably a
# feature matrix and a target still have to be passed — TODO supply them
# before this cell can run.
clf.fit()