In [1]:
import pandas as pd
import pickle
import numpy as np

from bertopic import BERTopic

In [2]:
# Load the cleaned Futurology posts CSV into a DataFrame; pandas opens and
# closes the file itself when given a path.
df = pd.read_csv('CleanedControversialPostsFuturology.csv')

In [3]:
# Keep only posts that have both a body and a creation time: the body is the
# document given to the topic model, the creation time feeds topics_over_time.
df = df.dropna(axis=0, subset=['selftext', 'created_datetime'])
# Replace empty flair text fields with 'unknown' so every post has a class label.
df['flair_text'] = df['flair_text'].fillna('unknown')

# Three parallel lists, one entry per remaining row of df, in row order.
posts = df['selftext'].to_list()
flairs = df['flair_text'].to_list()
# Pass the timestamps as a plain list rather than a Series: after dropna the
# Series keeps the original row labels, so integer indexing such as
# timestamps[0] would be a label lookup (KeyError when row 0 was dropped)
# instead of "first document". BERTopic documents this argument as a list.
timestamps = df['created_datetime'].apply(pd.Timestamp).to_list()

# NOTE(review): no random seed is fixed. BERTopic's default dimensionality
# reduction is stochastic, so topic ids can differ between runs and the
# hand-written topics_to_labels mapping must be re-checked after a re-fit.
topic_model = BERTopic(n_gram_range=(1, 2))  # only unigram and bigram terms
# topics[i] is the topic id assigned to posts[i]; the second return value is unused.
topics, _ = topic_model.fit_transform(posts)
# Topic representations per time bin (20 bins over the timestamp range).
topics_over_time = topic_model.topics_over_time(
    docs=posts,
    timestamps=timestamps,
    global_tuning=True,
    evolution_tuning=True,
    nr_bins=20,
)

(974, 11)


In [4]:
# Display the per-topic overview table (topic id, document count, generated name).
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,-1,355,-1_people_years_new_think
1,0,68,0_ai_artificial intelligence_intelligence_arti...
2,1,51,1_reality_iphone_think_vr
3,2,48,2_world_people_human_one
4,3,45,3_robots_robot_jobs_take jobs
5,4,34,4_climate_climate change_change_human
6,5,34,5_reddit_sub_posts_futurology
7,6,30,6_meat_dairy_lab_vegan
8,7,30,7_musk_elon musk_elon_spacex
9,8,28,8_solar_tesla_fuel_hydrogen


In [5]:
# Display, for every topic id, its top terms paired with their scores.
topic_model.get_topics()

{-1: [('people', 0.012704964126552833),
  ('years', 0.008137399899906984),
  ('new', 0.007307168083551476),
  ('think', 0.0071732298973634385),
  ('future', 0.007010266834350849),
  ('human', 0.0063999716738248995),
  ('world', 0.006247961326273264),
  ('even', 0.006018938726631929),
  ('many', 0.005765170641780116),
  ('humans', 0.0057596197636402184)],
 0: [('ai', 0.08281503774695712),
  ('artificial intelligence', 0.05736464089038085),
  ('intelligence', 0.056597090338166546),
  ('artificial', 0.05492365699161981),
  ('human', 0.02465172213358898),
  ('google', 0.01813851824053835),
  ('smarter', 0.015959468479810668),
  ('computer', 0.015871203460471054),
  ('intelligence ai', 0.015367771982843182),
  ('threat', 0.01479806974204694)],
 1: [('reality', 0.017826835939489527),
  ('iphone', 0.017736835017091526),
  ('think', 0.016785286389940066),
  ('vr', 0.016306802414419615),
  ('virtual reality', 0.01569599656013873),
  ('technology', 0.015531109678299768),
  ('future', 0.015291821

In [7]:
# Render the topics-over-time figure from the table returned by
# topic_model.topics_over_time(...).
topic_model.visualize_topics_over_time(topics_over_time)

In [8]:
# Render BERTopic's bar-chart visualisation with its default settings.
topic_model.visualize_barchart()

In [13]:
# Count posts per flair; posts without a flair were relabelled 'unknown' earlier.
df['flair_text'].value_counts()

article                          152
unknown                          124
Discussion                       108
text                             103
Society                           96
Environment                       52
AI                                52
Space                             40
video                             35
Energy                            29
Economics                         28
Transport                         27
Robotics                          26
Biotech                           24
Computing                         19
meta                              14
blog                              10
image                             10
article - misleading title         3
misleading title                   2
article - sensational title        2
Rule 2 - Future focus              2
other                              2
MISLEADING TITLE                   2
3DPrint                            2
Slides                             1
Medicine                           1
r

In [37]:
# Same flair counts as a DataFrame, limited to the 20 most frequent flairs.
df['flair_text'].value_counts().to_frame().head(20)

Unnamed: 0,flair_text
article,152
unknown,124
Discussion,108
text,103
Society,96
Environment,52
AI,52
Space,40
video,35
Energy,29


In [14]:
# Number of distinct flair values (including the 'unknown' placeholder).
df['flair_text'].nunique()

35

In [32]:
# Hand-written, human-readable label for each BERTopic topic id (-1 through 22).
# Several ids intentionally share a label (e.g. 13 and 15, 12 and 21).
topics_to_labels = {
    -1: 'Miscellaneous',
    0: 'AI',
    1: 'Technology',
    2: 'Society',
    3: 'Robots, Jobs and Automation',
    4: 'Climate Change',
    5: 'Reddit',
    6: 'Food',
    7: 'Elon and Trump',
    8: 'Renewable Energy',
    9: 'Self Driving Cars',
    10: 'Cryptocurrency',
    11: 'People',
    12: 'Life Science',
    13: 'Science Fiction',
    14: 'Nuclear Energy',
    15: 'Science Fiction',
    16: 'Gender',
    17: 'World Politics and Guns',
    18: 'Aliens',
    19: 'Income',
    20: 'Covid Research',
    21: 'Life Science',
    22: 'Space Travel',
}

In [34]:
# Topic representations broken down by class, using each post's flair as its class.
topics_per_class = topic_model.topics_per_class(posts, classes=flairs)
# Attach the hand-written label for every topic id; an id that is missing
# from topics_to_labels raises KeyError.
topic_ids = topics_per_class['Topic']
topics_per_class['Self Labelled Topics'] = topic_ids.apply(topics_to_labels.__getitem__)

In [35]:
# Persist the per-flair topic table (with its 'Self Labelled Topics' column).
# The DataFrame index is written too, because index=False is not passed.
topics_per_class.to_csv('TopicsPerFlair.csv')

In [11]:
# Render the per-class figure from the topics_per_class table (classes are post flairs).
topic_model.visualize_topics_per_class(topics_per_class)

In [9]:
# Serialise the fitted topic model to disk with pickle (default protocol).
# Pickle files can execute code on load, so only load this file from a
# trusted location.
with open('saved_model.pickle', 'wb') as model_file:
    pickle.dump(topic_model, model_file)

In [27]:
# Human-readable label for each BERTopic topic id (-1 through 22).
# NOTE(review): this repeats the mapping defined in the per-flair cell;
# keep the two definitions in sync or define it once.
topics_to_labels = {
    -1: 'Miscellaneous',
    0: 'AI',
    1: 'Technology',
    2: 'Society',
    3: 'Robots, Jobs and Automation',
    4: 'Climate Change',
    5: 'Reddit',
    6: 'Food',
    7: 'Elon and Trump',
    8: 'Renewable Energy',
    9: 'Self Driving Cars',
    10: 'Cryptocurrency',
    11: 'People',
    12: 'Life Science',
    13: 'Science Fiction',
    14: 'Nuclear Energy',
    15: 'Science Fiction',
    16: 'Gender',
    17: 'World Politics and Guns',
    18: 'Aliens',
    19: 'Income',
    20: 'Covid Research',
    21: 'Life Science',
    22: 'Space Travel',
}

In [28]:
# Label every post with the topic the fitted model assigned to it.
# topic_model.topics_ holds one topic id per training document, in the order
# the documents were passed to fit_transform (the rows of df). The previous
# approach called find_topics() per post; that method is BERTopic's search for
# topics similar to a query, so it re-embedded every post and could return a
# topic different from the one the model actually assigned.
doc_topics = topic_model.topics_
if len(doc_topics) != len(df):
    raise ValueError(
        f'topic_model was fitted on {len(doc_topics)} documents but df has '
        f'{len(df)} rows; refit the model on the current df before labelling'
    )
df['Topics'] = doc_topics

# Fail loudly (as the original dict lookup did) when a topic id has no label,
# instead of letting .map() silently produce NaN.
unlabelled = sorted(set(doc_topics) - set(topics_to_labels))
if unlabelled:
    raise KeyError(f'topics_to_labels has no label for topic ids: {unlabelled}')
df['Self Labelled Topics'] = df['Topics'].map(topics_to_labels)

# save to csv
df.to_csv('Labelled_TM_Futurology_submissions.csv')

In [29]:
# Export just the post text, its flair and the hand-labelled topic.
label_columns = ['selftext', 'flair_text', 'Self Labelled Topics']
# .copy() gives an independent frame with the same index and columns as df's slice.
labels = df[label_columns].copy()
labels.to_csv('LabelsAndFlairsFuturology.csv')