# Reddit Climate Change - Modeling & Evaluation
Supervision: Prof. Dr. Jan Fabian Ehmke

Group members: Britz Luis, Huber Anja, Krause Felix Elias, Preda Yvonne-Nadine

Time: Summer term 2023 

Data: https://www.kaggle.com/datasets/pavellexyr/the-reddit-climate-change-dataset

In [None]:
#  Topic detection

# LDA
# https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
# http://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html

# BERTopic
# https://maartengr.github.io/BERTopic/index.html


In [2]:
# Preparing environment
#%pip install bertopic
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer, util
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.vectorizers import ClassTfidfTransformer
import pandas as pd

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
# Load data
clean_comments = pd.read_csv("data/preprocessed_comments.csv")

  clean_comments = pd.read_csv("data/preprocessed_comments.csv")


In [6]:
#clean_comments.isna()
clean_comments[clean_comments.isna().any(axis=1)]

Unnamed: 0,id,subreddit.name,subreddit.nsfw,created_utc,permalink,sentiment,score,created_date,created_day,created_month,created_year,created_time,body_clean
188548,h6kpauz,polls,False,1.627298e+09,https://old.reddit.com/r/polls/comments/orl7mj...,-0.3612,2.0,2021-07-26,26.0,7.0,2021.0,11:17:44,
188549,Stop climate change and start WW because WW ...,,,,,,,,,,,,
494626,- Targeted cuts to research on climate change,,,,,,,,,,,,
494627,- Openly calling for killing civilians in the ...,,,,,,,,,,,,
518932,You *walk on* a recording of climate change *...,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1376256,.html,,,,,,,,,,,,
1385567,Being skeptical of this is as bad as denying t...,,,,,,,,,,,,
1394370,German school children still write letters to ...,,,,,,,,,,,,
1424093,Edit: a word,,,,,,,,,,,,


In [25]:
clean_comments = clean_comments.dropna(axis=0)

In [8]:
# Convert timestamp because import creates float variables
clean_comments["created_year"] = pd.to_datetime(clean_comments["created_date"]).dt.strftime('%Y')
clean_comments["created_month"] = pd.to_datetime(clean_comments["created_date"]).dt.strftime('%m')
clean_comments["created_day"] = pd.to_datetime(clean_comments["created_date"]).dt.strftime('%d')

In [30]:
clean_comments.head()

Unnamed: 0,id,subreddit.name,subreddit.nsfw,created_utc,permalink,sentiment,score,created_date,created_day,created_month,created_year,created_time,body_clean
0,i79uz1c,oddlyterrifying,False,1651658000.0,https://old.reddit.com/r/oddlyterrifying/comme...,-0.5574,3.0,2022-05-04,4,5,2022,09:58:26,Oh shit there's a new one out? Last one k watc...
1,hz51unj,technews,False,1646280000.0,https://old.reddit.com/r/technews/comments/t53...,0.4588,1.0,2022-03-03,3,3,2022,03:54:08,"We’re never going to reopen those wells, its e..."
2,i3ic64d,worldnews,False,1649177000.0,https://old.reddit.com/r/worldnews/comments/tw...,0.6249,1.0,2022-04-05,5,4,2022,16:36:35,Climate Change is the Great Filter.
3,id3tlo2,ontario,False,1655760000.0,https://old.reddit.com/r/ontario/comments/vglj...,0.296,0.0,2022-06-20,20,6,2022,21:16:31,Climate change also means greater crop yields ...
4,iebulu4,news,False,1656603000.0,https://old.reddit.com/r/news/comments/vo98pd/...,-0.6115,12.0,2022-06-30,30,6,2022,15:26:18,The decline into total destruction by climate ...


## Online Modeling

In [29]:
subset = clean_comments[0:50000]

In [35]:
if 871 in all_docs.index:
    print("Index value 871 is present in the DataFrame index.")
else:
    print("Index value 871 is missing from the DataFrame index.")


Index value 871 is present in the DataFrame index.


In [36]:
# Reset the index of the all_docs DataFrame
all_docs = all_docs.reset_index(drop=True)

# Verify the new index
print(all_docs.index)


RangeIndex(start=0, stop=50000, step=1)


In [37]:
# Prepare documents
#all_docs = subset["body_clean"]
doc_chunks = [all_docs[i:i+1000] for i in range(0, len(all_docs), 1000)]

In [38]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA
from bertopic.vectorizers import OnlineCountVectorizer

# Prepare sub-models that support online learning
umap_model = IncrementalPCA(n_components=5)
cluster_model = MiniBatchKMeans(n_clusters=50, random_state=0)
vectorizer_model = OnlineCountVectorizer(stop_words="english", decay=.01)

In [39]:
from bertopic import BERTopic

topic_model = BERTopic(umap_model=umap_model,
                       hdbscan_model=cluster_model,
                       vectorizer_model=vectorizer_model)

# Incrementally fit the topic model by training on 1000 documents at a time
for docs in doc_chunks:
    topic_model.partial_fit(docs)

KeyError: 871

# Modeling Topic Clusters

In [None]:
# remove climate change as word

In [10]:
# Create a subsets for every year
year_groups = clean_comments.groupby(clean_comments['created_year'])

year_dfs = {'comments_{}'.format(year): group for year, group in year_groups}

for year, group in year_groups:
    year_dfs[year] = group

In [40]:
docs_2010 = year_dfs["comments_2010"]["body_clean"].values
docs_2011 = year_dfs["comments_2011"]["body_clean"].values
docs_2012 = year_dfs["comments_2012"]["body_clean"].values
docs_2013 = year_dfs["comments_2013"]["body_clean"].values
docs_2014 = year_dfs["comments_2014"]["body_clean"].values
docs_2015 = year_dfs["comments_2015"]["body_clean"].values
docs_2016 = year_dfs["comments_2016"]["body_clean"].values
docs_2017 = year_dfs["comments_2017"]["body_clean"].values
docs_2018 = year_dfs["comments_2018"]["body_clean"].values
docs_2019 = year_dfs["comments_2019"]["body_clean"].values
docs_2020 = year_dfs["comments_2020"]["body_clean"].values
docs_2021 = year_dfs["comments_2021"]["body_clean"].values
docs_2022 = year_dfs["comments_2022"]["body_clean"].values

In [41]:
docs = (docs_2010,docs_2011,docs_2012,docs_2013,docs_2014,docs_2015,docs_2016,docs_2017,docs_2018,docs_2019,docs_2020,docs_2021,docs_2022)

topics_2010,topics_2011,topics_2012, topics_2013, topics_2014, topics_2015, topics_2016, topics_2017, topics_2018, topics_2019, topics_2020, topics_2012, topics_2022 = None
topics = (topics_2010,topics_2011,topics_2012, topics_2013, topics_2014, topics_2015, topics_2016, topics_2017, topics_2018, topics_2019, topics_2020, topics_2012, topics_2022)

Batches:   0%|          | 0/500 [00:00<?, ?it/s]

2023-05-08 12:14:02,365 - BERTopic - Transformed documents to Embeddings
2023-05-08 12:14:13,684 - BERTopic - Reduced dimensionality
2023-05-08 12:14:14,106 - BERTopic - Clustered reduced embeddings
2023-05-08 12:14:17,964 - BERTopic - Reduced number of topics from 103 to 45


Unnamed: 0,Topic,Count,Name
0,-1,8246,-1_climate_change_people_science
1,0,5394,0_climate_change_global_warming
2,1,172,1_canada_harper_canadian_canadians
3,2,155,2_religious_religion_god_people
4,3,152,3_models_model_predictions_predict
5,4,152,4_population_overpopulation_children_people
6,5,140,5_data_emails_cru_scientists
7,6,99,6_skeptics_skeptic_skeptical_skepticism
8,7,99,7_gore_al_change_climate
9,8,98,8_libertarians_libertarian_libertarianism_market


In [None]:
for d in docs, t in topics:

  # BERT stepwise
  # Step 1 - Extract embeddings
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

  # Step 2 - Reduce dimensionality
  umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

  # Step 3 - Cluster reduced embeddings
  hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

  # Step 4 - Tokenize topics
  vectorizer_model = CountVectorizer(stop_words="english")

  # Step 5 - Create topic representation
  ctfidf_model = ClassTfidfTransformer()

  # All steps together
  t = BERTopic(
    embedding_model=embedding_model,    # Step 1 - Extract embeddings
    umap_model=umap_model,              # Step 2 - Reduce dimensionality
    hdbscan_model=hdbscan_model,        # Step 3 - Cluster reduced embeddings
    vectorizer_model=vectorizer_model,  # Step 4 - Tokenize topics
    ctfidf_model=ctfidf_model,          # Step 5 - Extract topic words
    calculate_probabilities=False,      # Raises speed
    min_topic_size = 300,               # Reduces number of topics
    nr_topics="auto",                    # Reduces number of topics
    verbose=True
  )

  topics, probs = t.fit_transform(d)
  #t.get_topic_info()

In [24]:
# Get specific topic
topic_model.get_topic(0)

[('science', 0.01219470217097759),
 ('people', 0.010584660869951001),
 ('gt', 0.009517132731414605),
 ('don', 0.0086000351186167),
 ('scientific', 0.008597060961067942),
 ('global', 0.00822324026848448),
 ('think', 0.00787297838775973),
 ('scientists', 0.007625308000465114),
 ('evolution', 0.007521849591286694),
 ('warming', 0.007371242329106124)]

In [27]:
# Store topic info in dataframe
doc_info = topic_model.get_document_info(docs)

In [28]:
# Check out document information
doc_info.head()

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,Industrial output --&gt; Increased atmospheric...,-1,-1_people_gt_global_just,people - gt - global - just - science - don - ...,0.0,False
1,This is true but only because Australia lacks ...,-1,-1_people_gt_global_just,people - gt - global - just - science - don - ...,0.0,False
2,"Please, explain to us the whole concept.\n\nNe...",-1,-1_people_gt_global_just,people - gt - global - just - science - don - ...,0.0,False
3,"It hasn't been ""d"" some political types prefer...",1,1_weather_warming_global_snow,weather - warming - global - snow - ice - temp...,1.0,False
4,"&gt; It's called "" "" or as the right wing has ...",1,1_weather_warming_global_snow,weather - warming - global - snow - ice - temp...,1.0,False
