<a href="https://colab.research.google.com/github/alex-jk/SWB-GVCEH/blob/main/models/relevance_model/Reddit_Relevancy_Models_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Install and import necessary libraries

In [3]:
!pip install setfit
!pip install tqdm
!pip install gdown



In [4]:
# Mount Google Drive (Colab only) so the project files under
# '/content/drive/My Drive/SWB-GVCEH/' read by later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

import gdown
import json
import pandas as pd
import os

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from setfit import SetFitModel
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
import spacy
# spaCy pipeline used by spacy_sentence_split() to segment posts into sentences.
nlp = spacy.load("en_core_web_sm")

from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import concurrent.futures

import numpy as np
from nltk.tokenize import sent_tokenize
import nltk
# Fetch NLTK's 'punkt' tokenizer data (presumably for sent_tokenize; the cells
# visible in this part of the notebook split sentences with spaCy instead).
nltk.download('punkt')

# Sentence-embedding model; passed as `model` to apply_model01_labeling() below.
model_sent_transformer = SentenceTransformer('all-MiniLM-L6-v2')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
from datasets import load_dataset
from datasets import Dataset, DatasetDict

  and should_run_async(code)


##### Import the full Reddit dataset
- remove duplicates to make sure that posts are unique
- TitleText is the text column of interest

In [7]:
# Read the complete Reddit export from Drive.
file_path = '/content/drive/My Drive/SWB-GVCEH/Complete_Data_v3.json'
reddit_data_df = pd.read_json(file_path)

print(f"\ncd_test shape: {reddit_data_df.shape}")
print(f"\n--------------- Columns: {reddit_data_df.columns}")

# Keep only the text-related columns, drop exact duplicate rows and renumber
# the remaining rows from 0 -- expressed as one chain producing a new frame.
select_cols = ['Subreddit', 'Title', 'Text', 'TitleText']
reddit_data_df = (
    reddit_data_df[select_cols]
    .drop_duplicates()
    .reset_index(drop=True)
)

print("\n------------------------------")
print(reddit_data_df.head())


cd_test shape: (11160, 23)

--------------- Columns: Index(['index', 'Subreddit', 'Title', 'Text', 'TitleText', 'relevance_score',
       'most_common_centroid_id', 'top_terms_from_centroid',
       'topics_from_centroid', 'Score_model2', 'label_model2', 'label_model1',
       'relevant_sentences', 'topic_num', 'Relevant_document',
       'Relevant_topic', 'topic_label', 'Sentiment_Full',
       'Sentence_Level_Sentiment_Compund',
       'Relevent_Sentence_Sentiment_Compund', 'BERT_sentiment_all',
       'BERT_sentiments_relevant_sentences', 'manual_label'],
      dtype='object')

------------------------------
    Subreddit                                              Title  \
0  VictoriaBC  True change around homelessness from the homel...   
1  VictoriaBC  New transitional housing facility on Douglas o...   
2  VictoriaBC   Where to buy a reasonably priced Christmas tree?   
3  VictoriaBC  Looking for a Christmas tree? Please consider ...   
4  VictoriaBC                           

#### Reddit Data Labelling - Model #1

#### Load Twitter Cluster Centroids

In [None]:
# Location on Drive of the saved Twitter cluster centroids (JSON).
file_path_twitter_centroids = "/content/drive/My Drive/SWB-GVCEH/twitter_cluster_centroids.json"

# Read the JSON file into plain Python lists
with open(file_path_twitter_centroids, "r") as f:
    centroids_list = json.load(f)

# Convert to a NumPy array; this is later passed as `centroids` and compared
# against sentence embeddings with cosine_similarity.
# NOTE(review): assumes one row per centroid with the same width as the
# sentence-embedding vectors -- confirm against how the file was produced.
cluster_centroids = np.array(centroids_list)
print(cluster_centroids)

[[ 0.07606746 -0.03684408  0.07050317 ... -0.04477242 -0.05458607
   0.06265755]
 [ 0.03922427 -0.00116647  0.05264552 ... -0.03459585 -0.04934484
   0.0509878 ]
 [ 0.08159573 -0.00581049  0.06172452 ... -0.05318972 -0.0436865
   0.03083117]
 ...
 [ 0.04692682  0.04268746  0.03507463 ... -0.02552383 -0.04925404
   0.03933679]
 [ 0.01194589  0.00625541  0.04830575 ... -0.04271128 -0.0092378
   0.0193102 ]
 [ 0.01649493  0.01140771  0.01363297 ... -0.05939158  0.01116832
   0.01196186]]


##### Functions to process reddit posts

In [None]:
# Function to split text into sentences
def spacy_sentence_split(text):
    """Segment `text` into sentences using the module-level spaCy pipeline `nlp`.

    Returns a list with one string per detected sentence, in document order,
    each stripped of leading and trailing whitespace.
    """
    sentences = []
    for span in nlp(text).sents:
        sentences.append(span.text.strip())
    return sentences

# Function to process a single Reddit post and determine its relevance
def process_post_for_relevance(post, model, centroids, threshold):
    """Score one post by how many of its sentences resemble a cluster centroid.

    The post is split into sentences, each sentence is embedded with `model`,
    and every embedding is compared (cosine similarity) with every centroid.
    A sentence counts as relevant when its best similarity exceeds `threshold`.

    Parameters:
    - post: str, the text to score. Non-string values (e.g. None / NaN from a
      DataFrame) and texts that yield no sentences are treated as not relevant.
    - model: object exposing `encode(sentences, show_progress_bar=False)`.
    - centroids: 2-D array-like of centroid vectors accepted by cosine_similarity.
    - threshold: float, strict lower bound on the best similarity of a sentence.

    Returns:
    - relevance_score: proportion of sentences above the threshold
      (0.0 when there is nothing to score).
    - most_common_centroid_ids: list of the centroid indices matched by the
      largest number of relevant sentences (ties included), or None when no
      sentence is relevant.
    """
    # Missing text cannot be tokenized; report it as not relevant instead of
    # letting one bad row abort a whole batch.
    if not isinstance(post, str):
        return 0.0, None

    sentences = spacy_sentence_split(post)
    # An empty sentence list produces an empty embedding array, which
    # cosine_similarity rejects -- answer directly.
    if not sentences:
        return 0.0, None

    sentence_embeddings = model.encode(sentences, show_progress_bar=False)
    # Rows correspond to sentences, columns to centroids.
    similarity = cosine_similarity(sentence_embeddings, centroids)
    best_scores = similarity.max(axis=1)
    best_centroid_ids = similarity.argmax(axis=1)

    is_relevant = best_scores > threshold
    relevance_score = is_relevant.mean()  # Proportion of sentences above the threshold

    # Count, per centroid, how many relevant sentences matched it best.
    centroid_counter = Counter(best_centroid_ids[is_relevant])
    if not centroid_counter:
        return relevance_score, None  # No relevant sentences

    # Keep every centroid that reaches the maximum count (ties included).
    max_count = max(centroid_counter.values())
    most_common_centroid_ids = [
        centroid_id
        for centroid_id, count in centroid_counter.items()
        if count == max_count
    ]
    return relevance_score, most_common_centroid_ids

In [None]:
# Parallel processing function
def parallel_relevance_computation(reddit_posts, model, centroids, threshold, max_workers=10):
    """Score many posts concurrently with process_post_for_relevance.

    Parameters:
    - reddit_posts: sized sequence of post texts.
    - model, centroids, threshold: forwarded unchanged to
      process_post_for_relevance for every post.
    - max_workers: int, size of the thread pool.

    Returns:
    - relevance_scores: tuple with one relevance score per post, in input order.
    - most_common_centroid_ids: tuple with one entry per post (a list of
      centroid ids or None), in input order.
    Both tuples are empty when `reddit_posts` is empty.
    """
    # zip(*[]) produces nothing to unpack into two names, so an empty batch
    # has to be answered before the pool is even created.
    if len(reddit_posts) == 0:
        return (), ()

    def score_post(post):
        return process_post_for_relevance(post, model, centroids, threshold)

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves input order; tqdm only needs the total for the bar.
        results = list(tqdm(executor.map(score_post, reddit_posts), total=len(reddit_posts)))

    # Unpack the (score, centroid_ids) pairs into two parallel tuples
    relevance_scores, most_common_centroid_ids = zip(*results)
    return relevance_scores, most_common_centroid_ids

##### Assign Model #1 labels

In [None]:
def apply_model01_labeling(dataframe, text_column, model, centroids, threshold_score=0.5, label_threshold=0.1):
    """
    Score every row's text against the centroids and attach the Model #1 results.

    Parameters:
    - dataframe: pd.DataFrame, modified in place; the function returns nothing.
    - text_column: str, name of the column holding the text to score.
    - model: embedding model, forwarded to parallel_relevance_computation.
    - centroids: cluster centroids, forwarded to parallel_relevance_computation.
    - threshold_score: float, forwarded as `threshold` to the relevance computation.
    - label_threshold: float, rows whose relevance score is greater than or
      equal to this value are labelled 1, all others 0.

    Columns written to `dataframe`: 'relevance_score',
    'most_common_centroid_id' and 'label_model1'.
    """
    texts = dataframe[text_column].tolist()
    scores, centroid_ids = parallel_relevance_computation(
        texts, model, centroids, threshold=threshold_score)

    # Store the per-row results next to the text they were computed from.
    dataframe['relevance_score'] = scores
    dataframe['most_common_centroid_id'] = centroid_ids

    # Binary label: 1 when the score reaches the cutoff, otherwise 0.
    meets_cutoff = dataframe['relevance_score'] >= label_threshold
    dataframe['label_model1'] = np.where(meets_cutoff, 1, 0)


In [None]:
# Label every post with Model #1 (sentence threshold 0.5, label threshold 0.1).
apply_model01_labeling(reddit_data_df, 'TitleText', model_sent_transformer, cluster_centroids, 0.5, 0.1)

# Select on the label column so this subset always agrees with 'label_model1':
# the label is assigned with `>=`, so a strict `> 0.1` comparison on the score
# would leave out posts that score exactly 0.1 even though they are labelled 1.
relevant_reddit_posts = reddit_data_df[reddit_data_df['label_model1'] == 1].reset_index(drop=True)
print(f"\nOriginal number of posts: {len(reddit_data_df.index)}")
print(f"\nNumber of relevant posts: {len(relevant_reddit_posts.index)}")

  0%|          | 0/11151 [00:00<?, ?it/s]


Original number of posts: 11151

Number of relevant posts: 3311


#### SetFit Relevancy Model - Model #2

In [None]:
import joblib

# NOTE(review): `trainer` is not defined in any cell visible before this one --
# presumably it comes from SetFit training cells that are not shown here; on a
# fresh kernel those must run first or this line raises NameError.
# Persist the trainer object to Drive; the "Predict" cell reloads it from this path.
joblib.dump(trainer, '/content/drive/My Drive/SWB-GVCEH/reddit-setfit-model.joblib')

['/content/drive/My Drive/SWB-GVCEH/reddit-setfit-model.joblib']

##### Predict

In [None]:
import joblib

# Load the trainer
# joblib.load unpickles the file, which can execute arbitrary code -- only load
# artifacts that you (or a trusted collaborator) wrote to this Drive folder.
trainer_reddit = joblib.load('/content/drive/My Drive/SWB-GVCEH/reddit-setfit-model.joblib')

In [None]:
reddit_posts = reddit_data_df["TitleText"].tolist()

# Predict in batches: calling predict() once per post pays the model's
# per-call overhead for every single row, whereas a batch is embedded and
# classified together. Order is preserved, so predictions[i] belongs to
# reddit_posts[i].
PREDICT_BATCH_SIZE = 64

predictions = []

for start in tqdm(range(0, len(reddit_posts), PREDICT_BATCH_SIZE), desc="Predicting"):
    batch = reddit_posts[start:start + PREDICT_BATCH_SIZE]
    # predict() returns one prediction per input text; extend keeps them as
    # individual elements, exactly like appending prediction[0] per post.
    predictions.extend(trainer_reddit.model.predict(batch))

# Assign predictions back to your DataFrame (.item() unwraps each scalar)
reddit_data_df['label_model2_setfit'] = [p.item() for p in predictions]

Predicting:   0%|          | 0/11151 [00:00<?, ?it/s]

##### Print counts by label

In [None]:
# Count how many posts were assigned each value of the SetFit label column.
value_counts = reddit_data_df['label_model2_setfit'].value_counts()
print(value_counts)

1
0    10126
1     1025
Name: label_model2_setfit, dtype: int64


##### Reddit posts samples - relevant and not relevant, as classified by SetFit

In [None]:
# Renumber the rows from 0 in place, discarding the previous index.
reddit_data_df.reset_index(drop=True, inplace=True)

In [None]:
# Which SetFit label to inspect, and which post (by position) within that group.
select_label = 0
ind = 1

# Posts carrying the chosen label, renumbered from 0 so that `ind` addresses
# a position inside this subset rather than in the full frame.
label_mask = reddit_data_df['label_model2_setfit'] == select_label
temp_df = reddit_data_df[label_mask].reset_index(drop=True)

print( temp_df.loc[ind, 'TitleText'] )

New transitional housing facility on Douglas opens its doors. 


  and should_run_async(code)


#### Top2Vec Topic Modeling - Model #3