<a href="https://colab.research.google.com/github/alex-jk/SWB-GVCEH/blob/main/models/relevance_model/Relevance_Model_Reddit_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from google.colab import drive
#drive.mount('/content/drive')

##### Install necessary libraries

In [6]:
!pip install setfit
!pip install tqdm
!pip install -U sentence-transformers
!pip install --upgrade ipython ipykernel jupyter



##### Import libraries

In [7]:
import pandas as pd
import numpy as np
from setfit import SetFitModel
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
import spacy
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

model_sent_transformer = SentenceTransformer('all-MiniLM-L6-v2')
# Load spaCy model for sentence tokenization
nlp = spacy.load("en_core_web_sm")

model_msmarco = SentenceTransformer('msmarco-MiniLM-L-6-v3')
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
import os
from collections import Counter

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
# Load the pretrained SetFit model
model = SetFitModel.from_pretrained("sheilaflood/gvceh-setfit-rel-model2")

# Example text data
texts = ["Example text relevant to homelessness in Victoria.", "Irrelevant text about other topics."]

# Model makes predictions
predictions = model(texts)
print(predictions)

config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/118 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

model_head.pkl:   0%|          | 0.00/7.48k [00:00<?, ?B/s]

tensor([1, 0])


##### Import reddit datasets

In [None]:
file_path = '/content/drive/My Drive/SWB-GVCEH/VictoriaBC_data_updated.csv'
VictoriaBC_data_nodups = pd.read_csv(file_path)
VictoriaBC_data_nodups = VictoriaBC_data_nodups.drop_duplicates().reset_index(drop=True)

print("\nVictoria BC data ----------------")
print(VictoriaBC_data_nodups.shape)
print(VictoriaBC_data_nodups.columns)
print(VictoriaBC_data_nodups.head())

Sooke_data_nodups = pd.read_csv('/content/drive/My Drive/SWB-GVCEH/Sooke_data.csv')
Sooke_data_nodups = Sooke_data_nodups.drop_duplicates().reset_index(drop=True)

print("\nSooke data ----------------")
print(Sooke_data_nodups.shape)
print(Sooke_data_nodups.columns)
print(Sooke_data_nodups.head())

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/SWB-GVCEH/VictoriaBC_data_updated.csv'

In [None]:
file_url = 'https://drive.google.com/uc?id=1ANE3_UkNi2UGyQpHr8Ujz1QwBNKxmcTl'

# Reading the CSV file into a DataFrame
additional_reddit_df = pd.read_csv(file_url)
additional_reddit_df = additional_reddit_df.drop_duplicates().reset_index(drop=True)
additional_reddit_df.rename(columns={'Body': 'Text'}, inplace=True)

print("\nAdditional reddit data ----------------")
print(additional_reddit_df.shape)
print(additional_reddit_df.columns)
print(additional_reddit_df.head())

  and should_run_async(code)



Additional reddit data ----------------
(2057, 4)
Index(['Subreddit', 'Title', 'Text', 'Comments'], dtype='object')
               Subreddit                                              Title  \
0  OakBayBritishColumbia                            Oak Bay high right now.   
1  OakBayBritishColumbia  Food share for Ukrainian refugees on Vancouver...   
2  OakBayBritishColumbia     Lost Budgie - Cadboro Bay (Willows Elementary)   
3  OakBayBritishColumbia  What are 3 things I must see when visiting Oak...   
4  OakBayBritishColumbia  Yes, this is a leaf blower rant, but hear me o...   

                                                Text  \
0  Anyone know what the hell is going on at Oak B...   
1                                                NaN   
2  My friends blue budgie flew out of the house l...   
3                                            Thanks!   
4  I have a neighbour who uses his leaf blower to...   

                                            Comments  
0  ['https://www

##### Combine all data into one df

In [None]:
# Combining the DataFrames
select_cols = ['Subreddit', 'Title', 'Text']
combined_df = pd.concat([VictoriaBC_data_nodups[select_cols], Sooke_data_nodups[select_cols], additional_reddit_df[select_cols]], ignore_index=True)

# Resetting the index
combined_df.reset_index(drop=True, inplace=True)

combined_df['TitleText'] = combined_df['Title'].fillna('') + '. ' + combined_df['Text'].fillna('')

nan_titletext_df = combined_df[combined_df['TitleText'].isna()]
# Resetting the index
nan_titletext_df.reset_index(drop=True, inplace=True)

# Remove rows where 'TitleText' is NaN
combined_df.dropna(subset=['TitleText'], inplace=True)
# Resetting the index again after dropping rows
combined_df.reset_index(drop=True, inplace=True)

print("\nCombined data ----------------")
print(combined_df.shape)
print(combined_df.columns)
print(combined_df.head())


Combined data ----------------
(11160, 4)
Index(['Subreddit', 'Title', 'Text', 'TitleText'], dtype='object')
    Subreddit                                              Title  \
0  VictoriaBC  True change around homelessness from the homel...   
1  VictoriaBC  New transitional housing facility on Douglas o...   
2  VictoriaBC   Where to buy a reasonably priced Christmas tree?   
3  VictoriaBC  Looking for a Christmas tree? Please consider ...   
4  VictoriaBC                              Free clothing places?   

                                                Text  \
0  I've started my own organization to expose the...   
1                                                NaN   
2  Went to the Christmas tree farm today and thou...   
3                                                NaN   
4  Hey so long story short I moved here from Sask...   

                                           TitleText  
0  True change around homelessness from the homel...  
1  New transitional housing facili

  and should_run_async(code)


In [None]:
%cd ..
!rm -rf SWB-GVCEH

/


##### Push df combined_df_labelled to git

In [None]:
%cd /content
!apt-get install git
!git config --global user.name "alex-jk"
!git config --global user.email "alex.joukova@gmail.com"
!git clone https://github.com/alex-jk/SWB-GVCEH.git
%cd SWB-GVCEH

os.environ['GITHUB_PAT'] = 'ghp_xxx'
# Set your git remote URL to include the PAT for authentication
repo_url = 'https://github.com/alex-jk/SWB-GVCEH.git'  # Replace with your repository's URL
pat = os.environ['GITHUB_PAT']
repo_url_with_token = repo_url[:8] + pat + "@" + repo_url[8:]

!git remote set-url origin {repo_url_with_token}
# Check the current remote URL
!git remote -v
# Navigate to the repository directory, add, commit, and push the new CSV file
%cd /content/SWB-GVCEH

combined_df.to_csv('combined_df_labelled.csv', index=False)

In [None]:
!mv ./combined_df_labelled.csv ./models/relevance_model/

!git add 'models/relevance_model/combined_df_labelled.csv'
!git commit -m "Add combined_df_labelled CSV"
!git push origin main
# !GITHUB_PAT=ghp_xxx git push https://{GITHUB_PAT}@github.com/alex-jk/SWB-GVCEH.git main

# Reset the remote URL to the original without the PAT
!git remote set-url origin {repo_url}
print("CSV file pushed to GitHub.")

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean
Everything up-to-date
CSV file pushed to GitHub.


In [None]:
# ind = 0
# print(nan_titletext_df.iloc[ind])

##### Load twitter data

In [None]:
# List of file names
file_names = [
    'GVCEH-tweets-combined_2023-02-08.csv',
    'GVCEH-tweets-combined_2023-01-30.csv',
    'GVCEH-tweets-combined_2023-01-21.csv',
    'GVCEH-tweets-combined_2023-01-12.csv'
]

# Base URL for raw files in the GitHub repository
base_url = 'https://raw.githubusercontent.com/alex-jk/SWB-GVCEH/main/data/processed/twitter/github_actions/'

# Initialize a list to collect the DataFrames
dfs = []

for file_name in file_names:
    # Construct the full URL for the current file
    file_url = base_url + file_name
    # Read the CSV file
    current_df = pd.read_csv(file_url)
    # Append the DataFrame to the list
    dfs.append(current_df)

# Concatenate all DataFrames in the list
tweets_combined_df = pd.concat(dfs, ignore_index=True)
# Remove duplicates
tweets_combined_df = tweets_combined_df.drop_duplicates().reset_index(drop=True)

# Displaying the first few rows of the DataFrame
print(tweets_combined_df.shape)
print(tweets_combined_df.columns)
print(tweets_combined_df.head())

(5435, 17)
Index(['Unnamed: 0', 'text', 'scrape_time', 'tweet_id', 'created_at',
       'reply_count', 'quote_count', 'like_count', 'retweet_count',
       'geo_full_name', 'geo_id', 'username', 'num_followers',
       'search_keywords', 'search_neighbourhood', 'sentiment', 'score'],
      dtype='object')
   Unnamed: 0                                               text  \
0           0  RT pressjournal: Colonsay islanders and people...   
1           1  Colonsay islanders and people who have left th...   
2           7  @ArianeBurgessHI Serviced plots for 25k are ex...   
3           9  RT @VicBuilders: "25-unit townhome development...   
4          27  @OurNewHomecoach @laughatthemoon2 There is so ...   

                  scrape_time             tweet_id                 created_at  \
0  2023-02-07 03:20:43.040309  1622564995115503616  2023-02-06 11:56:55+00:00   
1  2023-02-07 03:20:43.040317  1622550741599625221  2023-02-06 11:00:16+00:00   
2  2023-02-07 03:20:51.207543  1622549961

##### Check twitter data

In [None]:
ind = 8
print(tweets_combined_df['search_neighbourhood'][ind])
print(tweets_combined_df['search_keywords'][ind])
print(tweets_combined_df['text'][ind])

burnside-gorge OR fairfield-gonzales OR hollywood park OR north park OR pauquachin OR salt spring island OR stadacona park OR victoria
(burnside-gorge OR fairfield-gonzales OR hollywood park OR north park OR pauquachin OR salt spring island OR stadacona park OR victoria) (anawin companion society OR safer victoria OR vtag OR mental health recovery partners, south island OR vancouver island mental health society OR greater victoria acting together OR the mustard seed OR yyj tenants union OR pacifica housing OR solid outreach OR housing OR camper) lang:en -is:retweet
Can you trust @Dave_Eby?One week he says he's not buying in Burnside GorgeThen they announce he (as Housing Minister) "bought a hotel (Capital City Centee) adjacent to Downtown"Adjacent = Burnside Gorge@Adam_Stirling, what do you think?Can Eby be trusted? https://t.co/6yYCcXAvvC @mattdellok @Stephen_Andrew @BC_Housing @Dave_Eby @VictoriaDRA "It would be easier to buy in Burnside Gorge" Minister Eby https://t.co/bSP0LdgPwJ


##### Load pre-trained twitter relevancy model

In [None]:
ind = 7
model(tweets_combined_df['text'][ind])

tensor(1)

##### Check that tweets are considered relevant by the model
- all tweets were found to be relevant by the model

In [None]:
# Define a function to make predictions
def get_prediction(text):
    prediction = model([text])[0]
    return prediction

# # Apply the model to each row in the 'text' column with a progress bar
# tqdm.pandas()  # Enable tqdm for pandas
# tweets_combined_df['relevant'] = tweets_combined_df['text'].progress_apply(get_prediction)

In [None]:
def extract_integer_from_tensor(tensor_val):
    return tensor_val.item()

# Apply this function to the entire column
# tweets_combined_df['relevant'] = tweets_combined_df['relevant'].apply(extract_integer_from_tensor)

# Print value counts for the 'relevant' column
# print(tweets_combined_df['relevant'].value_counts())

#### Method #1
##### Perform K-Means clustering of twitter data
Generate embeddings of twitter posts

In [None]:
twitter_embeddings = model_sent_transformer.encode(tweets_combined_df['text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/170 [00:00<?, ?it/s]

##### Run K-Means on twitter embeddings

In [None]:
# # Calculate sum of squared distances for different number of clusters
# Sum_of_squared_distances = []
# K = range(1,40)  # Adjust the range based on your dataset
# for k in K:
#     print(f"Current k: {k}")
#     km = KMeans(n_clusters=k, n_init=10)
#     km = km.fit(twitter_embeddings)
#     Sum_of_squared_distances.append(km.inertia_)

# # Plot the elbow graph
# plt.plot(K, Sum_of_squared_distances, 'bx-')
# plt.xlabel('Number of clusters')
# plt.ylabel('Sum of squared distances')
# plt.title('Twitter Embeddings Elbow Method For Optimal k')
# plt.show()

#####Assign clusters to embeddings

In [None]:
from sklearn.cluster import KMeans

k = 15  # number of clusters
kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
kmeans.fit(twitter_embeddings)

cluster_centroids = kmeans.cluster_centers_

tweets_combined_df['cluster'] = kmeans.labels_

In [None]:
print(tweets_combined_df['cluster'].unique())

ind = 2
cluster_tweets = tweets_combined_df[tweets_combined_df['cluster'] == ind].copy().reset_index(drop=True)
for i in range(0, 10):
  print("\n---------------------Printing tweet")
  print(cluster_tweets['text'][i])

[ 4  5 14 13 12 10  6  9  1 11  0  2  8  3  7]

---------------------Printing tweet
Victoria has some of the lowest rates of funding for public schools in the country... and for public hospitals... and for public housing. 
Notice a trend? https://t.co/4MiGBB31Xo

---------------------Printing tweet
@ianincolwood @JohnsonStBRDG I do kinda think we Victoria people are kinda throwing stones from glass houses. Calgary does build a lot of housing and the trajectory for mass transit is looking good, which I can't say applies to us.

---------------------Printing tweet
@DumbLayman @BirthGauge City of Vancouver and Victoria are fairly similar to their CMA surprisingly (although variations by neighbourhoods).

---------------------Printing tweet
@CityOfVictoria Victoria has a very left wing, anti small business City Council, so be very careful if you’re planning on opening a business here. For example, they underfund the police so crime and beak ins are out of control…

---------------------Pri

In [None]:
# Function to split text into sentences
def spacy_sentence_split(text):
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents]

import concurrent.futures

# Function to process a single Reddit post and determine its relevance
def process_post_for_relevance(post, model, centroids, threshold):
    sentences = spacy_sentence_split(post)
    sentence_embeddings = model.encode(sentences, show_progress_bar=False)
    sentence_relevance_scores = cosine_similarity(sentence_embeddings, centroids)
    max_scores = sentence_relevance_scores.max(axis=1)
    max_score_indices = sentence_relevance_scores.argmax(axis=1)
    # print(len(sentence_embeddings))
    # print(max_scores)
    # print(max_score_indices)

    relevant_sentences = max_scores > threshold
    relevant_centroid_ids = max_score_indices[relevant_sentences]
    # print(relevant_centroid_ids)
    relevance_score = relevant_sentences.mean()  # Proportion of sentences above the threshold

    centroid_counter = Counter(relevant_centroid_ids)
    # print(centroid_counter)
    if len(centroid_counter) == 0:
        most_common_centroid_ids = None  # No relevant sentences
    else:
        # Find the maximum count for the centroids
        max_count = centroid_counter.most_common(1)[0][1]
        # Get all centroid IDs that have the maximum count (ties included)
        most_common_centroid_ids = [centroid_id for centroid_id, count in centroid_counter.items() if count == max_count]

    return relevance_score, most_common_centroid_ids

# Parallel processing function
def parallel_relevance_computation(reddit_posts, model, centroids, threshold, max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        results = list(tqdm(executor.map(lambda post: process_post_for_relevance(post, model, centroids, threshold), reddit_posts), total=len(reddit_posts)))

    # Unpack the results into two separate lists
    relevance_scores, most_common_centroid_ids = zip(*results)
    return relevance_scores, most_common_centroid_ids

In [None]:
print(combined_df.shape)
check_post = combined_df['TitleText'][11]

print(check_post)

process_post_for_relevance(check_post, model_sent_transformer, cluster_centroids, threshold=0.5)

(11160, 4)
Has BC Housing confirmed that for new supportive housing there will be no screening of prospective tenants for violent offenders and no restrictions on drug supply or use?. [I just read this article; a commentary by an anonymous Victoria councillor where they state that BC Housing confirmed that for new supportive housing there will be no screening of prospective tenants for violent offenders and no restrictions on drug supply or use.](https://www.timescolonist.com/opinion/comment-to-fix-what-ails-the-city-we-need-to-change-course-7342054)

This sounds like a shit fucking idea. I tried looking for a press release or something from BC Housing that can back this up but haven't found anything. 


(0.75, [5, 0, 4])

In [None]:
# Apply parallel relevance computation to the DataFrame
relevance_scores, most_common_centroid_ids = parallel_relevance_computation(combined_df['TitleText'].tolist(), model_sent_transformer, cluster_centroids, threshold=0.5)

# Assign the results to separate columns in the DataFrame
combined_df['relevance_score'] = relevance_scores
combined_df['most_common_centroid_id'] = most_common_centroid_ids

  0%|          | 0/11160 [00:00<?, ?it/s]

In [None]:
# Filter relevant posts based on a relevance score threshold
combined_df['label_model1'] = np.where(combined_df['relevance_score'] >= 0.1, 1, 0)

relevant_reddit_posts = combined_df[combined_df['relevance_score'] > 0.1].reset_index(drop=True)
print(f"\nOriginal number of posts: {len(combined_df.index)}")
print(f"\nNumber of relevant posts: {len(relevant_reddit_posts.index)}")


Original number of posts: 11160

Number of relevant posts: 3312


##### Check relevant posts

In [None]:
ind = 5
print(relevant_reddit_posts['TitleText'][ind])

Who all in Victoria/BC/Federal have power to affect housing crisis?. Not sure how to properly phrase this, but I’m trying to find a list of the people who have at least some power to be able to help the housing and affordability crisis,whether or not they are actually doing so.

Everyone says “send a message to your MLA”, but who else is there?


##### Twitter Relevancy Model on Reddit sentences - Method #2

In [None]:
# Function to chunk a long text into smaller parts
def tokenize_into_sentences(text):
    return sent_tokenize(text)


def predict_relevance(text):
    sentences = tokenize_into_sentences(text)
    if not sentences:  # If no sentences, return default values
        return 0.0, 0

    # Assuming model(sentences) returns a tensor of predictions; convert this to a list
    predictions_tensor = model(sentences)
    predictions = predictions_tensor.tolist()  # Convert tensor to list

    # Calculate the score as a percentage of relevant sentences
    score = sum(predictions) / len(sentences) * 100
    score = round(score, 2)  # Round to 2 decimal places for readability

    # Determine binary output: 1 if 10% or more sentences are relevant, else 0
    binary_output = 1 if score >= 10 else 0

    return score, binary_output

In [None]:
ind = 0
print(combined_df['TitleText'][ind])

print("\n-----------------------")
predict_relevance(combined_df['TitleText'][ind])

True change around homelessness from the homeless leaders with REAL lived experience. I've started my own organization to expose the corruption around homelessness in Victoria BC. Also to educate others of what's truly contributing to poverty and homelessness, what the failures of the system are and what are the solutions, from those who are homeless or overcame homelessness and have the unique experience to really create real change and effect. Giving a new purpose and motivation for those who have experienced poverty all their lives, For us to find a way to have a purpose in our struggle, (Real Lived Experience) which brings true healing. In return brings real change, Which affects the entire community and world as a whole. 
first, awareness and education from the actual homeless need to be inherited, and opportunities can be created from the homless, for the homeless,
Please have a look and share. There is a forum where you can write about your own experiences with corruption in Vic

(63.64, 1)

In [None]:
tqdm.pandas(desc="Processing Posts")

combined_df[['Score_model2', 'label_model2']] = combined_df['TitleText'].progress_apply(
    lambda x: pd.Series(predict_relevance(x))
)

Processing Posts:   0%|          | 0/11160 [00:00<?, ?it/s]

In [None]:
%cd /content
!apt-get install git
!git config --global user.name "alex-jk"
!git config --global user.email "alex.joukova@gmail.com"
!git clone https://github.com/alex-jk/SWB-GVCEH.git
%cd SWB-GVCEH

os.environ['GITHUB_PAT'] = 'ghp_xxx'
# Set your git remote URL to include the PAT for authentication
repo_url = 'https://github.com/alex-jk/SWB-GVCEH.git'  # Replace with your repository's URL
pat = os.environ['GITHUB_PAT']
repo_url_with_token = repo_url[:8] + pat + "@" + repo_url[8:]

!git remote set-url origin {repo_url_with_token}
# Check the current remote URL
!git remote -v
# Navigate to the repository directory, add, commit, and push the new CSV file
%cd /content/SWB-GVCEH

In [None]:
import csv
combined_df.to_csv('combined_df_labelled_v2.csv', index=False, quoting=csv.QUOTE_NONNUMERIC, escapechar="\\")
combined_df.to_json('combined_df_labelled_v2.json', orient='records', lines=True)

In [None]:
!mv ./combined_df_labelled_v2.json ./models/relevance_model/

!git add 'models/relevance_model/combined_df_labelled_v2.json'
!git commit -m "Add combined_df_labelled_v2 json"
!git push origin main

# Reset the remote URL to the original without the PAT
!git remote set-url origin {repo_url}
print("json file pushed to GitHub.")

[main 413022b] Add combined_df_labelled_v2 json
 1 file changed, 11160 insertions(+)
 create mode 100644 models/relevance_model/combined_df_labelled_v2.json
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 2.51 MiB | 3.16 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/alex-jk/SWB-GVCEH.git
   66cda5c..413022b  main -> main
json file pushed to GitHub.


##### Extract relevant sentences

In [None]:
%cd ..
!rm -rf SWB-GVCEH

/content/SWB-GVCEH/models


##### Import the labelled json file

In [None]:
%cd /content
!apt-get install git
!git config --global user.name "alex-jk"
!git config --global user.email "alex.joukova@gmail.com"
!git clone https://github.com/alex-jk/SWB-GVCEH.git
%cd SWB-GVCEH

import os
os.environ['GITHUB_PAT'] = 'ghp_xxx'
# Set your git remote URL to include the PAT for authentication
repo_url = 'https://github.com/alex-jk/SWB-GVCEH.git'  # Replace with your repository's URL
pat = os.environ['GITHUB_PAT']
repo_url_with_token = repo_url[:8] + pat + "@" + repo_url[8:]

!git remote set-url origin {repo_url_with_token}
# Check the current remote URL
!git remote -v

In [None]:
%cd /content/SWB-GVCEH/models/relevance_model

combined_df_labelled_v2 = pd.read_json('combined_df_labelled_v2.json', lines=True)
print(f"\n---- df combined_df_labelled_v2 shape: {combined_df_labelled_v2.shape}")
print(f"\n Columns: {combined_df_labelled_v2.columns}")

/content/SWB-GVCEH/models/relevance_model

---- df combined_df_labelled_v2 shape: (11160, 10)

 Columns: Index(['Subreddit', 'Title', 'Text', 'TitleText', 'relevance_score',
       'most_common_centroid_id', 'Score_model2', 'label_model2',
       'label_model1', 'relevant_sentences'],
      dtype='object')


In [None]:
def extract_relevant_sentences(text):
    sentences = tokenize_into_sentences(text)
    if not sentences:  # If no sentences, return an empty string
        return ""

    predictions = model(sentences)

    # Extract sentences that are marked relevant by the model
    relevant_sentences = [sentence for sentence, prediction in zip(sentences, predictions) if prediction == 1]
    concatenated_sentences = " ".join(relevant_sentences)

    return concatenated_sentences

In [None]:
print("\nCombined data ----------------")
print(combined_df_labelled_v2.shape)
print("\n------")
print(combined_df_labelled_v2.iloc[189])
print("\nCurrent Directory:", os.getcwd())


Combined data ----------------
(11160, 10)

------
Subreddit                                                         VictoriaBC
Text                       [NEW: As requested, a [downloadable PDF](https...
relevance_score                                                     0.021622
most_common_centroid_id                                                 [14]
Score_model2                                                            2.05
label_model2                                                               0
label_model1                                                               0
relevant_sentences         For example, if only 10 out of every 1000 peop...
Name: 189, dtype: object

Current Directory: /content/SWB-GVCEH/models/relevance_model


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [None]:
tqdm.pandas(desc="Extracting relevant sentences")

combined_df_labelled_v2['relevant_sentences'] = combined_df_labelled_v2['TitleText'].progress_apply(extract_relevant_sentences)
combined_df_labelled_v2.to_json('combined_df_labelled_v2.json', orient='records', lines=True)

!git add 'combined_df_labelled_v2.json'
!git commit -m "Add combined_df_labelled_v2 json"
!git push origin main

print("json file pushed to GitHub.")

Extracting relevant sentences:   0%|          | 0/11160 [00:00<?, ?it/s]

[main b5c9f36] Add combined_df_labelled_v2 json
 1 file changed, 11160 insertions(+), 11160 deletions(-)
fatal: could not read Password for 'https://ghp_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@github.com': No such device or address
json file pushed to GitHub.


##### Relevant vs. non-relevant post examples
###### Cosine-similarity model samples

In [None]:
combined_df_labelled_v2[combined_df_labelled_v2['label_model2']==1]['TitleText'].iloc[12]

'BC Housing to convert Vic West property into temporary homeless shelter. '

##### SetFit Relevancy Model
###### Create a subset of data for labelling - random sampling

In [None]:
# Check label counts matrix
confusion_matrix = pd.crosstab(combined_df_labelled_v2['label_model1'], combined_df_labelled_v2['label_model2'], rownames=['Cosine Similarity'], colnames=['Twitter Relevancy Model'], margins=True)
print(confusion_matrix)
print("\n---------")
print(combined_df_labelled_v2[(combined_df_labelled_v2['label_model1']==0) & (combined_df_labelled_v2['label_model2']==1)]['TitleText'].iloc[0])

Twitter Relevancy Model     0     1    All
Cosine Similarity                         
0                        6349  1426   7775
1                        1165  2220   3385
All                      7514  3646  11160

---------
Office clothing donation. *^(K, so I did a search here to see if I could find an answer before making a brand new post but I could not and didn't find it so I apologize in advance if this has been asked a bunch of times!)*

I am looking to donate men's office clothing to an organization here in Victoria but have not been able to decide which one would be most appropriate. I looked for something like Dress for Success but only found one focused on women's wear. The items I am looking to donate (button up shirts, pants, ties) are either new or lightly used (freshly washed). 

I am also taking suggestions for donations of more casual men's clothing, it seems like Our Place would be a good bet?

Thanks a lot!


See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])


In [None]:
reddit_manual_labels_sample01 = pd.read_json('reddit_manual_labels_sample01.json', lines=True)
print(f"\n---- df reddit_manual_labels_sample01 shape: {reddit_manual_labels_sample01.shape}")
print(f"\n Columns: {reddit_manual_labels_sample01.columns}")


---- df reddit_manual_labels_sample01 shape: (110, 11)

 Columns: Index(['Subreddit', 'Title', 'Text', 'TitleText', 'relevance_score',
       'most_common_centroid_id', 'Score_model2', 'label_model2',
       'label_model1', 'relevant_sentences', 'manual_label'],
      dtype='object')


In [None]:
# Create final sample
rows_both_0 = combined_df_labelled_v2[(combined_df_labelled_v2['label_model1'] == 0) & (combined_df_labelled_v2['label_model2'] == 0)].copy().reset_index(drop=True)
rows_both_1 = combined_df_labelled_v2[(combined_df_labelled_v2['label_model1'] == 1) & (combined_df_labelled_v2['label_model2'] == 1)].copy().reset_index(drop=True)

sampled_0 = rows_both_0.sample(n=60, random_state=0) # 42
sampled_1 = rows_both_1.sample(n=50, random_state=0) # 42

final_sample = pd.concat([sampled_0, sampled_1])
final_sample.reset_index(drop=True, inplace=True)

final_sample['manual_label'] = np.nan

columns_to_match = ['TitleText']
final_sample = pd.merge(final_sample, reddit_manual_labels_sample01[columns_to_match], on=columns_to_match, how='left', indicator=True).query('_merge == "left_only"').drop('_merge', axis=1).reset_index(drop=True)

print(f"\nFinal sample shape: {final_sample.shape}")
print("\n-----------------")
print(final_sample.head())


Final sample shape: (109, 11)

-----------------
          Subreddit                                              Title  \
0        VictoriaBC            Motorcycle stolen right out of driveway   
1  SaanichPeninsula  Saanich’s famous Tuxedo Drive Christmas displa...   
2        VictoriaBC                                    Worst Uber Ever   
3   britishcolumbia  Did anyone listen to MOVE 103.5 FM this mornin...   
4  SaanichPeninsula  BC Rugby - Vikes Split Victories With UBCOB Ra...   

                                                Text  \
0  [Stolen Cafe-racer style motorcycle](http://im...   
1                                               None   
2                                               None   
3  I usually catch the Nat and Drew show on a dai...   
4                                               None   

                                           TitleText  relevance_score  \
0  Motorcycle stolen right out of driveway. [Stol...              0.0   
1  Saanich’s famous Tu

See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  return np.find_common_type(types, [])
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)
  common = np.find_common_type([values.dtype, comps_array.dtype], [])


In [None]:
print("\nCurrent Directory:", os.getcwd())
%cd /content/SWB-GVCEH/models/relevance_model


Current Directory: /content/SWB-GVCEH/models/relevance_model
/content/SWB-GVCEH/models/relevance_model


In [None]:
final_sample.to_json('reddit_data_sample.json', orient='records', lines=True)

!git add 'reddit_data_sample.json'
!git commit -m "Add reddit_data_sample json"
!git push origin main

print("json file pushed to GitHub.")

[main 32cae71] Add reddit_data_sample json
 1 file changed, 109 insertions(+), 110 deletions(-)
 rewrite models/relevance_model/reddit_data_sample.json (95%)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 31.44 KiB | 6.29 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/alex-jk/SWB-GVCEH.git
   f2ee472..32cae71  main -> main
json file pushed to GitHub.


##### Manual labelling

In [None]:
ind = 110
print("\n------")
print(final_sample[['label_model1', 'label_model2', 'manual_label']].iloc[ind])
print("\n------")
print(final_sample['TitleText'].iloc[ind])

final_sample.at[ind, 'manual_label'] = 0

print("\n------")
print(final_sample[['label_model1', 'label_model2', 'manual_label']].iloc[ind])


------


IndexError: single positional indexer is out-of-bounds

In [None]:
counts = final_sample['manual_label'].value_counts()
print(counts)

0.0    78
1.0    32
Name: manual_label, dtype: int64


In [None]:
!git pull --rebase origin main

From https://github.com/alex-jk/SWB-GVCEH
 * branch            main       -> FETCH_HEAD
Rebasing (1/1)[KSuccessfully rebased and updated refs/heads/main.


In [None]:
final_sample.to_json('reddit_manual_labels_sample01.json', orient='records', lines=True)

!git add 'reddit_manual_labels_sample01.json'
!git commit -m "Add reddit_manual_labels_sample01 json"
!git push origin main

print("json file pushed to GitHub.")

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (5/5), done.
Writing objects: 100% (5/5), 32.44 KiB | 5.41 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/alex-jk/SWB-GVCEH.git
   11f31bd..f2ee472  main -> main
json file pushed to GitHub.


##### Import train and test data
Import labelled json data

In [12]:
%cd /content/SWB-GVCEH/models/relevance_model

reddit_manual_labels_sample01 = pd.read_json('reddit_manual_labels_sample01.json', lines=True)
print(f"\n---- df reddit_manual_labels_sample01 shape: {reddit_manual_labels_sample01.shape}")
print(f"\n Columns: {reddit_manual_labels_sample01.columns}")

print(f"\n-----------------------")
print(reddit_manual_labels_sample01.head())

/content/SWB-GVCEH/models/relevance_model

---- df reddit_manual_labels_sample01 shape: (110, 11)

 Columns: Index(['Subreddit', 'Title', 'Text', 'TitleText', 'relevance_score',
       'most_common_centroid_id', 'Score_model2', 'label_model2',
       'label_model1', 'relevant_sentences', 'manual_label'],
      dtype='object')

-----------------------
    Subreddit                                              Title  \
0  VictoriaBC  Why the hell is the Bay Street Wendy’s always ...   
1  VictoriaBC       STOLEN MOUNTAIN BIKE Trek Lush SL 29 size XS   
2  VictoriaBC  Walking With Bruce. This is why I live in Bren...   
3  VictoriaBC  Statistically (based on long-term averages) to...   
4  VictoriaBC  Ma Millers Pub confirms it will be closing for...   

                                                Text  \
0  Every time I drive by the entire parking lot i...   
1                                               None   
2                                               None   
3             

  via input_request


##### import json train and test

In [13]:
import gdown

url = 'https://drive.google.com/uc?id=1-JyXzU-DfD_VLTnrGckfhpBoaS7q55go'
output = 'filename.extension'  # Replace 'filename.extension' with the appropriate filename and extension for your file
gdown.download(url, output, quiet=False)