In [6]:
!pip3 install emoji
!pip3 install transformers
!pip3 install torch

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [7]:
import pandas as pd
import emoji
import re
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer

# Module-level WordNet lemmatizer instance (not referenced by clean_text below).
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw text value into a lowercase, placeholder-tagged string.

    Processing order: lowercase the text, replace emoji with their textual
    names (space-delimited), replace URLs with the token ``LINK``, replace
    ``@``-mentions with the token ``MENTION``, then collapse every run of
    whitespace to a single space and strip the ends.

    Args:
        text: Value to clean. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Require a word boundary plus "://" (or "www.") so that ordinary words
    # such as "awwww" or "httpd" are not mistaken for links.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' LINK ', text)
    # Only treat "@" as a mention when it does not follow a word character,
    # which leaves e-mail addresses intact.
    text = re.sub(r'(?<!\w)@\S+', ' MENTION ', text)
    text = re.sub(r'\s+', ' ', text)
    # The placeholders are padded with spaces, so trim what is left at the ends.
    return text.strip()

In [8]:
# CSV that the sampling cell reads from, and CSV that the sample is written to.
input_file_name = 'reddit_clean.csv'
output_file_name = 'reddit_labeled.csv'
# Misspelled alias kept because other cells still refer to `ouput_file_name`.
ouput_file_name = output_file_name

In [9]:
import pandas as pd

# Load the input CSV and keep only the columns written to the labeling file.
reddit_data = pd.read_csv(input_file_name)

selected_columns = ['id', 'title', 'selftext', 'subreddit', 'permalink']
selected_data = reddit_data[selected_columns].copy()
# Every row starts with label 0.
selected_data['label'] = 0

# Cap the sample size at the number of available rows (sampling without
# replacement raises ValueError when n exceeds it) and fix the seed so that
# re-running the cell selects the same rows.
sampled_data = selected_data.sample(n=min(len(selected_data), 50), random_state=42)

sampled_data.to_csv(ouput_file_name, index=False)

In [10]:
# Read the labeling CSV back from disk and preview its first rows.
labeled_data = pd.read_csv(ouput_file_name)
labeled_data.head()


Unnamed: 0,id,title,selftext,subreddit,permalink,label
0,15wh5a1,fires,they should hire people 24/7 in forest around ...,KelownaBC,https://www.reddit.com/r/KelownaBC/comments/15...,0
1,15vvis1,‘we are an army out there’: west kelowna fire ...,,KelownaBC,https://www.reddit.com/r/KelownaBC/comments/15...,0
2,15vtdzk,fire progression from 15-19th august. nasa fir...,,KelownaBC,https://www.reddit.com/r/KelownaBC/comments/15...,0
3,15u8bf1,it's very very bad,right now i am on top of knox. it looks very v...,KelownaBC,https://www.reddit.com/r/KelownaBC/comments/15...,0
4,15u7jfa,time laps of west kelowna fire. view from lake...,be safe everyone.,KelownaBC,https://www.reddit.com/r/KelownaBC/comments/15...,0


In [12]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.functional import softmax

# Pretrained BERT checkpoint with a 2-class sequence-classification head,
# plus the tokenizer that belongs to the same checkpoint.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Labeled seed rows, and the full pool with its label column set to 0.
labeled_data = pd.read_csv('reddit_labeled.csv')
unlabeled_data = pd.read_csv('reddit_clean.csv').assign(label=0)

class CustomDataset(Dataset):
    """Torch dataset that tokenizes the title and body text of DataFrame rows.

    Args:
        data: DataFrame with ``title``, ``selftext`` and ``label`` columns.
            Rows are looked up by position.
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text and label of the row at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_length``) and ``label`` (the row's ``label`` value).
        """
        row = self.data.iloc[idx]
        # Keep only the fields that are real strings (missing cells are not
        # ``str``) and join them. The previous one-line conditional expression
        # bound as `title if ... else (...)`, so `selftext` was dropped
        # whenever `title` was present.
        parts = [value for value in (row['title'], row['selftext']) if isinstance(value, str)]
        text = ' '.join(parts)
        label = row['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            # Drop only the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }

# Seed set for fine-tuning: at most 50 rows sampled from the labeled CSV
# (min() keeps sample() from being asked for more rows than exist).
seed_dataset = labeled_data.sample(n=min(len(labeled_data), 50))
train_dataset = CustomDataset(seed_dataset, tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Single pass over the seed set; there is no surrounding epoch loop.
model.train()
for batch in train_loader:
    # Clear gradients accumulated by the previous step before the new backward pass.
    optimizer.zero_grad()
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['label']
    # Labels are passed in so the loss can be read from the model output below.
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# Score every row of the pool; shuffle=False keeps loader order aligned with
# the positional index of `unlabeled_data`.
unlabeled_dataset = CustomDataset(unlabeled_data, tokenizer, max_length=512)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=32, shuffle=False)

model.eval()
uncertainty_scores = []
with torch.no_grad():
    for batch in unlabeled_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        logits = model(input_ids, attention_mask=attention_mask).logits
        probs = softmax(logits, dim=1)
        # Least-confidence score: 1 - max class probability. The max
        # probability on its own is a confidence, so ranking it in descending
        # order (as before) picked the rows the model was MOST sure about.
        uncertainty = 1.0 - torch.max(probs, dim=1).values
        uncertainty_scores.extend(uncertainty.tolist())

uncertainty_scores = torch.tensor(uncertainty_scores)
# The 50 most uncertain rows (fewer if the pool is smaller). argsort only
# yields valid positions, so no extra bounds filter is needed.
selected_indices = uncertainty_scores.argsort(descending=True)[:50]
# Hand plain Python ints to iloc rather than a torch tensor.
selected_examples = unlabeled_data.iloc[selected_indices.tolist()]

selected_examples.to_csv('next.csv', index=False)

Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 13.1MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 25.9kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 553kB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [00:15<00:00, 28.5MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
