In [None]:
!pip3 install emoji
!pip3 install transformers
!pip3 install torch

In [2]:
# CSV read by the sampling cell.
input_file_name = "reddit_filter.csv"
# CSV the sampled rows are written to (the variable name is referenced as-is
# by the cells that follow).
ouput_file_name = "reddit_labeled.csv"

In [3]:
import pandas as pd

# Load the input CSV.
reddit_data = pd.read_csv(input_file_name)

# Work on a copy so `reddit_data` stays untouched, and add a `label` column
# initialised to 0 for every row.
selected_data = reddit_data.copy()
selected_data['label'] = 0

# Draw up to 50 random rows. DataFrame.sample raises ValueError when n is
# larger than the number of rows, so cap n at the size of the frame.
sampled_data = selected_data.sample(n=min(len(selected_data), 50))

# NOTE: this overwrites the output file on every run.
sampled_data.to_csv(ouput_file_name, index=False)

In [5]:
# Read the sampled rows back from disk and display the first five.
labeled_data = pd.read_csv(ouput_file_name)
labeled_data.head(n=5)

Unnamed: 0,id,title,selftext,subreddit,permalink,label
0,14eo63g,williams lake fire department to practice hazm...,,WilliamsLakeNews,https://www.reddit.com/r/WilliamsLakeNews/comm...,0
1,132ppze,update: wildfire burning in the chilcotin west...,,WilliamsLakeNews,https://www.reddit.com/r/WilliamsLakeNews/comm...,0
2,137u4hk,parks canada prescribed burn in banff becomes ...,,canada,https://www.reddit.com/r/canada/comments/137u4...,0
3,15bndew,big building fire in east van,looks like it's somewhere near guelph park. ho...,vancouver,https://www.reddit.com/r/vancouver/comments/15...,0
4,15hbe5d,vancouver police investigate stanley park fire...,,vancouver,https://www.reddit.com/r/vancouver/comments/15...,0


In [6]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.functional import softmax

# Two labeled CSV files, stacked row-wise into one frame (original row
# indices are kept, so index labels may repeat).
labeled_data_0 = pd.read_csv('reddit_labeled_0.csv')
labeled_data_1 = pd.read_csv('reddit_labeled_1.csv')
labeled_data = pd.concat((labeled_data_0, labeled_data_1))

# CustomDataset reads a `label` column from every row, so the unlabeled pool
# gets a placeholder value of 0.
unlabeled_data = pd.read_csv('reddit_clean.csv').assign(label=0)

# Tokenizer and two-class sequence classifier from the same pretrained
# checkpoint; the classification head is newly initialised and untrained.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

class CustomDataset(Dataset):
    """Dataset that tokenizes rows of a posts DataFrame on demand.

    Args:
        data: DataFrame with ``title``, ``selftext`` and ``label`` columns.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_length: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded text and label for the row at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            outputs with the leading batch dimension removed) and ``label``
            (the raw value from the ``label`` column).
        """
        # Fetch the row once instead of re-indexing the frame per column.
        row = self.data.iloc[idx]

        # Join title and body, skipping non-string values (e.g. NaN for a
        # missing selftext). The previous chained conditional expression
        # bound as `title if ... else (...)`, so the selftext was silently
        # dropped whenever the title was a string.
        text = ' '.join(
            part for part in (row['title'], row['selftext'])
            if isinstance(part, str)
        )
        label = row['label']

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension of size 1;
        # drop exactly that dimension so the DataLoader can re-batch items.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': label
        }

# Number of unlabeled rows exported for the next labeling round.
num_to_select = 50

# Run on a GPU when one is available; move the model before building the
# optimizer so both refer to parameters on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Train on at most 50 randomly drawn labeled rows.
seed_dataset = labeled_data.sample(n=min(len(labeled_data), 50))
train_dataset = CustomDataset(seed_dataset, tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# A single pass over the seed set.
model.train()
for batch in train_loader:
    optimizer.zero_grad()
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    loss = outputs.loss
    loss.backward()
    optimizer.step()

# shuffle=False keeps scores aligned with the row positions of unlabeled_data.
unlabeled_dataset = CustomDataset(unlabeled_data, tokenizer, max_length=512)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=32, shuffle=False)

model.eval()
score_chunks = []
for batch in unlabeled_loader:
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
    probs = softmax(logits, dim=1)
    # Least-confidence uncertainty: 1 - probability of the predicted class.
    # The raw max probability is a *confidence*; ranking it descending
    # selected the rows the model was most sure about.
    uncertainty = 1.0 - torch.max(probs, dim=1).values
    score_chunks.append(uncertainty.cpu())

# One score per unlabeled row, in row order (empty when the pool is empty).
uncertainty_scores = torch.cat(score_chunks) if score_chunks else torch.empty(0)

# Positions of the most uncertain rows, highest uncertainty first.
selected_indices = uncertainty_scores.argsort(descending=True)[:num_to_select]
selected_examples = unlabeled_data.iloc[selected_indices.tolist()]

selected_examples.to_csv('next.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


: 

: 