In [4]:
import pandas as pd
import emoji
import re
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def clean_text(text):
    if not isinstance(text, str): return ""
    text = text.lower()
    text = emoji.demojize(text, delimiters=(" ", " "))
    text = re.sub(r'http\S+|www\S+', ' LINK ', text)
    return text

  "class": algorithms.Blowfish,


In [11]:
import pandas as pd

reddit_data = pd.read_csv('reddit.csv')

selected_columns = ['id', 'title', 'selftext', 'subreddit', 'permalink']
selected_data = reddit_data[selected_columns].copy()
selected_data['title'] = selected_data['title'].apply(clean_text)
selected_data['selftext'] = selected_data['selftext'].apply(clean_text)
selected_data['permalink'] = 'https://www.reddit.com' + selected_data['permalink']

selected_data['label'] = 0

sampled_data = selected_data.sample(n=50)

selected_data.to_csv('labeled.csv', index=False)

In [12]:
labeled_data = pd.read_csv('labeled.csv')
labeled_data.head()


Unnamed: 0,id,title,selftext,subreddit,permalink,label
0,160hprs,thought was reading satire,literally thought was on this sub and was wait...,CoronavirusCirclejerk,https://www.reddit.com/r/CoronavirusCirclejerk...,0
1,160hnsx,exploring the intricacies of toronto summer we...,has there been noticeable impact from the wild...,askTO,https://www.reddit.com/r/askTO/comments/160hns...,0
2,160hmzi,what wildfire evacuation notices mean steps to...,,PortlandOregon,https://www.reddit.com/r/PortlandOregon/commen...,0
3,160hh5j,it was violent event fire chief says of wildfi...,,ilovebc,https://www.reddit.com/r/ilovebc/comments/160h...,1
4,160hh4y,really exoticca too soon,,facepalm,https://www.reddit.com/r/facepalm/comments/160...,0


In [22]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.functional import softmax

labeled_data = pd.read_csv('labeled.csv')
unlabeled_data = pd.read_csv('reddit.csv')
unlabeled_data['label'] = 0

# Define BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Define a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        title = self.data.iloc[idx]['title']
        selftext = self.data.iloc[idx]['selftext']
        text = title if isinstance(title, str) else ''  + ' ' + selftext if isinstance(selftext, str) else ''
        label = self.data.iloc[idx]['label']
        
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'label': label
        }

# Define an active learning loop using uncertainty sampling
seed_dataset = labeled_data.sample(n=99)  # Start with a small seed dataset
train_dataset = CustomDataset(seed_dataset, tokenizer, max_length=128)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

num_iterations = 5  # Number of active learning iterations
for iteration in range(num_iterations):
    # Train the model on the current labeled dataset
    model.train()
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Apply uncertainty sampling to select new examples for labeling
    unlabeled_dataset = CustomDataset(unlabeled_data, tokenizer, max_length=128)
    unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=32, shuffle=False)
    
    model.eval()
    uncertainty_scores = []
    for batch in unlabeled_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        with torch.no_grad():
            logits = model(input_ids, attention_mask=attention_mask).logits
        probs = softmax(logits, dim=1)
        uncertainty = torch.max(probs, dim=1).values
        uncertainty_scores.extend(uncertainty.cpu().numpy())
    
    uncertainty_scores = torch.tensor(uncertainty_scores)
    selected_indices = uncertainty_scores.argsort(descending=True)[:50]
    selected_indices = selected_indices[selected_indices < len(unlabeled_data)]
    print(selected_indices, len(unlabeled_data), max(selected_indices))
    print(unlabeled_data.head())
    print(unlabeled_data.tail())
    selected_examples = unlabeled_data.iloc[selected_indices]
    
    # Add the newly labeled examples to the seed dataset
    seed_dataset = pd.concat([seed_dataset, selected_examples])
    
    # Update the labeled and unlabeled datasets for the next iteration
    labeled_data = seed_dataset
    unlabeled_data = unlabeled_data.drop(index=selected_indices)
    
# Save the updated labeled dataset to next.csv
seed_dataset.to_csv('next.csv', index=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([223,  38,  76,  77,  84,  65, 229, 155,  72,  71,  37,  69,  18, 221,
        165, 168,  22,  66, 203,  58, 192,  20,  24,  47,  21,  53,  30,  42,
         44,  45, 246, 189, 132, 131,  99, 234,  68, 133,  95, 172,  27,  48,
        195, 217, 158, 144,  29, 209, 235, 194]) 248 tensor(246)
                 author  author_flair_text  clicked  comments  created_utc  \
0            locodallas                NaN      NaN       NaN   1692918997   
1  JeanChretieninSpirit                NaN      NaN       NaN   1692918877   
2      RobotTomPeterson                NaN      NaN       NaN   1692918825   
3       PacificProvince                NaN      NaN       NaN   1692918453   
4          floppydisk69                NaN      NaN       NaN   1692918452   

   distinguished  edited       id  is_original_content  is_self  ...  saved  \
0            NaN     NaN  160hprs                  NaN      NaN  ...    NaN   
1            NaN     NaN  160hnsx                  NaN      NaN  ...    Na