# Scrape data, label it, and write it to file

In [17]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementNotInteractableException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import time
import json

# Setup ChromeDriver
chrome_options = Options()
# Additional options can be added if needed
service = Service(executable_path=ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Define the URL to scrape
url = "https://www.readtangle.com/archive/"

# Navigate to the URL
driver.get(url)

# Wait for the page to load
WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

# Click "Load more posts" button until all articles are loaded
while True:
    try:
        load_more_button = WebDriverWait(driver, 5).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, 'button.js-load-more'))
        )
        driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
        time.sleep(1)  # Let any animations finish
        load_more_button.click()
        time.sleep(2)  # Wait for more articles to load
    except (TimeoutException, NoSuchElementException, ElementNotInteractableException):
        print("No more 'Load more posts' button found or timeout reached.")
        break

articles_data = []

# Find all article links on the page
article_links = driver.find_elements(By.CSS_SELECTOR, 'article a')

# Function to scrape article data
def scrape_article_data(article_url):
    # Open new tab
    driver.execute_script("window.open('');")
    # Switch to the new tab
    driver.switch_to.window(driver.window_handles[1])
    driver.get(article_url)

    # Wait for the article to load
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))

    # Scrape the article title and text sections here
    title = driver.find_element(By.CSS_SELECTOR, 'h1').text.strip()
    
    # Your existing function to get text after h3 until hr
    # Ensure get_texts_after_h3_until_hr function is defined outside the loop
    texts_by_section = {h3_id: get_texts_after_h3_until_hr(h3_id) for h3_id in h3_ids}

    # Close the current tab
    driver.close()
    # Switch back to the first tab
    driver.switch_to.window(driver.window_handles[0])

    return {"title": title, "texts_by_section": texts_by_section}

# IDs of the h3 elements to scrape texts for
h3_ids = [
    "todays-topic",
    "what-the-right-is-saying",
    "what-the-left-is-saying"
]

for link in article_links:
    article_url = link.get_attribute('href')
    article_data = scrape_article_data(article_url)
    articles_data.append(article_data)

    # No need to go back since we are not leaving the main page

# Process and save the scraped data
json_data = json.dumps(articles_data, indent=4)
print(json_data)

# Optionally, save to a file
# with open('articles_data.json', 'w') as f:
#     f.write(json_data)

# Close the driver
driver.quit()


No more 'Load more posts' button found or timeout reached.


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [22]:
with open('articles_data.json', 'w') as f:
    f.write(json_data)

In [None]:
# check we have all (most) of the expected titles (most recent & oldest few match up)
parsed_json = json.loads(json_data)

for e in parsed_json:
    print(e['title'])

In [37]:
articles_data = json.loads(articles_data)

In [45]:
# Assuming `articles_data` is loaded from your JSON file
# with open('your_file.json', 'r') as f:
#     articles_data = json.load(f)

labeled_data = []

for article in articles_data:
    for section_key, paragraphs in article["texts_by_section"].items():
        # Combine all paragraphs into a single text, ensuring it reads smoothly.
        combined_text = " ".join(paragraphs).replace("\\u2019", "'").replace("\\u201c", "\"").replace("\\u201d", "\"")
        
        # Assign labels based on section_key
        if section_key == "todays-topic":
            label = "neutral"
        elif section_key == "what-the-right-is-saying":
            label = "right-leaning"
        elif section_key == "what-the-left-is-saying":
            label = "left-leaning"
        
        # Append combined text and label to labeled_data
        labeled_data.append({"text": combined_text, "label": label})

# Now, labeled_data is ready to be used for training. It's also clean and consolidated.
# Optionally, you can write this out to a file for inspection or further processing.
with open('cleaned_labeled_data.json', 'w') as file:
    json.dump(labeled_data, file, indent=4)


### Further Steps for Neural Network Training:

*   **Tokenization and Encoding**: Convert the text into a format understandable by the network, usually involving converting text to sequences of integers representing tokens or words.
    
*   **Splitting Data**: Divide your data into training, validation, and test sets to evaluate the performance of your model accurately.
    
*   **Neural Network Architecture**: Design your neural network architecture. For sentiment analysis, recurrent neural networks (RNNs) or transformers are common choices due to their effectiveness in handling sequential data like text.
    
*   **Training**: Train your neural network on the processed and labeled data.

# Preprocess the data for training, test, and validation

In [5]:
from transformers import AutoModelForSequenceClassification, BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import json
import numpy as np
import torch

# Load the labeled data from file
with open('cleaned_labeled_data.json', 'r') as file:
    labeled_data = json.load(file)

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

texts = [article["text"] for article in labeled_data]
labels = [article["label"] for article in labeled_data]

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
labels_enc = label_encoder.fit_transform(labels)

# Tokenize, encode, and pad sequences in the list of texts
max_length = 512  # Define the maximum sequence length for BERT
encoding = tokenizer(texts, max_length=max_length, padding='max_length', truncation=True, return_tensors="pt")

input_ids = encoding['input_ids']
attention_masks = encoding['attention_mask']

# Split data into training and validation sets
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels_enc, random_state=42, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, attention_masks, random_state=42, test_size=0.1)

# Ensure all inputs and labels are torch tensors
train_inputs = train_inputs
validation_inputs = validation_inputs
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = train_masks
validation_masks = validation_masks


In [None]:
example = train_inputs[0]
decoded_example = tokenizer.decode(example)
decoded_example

In [7]:
# Define the model
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    problem_type="single_label_classification",
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, inputs, masks, labels):
        self.inputs = inputs
        self.masks = masks
        self.labels = labels

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            'input_ids': self.inputs[idx].clone().detach(),
            'attention_mask': self.masks[idx].clone().detach(),
            'labels': self.labels[idx].clone().detach()
        }

# Create the dataset
train_dataset = CustomDataset(train_inputs, train_masks, train_labels)
val_dataset = CustomDataset(validation_inputs, validation_masks, validation_labels)


In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import EvalPrediction, TrainingArguments, Trainer
import numpy as np

def compute_metrics(p: EvalPrediction):
    # p.predictions are logits from the model
    preds = np.argmax(p.predictions, axis=1)
    
    # Calculate accuracy
    accuracy = accuracy_score(y_true=p.label_ids, y_pred=preds)
    
    # Calculate precision, recall, and F1-score
    precision, recall, f1, _ = precision_recall_fscore_support(y_true=p.label_ids, y_pred=preds, average='weighted')
    
    # Return a dictionary of metrics
    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",   # Evaluation at the end of each epoch
    save_strategy="epoch",         # Save at the end of each epoch to match evaluation strategy
    load_best_model_at_end=True,   # Load the best model at the end based on metric
    metric_for_best_model="f1"     # Specify the metric to use for loading the best model
)

# Assuming compute_metrics function is defined correctly according to your task
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,  # Passing tokenizer to ensure correct padding
    compute_metrics=compute_metrics  # Define your metric computation function
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [13]:
trainer.train()

  'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.masks[idx], dtype=torch.long),
  'labels': torch.tensor(self.labels[idx], dtype=torch.long)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1271,0.087935,0.969101,0.969063,0.971842,0.969101
2,0.005,0.049727,0.988764,0.988764,0.988764,0.988764
3,0.003,0.032885,0.985955,0.985955,0.98655,0.985955


  'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.masks[idx], dtype=torch.long),
  'labels': torch.tensor(self.labels[idx], dtype=torch.long)
  'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.masks[idx], dtype=torch.long),
  'labels': torch.tensor(self.labels[idx], dtype=torch.long)


TrainOutput(global_step=1200, training_loss=0.1806035109050572, metrics={'train_runtime': 1095.5566, 'train_samples_per_second': 8.76, 'train_steps_per_second': 1.095, 'total_flos': 2525099469935616.0, 'train_loss': 0.1806035109050572, 'epoch': 3.0})

### Results from 1st training attempt
| Epoch | Training Loss | Validation Loss | Accuracy | F1       | Precision | Recall   |
|-------|---------------|-----------------|----------|----------|-----------|----------|
| 1     | 0.127100      | 0.087935        | 0.969101 | 0.969063 | 0.971842  | 0.969101 |
| 2     | 0.005000      | 0.049727        | 0.988764 | 0.988764 | 0.988764  | 0.988764 |
| 3     | 0.003000      | 0.032885        | 0.985955 | 0.985955 | 0.986550  | 0.985955 |


In [15]:
trainer.evaluate()

  'input_ids': torch.tensor(self.inputs[idx], dtype=torch.long),
  'attention_mask': torch.tensor(self.masks[idx], dtype=torch.long),
  'labels': torch.tensor(self.labels[idx], dtype=torch.long)


{'eval_loss': 0.04972740635275841,
 'eval_accuracy': 0.9887640449438202,
 'eval_f1': 0.9887640449438202,
 'eval_precision': 0.9887640449438202,
 'eval_recall': 0.9887640449438202,
 'eval_runtime': 11.1647,
 'eval_samples_per_second': 31.886,
 'eval_steps_per_second': 4.031,
 'epoch': 3.0}

# Inference

In [47]:
text = "The case against Israel is morally bankrupt."

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [48]:
logits = outputs.logits
logits.shape

torch.Size([1, 3])

In [49]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['right-leaning']
