In [1]:
!pip install -q transformers torch
from transformers import pipeline

model_name = "XerOpred/twitter-climate-sentiment-model"
classifier = pipeline('sentiment-analysis', model=model_name)

text = "some power and authority u can not spell, let alone define and wield, thas just more evidence of ur arrogant IGNORANCE same as u apply to ur climate change denial THEORY as if u know shit u do not, TOLD U DareDevil does not mean what the hell u think it does, HELL?? been there"
result = classifier(text)
print(result)

2023-12-06 02:00:53.739133: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[{'label': 'LABEL_1', 'score': 0.7463672161102295}]


In [46]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd

# Input table; later cells read its 'Content' column.
df = pd.read_csv('data/combined-usa.csv')

# Load the tokenizer and the sequence-classification model from the same
# checkpoint so that both can be passed explicitly to the helpers below.
model_name = "XerOpred/twitter-climate-sentiment-model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def sentiment_analysis(model, tokenizer, text):
    """Classify a single text and return its label, confidence and logits.

    Args:
        model: Callable invoked as ``model(**inputs)``; its result must expose
            ``.logits`` and the model must expose ``config.id2label``.
        tokenizer: Callable invoked with the text plus ``return_tensors="pt"``,
            ``truncation=True`` and ``max_length=512``.
        text: The text to classify; must be a ``str``.

    Returns:
        A 4-tuple ``(label, confidence, sentiment_score, logits)`` where
        ``label`` is ``id2label`` at the arg-max class, ``confidence`` is the
        softmax probability of that class, ``sentiment_score`` is the logit at
        index 1 minus the logit at index 0, and ``logits`` is the first row of
        the logits as a plain list.

    Raises:
        ValueError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise ValueError("Input text must be a string.")

    # Encode the text and run a forward pass without tracking gradients.
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        raw_logits = model(**encoded).logits

    # Turn the logits into class probabilities and pick the most likely class.
    class_probs = torch.nn.functional.softmax(raw_logits, dim=-1)
    best_idx = torch.argmax(class_probs, dim=1).item()
    best_prob = class_probs[0][best_idx].item()
    label_name = model.config.id2label[best_idx]

    # Score = logit[1] - logit[0].
    # NOTE(review): this treats index 1 as "positive" and index 0 as
    # "negative" -- confirm against the checkpoint's label mapping.
    first_row = raw_logits[0]
    score = first_row[1] - first_row[0]

    return label_name, best_prob, score.item(), first_row.tolist()

def process_in_batches(df, model, tokenizer, batch_size=1000):
    """Run ``sentiment_analysis`` on every row of ``df['Content']``.

    The frame is walked in consecutive row chunks of ``batch_size``. Note that
    the model is still called once per row; chunking only bounds the size of
    each intermediate result frame.

    Args:
        df: DataFrame with a 'Content' column. Each value is passed through
            ``str()`` before scoring.
            NOTE(review): missing values are therefore scored as the literal
            text produced by ``str()`` (e.g. 'nan') -- confirm this is intended.
        model: Model forwarded unchanged to ``sentiment_analysis``.
        tokenizer: Tokenizer forwarded unchanged to ``sentiment_analysis``.
        batch_size: Number of rows per chunk; must be at least 1.

    Returns:
        DataFrame indexed like ``df`` with four positional columns (0-3)
        holding the four values returned by ``sentiment_analysis``. For an
        empty ``df`` an empty frame with those four columns is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    total_rows = df.shape[0]
    if total_rows == 0:
        # pd.concat([]) raises "No objects to concatenate"; hand back an empty
        # frame with the same four positional columns instead.
        return pd.DataFrame(columns=range(4), index=df.index)

    def _score_row(value):
        # One model call per row, expanded into a 4-element Series so that
        # Series.apply assembles a DataFrame.
        return pd.Series(sentiment_analysis(model, tokenizer, str(value)))

    # Series.apply keeps the chunk's index, so no re-indexing is required.
    results = [
        df[start:start + batch_size]['Content'].apply(_score_row)
        for start in range(0, total_rows, batch_size)
    ]

    return pd.concat(results)

# Score every row and attach the four result columns to the frame
# (columns are matched by position, rows by index).
df[['Label', 'Confidence', 'SentimentScore', 'Logits']] = process_in_batches(
    df, model, tokenizer, batch_size=1000
)

# Persist the scored frame, without the index, for the following cells.
df.to_csv('data/distilbert-sentiment-usa-FINAL.csv', index=False)

# sample_text = "AnnCoulter Global Warming? Climate Change https://t.co/TYleYPslqu Looks like global warming's the trend Now it's climate change, they changed it again These scientists, they get grants from the gov Theyll say anything, or lose that money they love https://t.co/odBcgDMIfp"
# predicted_label, confidence, sentiment_score, logits = sentiment_analysis(model, tokenizer, sample_text)

# print(f"Label: {predicted_label}")
# print(f"Confidence: {confidence}")
# print(f"Sentiment score for the text: {sentiment_score}")
# print(f"Logits: {logits}")

In [54]:
# Re-load the scored CSV and keep only the first row for each
# (Username, Content) pair, then write the de-duplicated table.
duplicates_df = (
    pd.read_csv(
        'data/distilbert-sentiment-usa-FINAL.csv',
        lineterminator='\n',
        low_memory=False,
    )
    .drop_duplicates(subset=['Username', 'Content'], keep='first')
)
duplicates_df.to_csv('data/distilbert-usa.csv', index=False)