In [1]:
import csv
import json
import os
import urllib.parse
import urllib.request

import bs4
import numpy as np
import pandas as pd
import praw
import requests
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from transformers import pipeline

# Load the Reddit credentials from ./.env.json. The file is parsed once and
# the handle is closed by the `with` block (previously it was opened three
# times and never closed).
with open('./.env.json', 'r') as _env_file:
    _env = json.load(_env_file)

REDDIT_CLIENT_ID = _env['REDDIT_CLIENT_ID']
REDDIT_CLIENT_SECRET = _env['REDDIT_CLIENT_SECRET']
REDDIT_PASSWORD = _env['REDDIT_PASSWORD']

# Substrings used to decide whether a comment is relevant: a comment is only
# scored when its lower-cased text contains at least one of these.
KEYWORDS = ["lyft", "uber", "ride", "shryft", "driver", "fryft"]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the PRAW client from the credentials loaded in the first cell.
_reddit_auth = {
    "client_id": REDDIT_CLIENT_ID,
    "client_secret": REDDIT_CLIENT_SECRET,
    "password": REDDIT_PASSWORD,
    "user_agent": "testscript by u/fakebot3",
    "username": "cgamble_23",
}
reddit = praw.Reddit(**_reddit_auth)

# Echo the account PRAW reports for this session as a quick login check.
print(reddit.user.me())

cgamble_23


In [3]:
# SST-2 fine-tuned DistilBERT sentiment pipeline; inputs are truncated to at
# most 512 tokens.
distilbert_base_pipeline = pipeline(
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    max_length=512,
)

def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. Any token that starts with ``@`` and
    is longer than one character becomes ``@user``; any token that starts
    with ``http`` becomes ``http``. All other tokens (including empty ones
    produced by repeated spaces) are kept unchanged.

    Args:
        text: The raw string to normalise.

    Returns:
        The tokens re-joined with single spaces.
    """
    def _mask(token):
        # A bare "@" is left alone; only real mentions are replaced.
        if token.startswith('@') and len(token) > 1:
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

# Hub id of the Twitter RoBERTa checkpoint for the chosen TweetEval task.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping for the task: a tab-separated file whose second
# column is the label name. Rows with fewer than two columns (e.g. a trailing
# blank line) are skipped.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the
# hub id, so on the next run from_pretrained(MODEL) resolves to that directory
# instead of the hub. Save the tokenizer alongside the weights so the local
# copy is complete; with only the model saved, reloading the tokenizer from
# that directory fails because its vocabulary files are missing.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

def twitter_roberta_pipeline(inp):
    """Score ``inp`` with the Twitter RoBERTa sentiment model.

    The text is first normalised with ``preprocess`` (mentions and links are
    masked), tokenized, run through ``model``, and the three output logits are
    turned into probabilities with a softmax.

    Args:
        inp: Raw text to classify.

    Returns:
        A dict with the float probabilities under the keys ``'negative'``,
        ``'neutral'`` and ``'positive'`` (output indices 0, 1 and 2).
    """
    text = preprocess(inp)

    # Let the tokenizer enforce the 512-token limit, matching the
    # truncation=True / max_length=512 settings of the other two pipelines.
    # Slicing the encoded tensors after the fact also cut off the closing
    # special token of long inputs.
    encoded_input = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=512,
    )

    output = model(**encoded_input)
    # output[0] holds the logits; [0] selects the single sequence in the batch.
    scores = softmax(output[0][0].detach().numpy())

    return {
        'negative': float(scores[0]),
        'neutral': float(scores[1]),
        'positive': float(scores[2]),
    }

# Emotion classifier; top_k=None asks for a score for every label rather than
# only the best one. Inputs are truncated to at most 512 tokens.
distilbert_emotion_pipeline = pipeline(
    task="text-classification",
    model='bhadresh-savani/distilbert-base-uncased-emotion',
    top_k=None,
    truncation=True,
    max_length=512,
)

## use twitter

In [5]:
# Sanity check: one clearly negative sentence and one mixed sentence.
for _sample_text in (
    "Lyft is horrible. I hate lyft so much. Lyft is the worst thing ever.",
    "Lyft is not that good but it's not horrible. It definitely could be improved and isn't as good as Uber.",
):
    print(twitter_roberta_pipeline(_sample_text))

{'negative': 0.9841306805610657, 'neutral': 0.012793650850653648, 'positive': 0.003075728192925453}
{'negative': 0.3857213258743286, 'neutral': 0.39866918325424194, 'positive': 0.21560947597026825}


In [None]:
# Directory that holds the input college list and receives the CSV outputs.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# checkout before running.
PROJECT_DIR = "/Users/coopergamble/Desktop/usc/clubs/tcg/spring 24/sentiment-analysis"

# One college name per line; blank lines (e.g. a trailing newline) are ignored
# so they do not produce an empty "college" row.
with open(os.path.join(PROJECT_DIR, "colleges.txt"), "r") as _colleges_file:
    COLLEGES = [line.strip() for line in _colleges_file.read().split("\n") if line.strip()]

# Emotion label emitted by the emotion pipeline -> results column.
EMOTION_COLUMNS = {
    'sadness': 'Distilbert Emotion Sadness',
    'joy': 'Distilbert Emotion Joy',
    'love': 'Distilbert Emotion Love',
    'anger': 'Distilbert Emotion Anger',
    'fear': 'Distilbert Emotion Fear',
    'surprise': 'Distilbert Emotion Surprise',
}

# Score columns in the order they appear in results.csv.
SCORE_COLUMNS = [
    'Distilbert Base Score',  # +1 is POSITIVE, -1 is NEGATIVE
    'Twitter Roberta Negative',
    'Twitter Roberta Neutral',
    'Twitter Roberta Positive',
    *EMOTION_COLUMNS.values(),
]

RESULTS_DICT = {'College': [], **{column: [] for column in SCORE_COLUMNS}}

COMMENTS_DICT = {
    'College': [],
    'Comment': []
}


def _find_reddit_post_urls(college):
    """Return the reddit.com URLs linked from a Google search for ``college``.

    The search terms are passed through ``params`` so that names containing
    spaces, ``&`` or other reserved characters are URL-encoded correctly.
    """
    response = requests.get(
        'https://google.com/search',
        params={'q': f"{college} lyft program reddit"},
    )
    soup = bs4.BeautifulSoup(response.text, "html.parser")

    post_urls = []
    for anchor in soup.find_all('a'):
        href = anchor.get('href') or ''
        if not href.startswith('/url?'):
            continue
        # Result links look like /url?q=<target>&...; links without a q
        # parameter are skipped instead of raising.
        targets = urllib.parse.parse_qs(urllib.parse.urlsplit(href).query).get('q', [])
        if targets and "reddit.com" in targets[0]:
            post_urls.append(targets[0])
    return post_urls


def _score_comment(comment):
    """Run all three models on ``comment`` and return ``{results column: score}``."""
    base_result = distilbert_base_pipeline(comment)[0]
    # Sign the confidence so POSITIVE counts up and anything else counts down.
    signed_score = base_result['score'] if base_result['label'] == 'POSITIVE' else -base_result['score']

    roberta_result = twitter_roberta_pipeline(comment)

    emotion_entries = distilbert_emotion_pipeline(comment)
    # A single string input may come back wrapped in an outer list.
    if emotion_entries and isinstance(emotion_entries[0], list):
        emotion_entries = emotion_entries[0]
    # Look scores up by label rather than by position: with top_k=None the
    # entries are not in a fixed label order (the pipeline sorts them by
    # score), so positional indexing attributes scores to the wrong emotion.
    emotion_by_label = {entry['label'].lower(): entry['score'] for entry in emotion_entries}

    scores = {
        'Distilbert Base Score': signed_score,
        'Twitter Roberta Negative': roberta_result['negative'],
        'Twitter Roberta Neutral': roberta_result['neutral'],
        'Twitter Roberta Positive': roberta_result['positive'],
    }
    for label, column in EMOTION_COLUMNS.items():
        scores[column] = emotion_by_label[label]
    return scores


for college in COLLEGES:
    print("Analyzing", college)

    totals = dict.fromkeys(SCORE_COLUMNS, 0)
    n_comments = 0

    for post_url in _find_reddit_post_urls(college):
        try:
            submission = reddit.submission(url=post_url)
            # Drop "load more comments" placeholders; they have no .body and
            # previously caused the whole submission to be skipped.
            submission.comments.replace_more(limit=0)

            for top_level_comment in submission.comments:
                comment = top_level_comment.body

                # added in v1: only score comments that mention a keyword
                lowered = comment.lower()
                if not any(keyword in lowered for keyword in KEYWORDS):
                    continue

                # Score first so a comment is only recorded once every model
                # has produced a result for it.
                comment_scores = _score_comment(comment)

                n_comments += 1
                COMMENTS_DICT['College'].append(college)
                COMMENTS_DICT['Comment'].append(comment)
                for column, value in comment_scores.items():
                    totals[column] += value

        except Exception as exc:
            # Best effort: report the failure and move on to the next post.
            # KeyboardInterrupt is no longer swallowed.
            print(f"  Skipping {post_url}: {exc!r}")
            continue

    RESULTS_DICT['College'].append(college)
    for column in SCORE_COLUMNS:
        # Every column, including surprise, is averaged over the scored
        # comments; colleges with no matching comments keep a score of 0.
        RESULTS_DICT[column].append(totals[column] / n_comments if n_comments else 0)

results_df = pd.DataFrame(RESULTS_DICT)
results_df.to_csv(os.path.join(PROJECT_DIR, "results.csv"), index=False)

comments_df = pd.DataFrame(COMMENTS_DICT)
comments_df.to_csv(os.path.join(PROJECT_DIR, "comments.csv"), index=False)