In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 5.3 MB 23.4 MB/s 
[K     |████████████████████████████████| 7.6 MB 36.2 MB/s 
[K     |████████████████████████████████| 163 kB 43.9 MB/s 
[?25h

In [None]:
!pip3 install emoji==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji==0.6.0
  Downloading emoji-0.6.0.tar.gz (51 kB)
[K     |████████████████████████████████| 51 kB 5.4 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-0.6.0-py3-none-any.whl size=49734 sha256=2d600a6c2e8800d36f5ff7225c16ab21355c0b92701a987b35ffce82d256f147
  Stored in directory: /root/.cache/pip/wheels/4e/bf/6b/2e22b3708d14bf6384f862db539b044d6931bd6b14ad3c9adc
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-0.6.0


In [None]:
from transformers import pipeline
import pandas as pd
import numpy as np
from google.colab import files

In [None]:
# Load the subreddit post data.
# The first CSV column is used as the row index of `posts`.
posts_raw_url = 'XXXXXXXXXXXXXXXXXXX'
posts = pd.read_csv(filepath_or_buffer=posts_raw_url, index_col=0)

# Preview the first five rows as plain text.
print(posts.head(n=5))

  service  tech_flag  score  num_comments  upvote_ratio  \
0    Hulu          0    455            28          0.99   
1    Hulu          1    352            26          0.96   
2    Hulu          0    338            42          0.97   
3    Hulu          0    313            37          0.93   
4    Hulu          0    292            41          0.98   

                                               title  \
0  I’d take a Hotwheels commercial over yet anoth...   
1  Yes, let’s start the episodes 5 mins from the ...   
2  'Futurama' Revival Ordered at Hulu With Origin...   
3             battle of streaming services autoplay.   
4  Hulu Says ‘Prey’ Is Its Biggest Movie or TV Se...   

                                            all text  
0  I’d take a Hotwheels commercial over yet anoth...  
1  Yes, let’s start the episodes 5 mins from the ...  
2  'Futurama' Revival Ordered at Hulu With Origin...  
3           battle of streaming services autoplay..   
4  Hulu Says ‘Prey’ Is Its Bigges

In [None]:
# Perform sentiment analysis on the titles of each reddit post.

# First ("original") model: the cardiffnlp/twitter-roberta-base-sentiment checkpoint.
# Its outputs are stored under the orig_bert_* column prefix.

# Run model
original_bert = pipeline(model='cardiffnlp/twitter-roberta-base-sentiment')
results = original_bert(posts['title'].to_list())

# Process results
# `results` is expected to hold one {'label', 'score'} record per title, in the
# same order as `posts`. The frame is built in one step because DataFrame.append
# was removed in pandas 2.0, and it reuses posts' index so that the column
# assignments below line up row-for-row even when that index is not 0..n-1.
df = pd.DataFrame(results, columns=['label', 'score'], index=posts.index)
df.columns = ['orig_bert_label', 'orig_bert_score']

# Translate the LABEL_n ids into readable sentiment names.
df['orig_bert_processed_label'] = df['orig_bert_label'].replace({'LABEL_0':'negative', 'LABEL_1':'neutral', 'LABEL_2':'positive'})
posts['orig_bert_processed_label'] = df['orig_bert_processed_label']
posts['orig_bert_score'] = df['orig_bert_score']

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
# Second ("new") model: the cardiffnlp/twitter-roberta-base-sentiment-latest checkpoint.
# Its outputs are stored under the new_bert_* column prefix.

# Run model
new_bert = pipeline(model='cardiffnlp/twitter-roberta-base-sentiment-latest')
results = new_bert(posts['title'].to_list())

# Process results
# `results` is expected to hold one {'label', 'score'} record per title, in the
# same order as `posts`. The frame is built in one step because DataFrame.append
# was removed in pandas 2.0, and it reuses posts' index so that the column
# assignments below line up row-for-row even when that index is not 0..n-1.
df = pd.DataFrame(results, columns=['label', 'score'], index=posts.index)
df.columns = ['new_bert_label', 'new_bert_score']

# No LABEL_n mapping is applied for this model; its labels are stored as returned.
posts['new_bert_label'] = df['new_bert_label']
posts['new_bert_score'] = df['new_bert_score']

In [None]:
# Third ("generic") model: the Seethal/sentiment_analysis_generic_dataset checkpoint.
# Its outputs are stored under the gen_bert_* column prefix.

# Run model
gen_bert = pipeline(model='Seethal/sentiment_analysis_generic_dataset')
results = gen_bert(posts['title'].to_list())

# Process results
# `results` is expected to hold one {'label', 'score'} record per title, in the
# same order as `posts`. The frame is built in one step because DataFrame.append
# was removed in pandas 2.0, and it reuses posts' index so that the column
# assignments below line up row-for-row even when that index is not 0..n-1.
df = pd.DataFrame(results, columns=['label', 'score'], index=posts.index)
df.columns = ['gen_bert_label', 'gen_bert_score']

# Translate the LABEL_n ids into readable sentiment names.
df['gen_bert_processed_label'] = df['gen_bert_label'].replace({'LABEL_0':'negative', 'LABEL_1':'neutral', 'LABEL_2':'positive'})
posts['gen_bert_processed_label'] = df['gen_bert_processed_label']
posts['gen_bert_score'] = df['gen_bert_score']

In [None]:
# Normalise the label text of all three models to lowercase so that the
# same sentiment is spelled identically across models.
for label_col in ('orig_bert_processed_label', 'new_bert_label', 'gen_bert_processed_label'):
    posts[label_col] = posts[label_col].str.lower()

In [None]:
# Get the stats for each model.

# Stats for the original bert model.

# Number of posts and average score per service, tech flag and label.
orig_stats = (
    posts
    .groupby(['service', 'tech_flag', 'orig_bert_processed_label'])
    .agg({'orig_bert_processed_label': 'count', 'orig_bert_score': np.average})
    .rename(columns={
        'orig_bert_processed_label': 'orig_bert_label_counts',
        'orig_bert_score': 'orig_bert_avg_score',
    })
    .reset_index()
)

# Total number of posts per service and tech flag (summed over labels).
totals = (
    orig_stats
    .groupby(['service', 'tech_flag'])
    .agg({'orig_bert_label_counts': 'sum'})
    .rename(columns={'orig_bert_label_counts': 'orig_total'})
    .reset_index()
)

# Attach the totals (the frames share only service and tech_flag) and
# express each label count as a fraction of its group total.
orig_stats = orig_stats.merge(totals, on=['service', 'tech_flag'])
orig_stats['orig_pct'] = orig_stats['orig_bert_label_counts'] / orig_stats['orig_total']

In [None]:
# Stats for the new bert model.

# Number of posts and average score per service, tech flag and label.
new_stats = (
    posts
    .groupby(['service', 'tech_flag', 'new_bert_label'])
    .agg({'new_bert_label': 'count', 'new_bert_score': np.average})
    .rename(columns={
        'new_bert_label': 'new_bert_label_counts',
        'new_bert_score': 'new_bert_score_avg_score',
    })
    .reset_index()
)

# Total number of posts per service and tech flag (summed over labels).
totals = (
    new_stats
    .groupby(['service', 'tech_flag'])
    .agg({'new_bert_label_counts': 'sum'})
    .rename(columns={'new_bert_label_counts': 'new_total'})
    .reset_index()
)

# Attach the totals (the frames share only service and tech_flag) and
# express each label count as a fraction of its group total.
new_stats = new_stats.merge(totals, on=['service', 'tech_flag'])
new_stats['new_pct'] = new_stats['new_bert_label_counts'] / new_stats['new_total']

In [None]:
# Stats for the generic model.

# Number of posts and average score per service, tech flag and label.
gen_stats = posts.groupby(['service', 'tech_flag','gen_bert_processed_label']).agg({'gen_bert_processed_label':'count','gen_bert_score': np.average})
# The aggregated score column is called 'gen_bert_score'. The rename key must
# match that name exactly, otherwise the rename is silently skipped.
gen_stats = gen_stats.rename(columns={'gen_bert_processed_label':'gen_bert_label_counts', 'gen_bert_score':'gen_bert_score_avg_score'}).reset_index()

# Total number of posts per service and tech flag (summed over labels).
totals = gen_stats.groupby(['service','tech_flag']).agg({'gen_bert_label_counts':'sum'}).rename(columns={'gen_bert_label_counts':'gen_total'}).reset_index()

# Attach the totals (merged on the shared service and tech_flag columns) and
# express each label count as a fraction of its group total.
gen_stats = gen_stats.merge(totals)
gen_stats['gen_pct'] = gen_stats['gen_bert_label_counts']/gen_stats['gen_total']

In [None]:
# Combine the stats of all three models into one frame.
#
# Each stats frame names its label column differently, so a shared `label` key
# is added to every frame and all merges use it. Keying the later merge on the
# first frame's label column would not work: after an outer merge that column
# is NaN for any service/tech_flag/label combination the first model never
# produced, and the other models' rows for that label would end up on
# separate rows instead of side by side.
merge_keys = ['service', 'tech_flag', 'label']
all_stats = (
    orig_stats.assign(label=orig_stats['orig_bert_processed_label'])
    .merge(new_stats.assign(label=new_stats['new_bert_label']), on=merge_keys, how='outer')
    .merge(gen_stats.assign(label=gen_stats['gen_bert_processed_label']), on=merge_keys, how='outer')
    # The per-group totals were only needed to compute the *_pct columns.
    .drop(columns=['orig_total', 'new_total', 'gen_total'])
)

In [None]:
# Write the combined stats to a CSV file and hand that file to
# google.colab's files.download.
results_path = 'ml_results.csv'
all_stats.to_csv(results_path, encoding='utf-8-sig')
files.download(results_path)