# Get HuggingFace Roberta Hate Scoring Tranformer Model

In [1]:
!pip install transformers

Collecting transformers
  Using cached transformers-4.26.1-py3-none-any.whl (6.3 MB)
Collecting regex!=2019.12.17
  Using cached regex-2022.10.31-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (757 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
Collecting huggingface-hub<1.0,>=0.11.0
  Using cached huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
Installing collected packages: tokenizers, regex, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 regex-2022.10.31 tokenizers-0.13.2 transformers-4.26.1


In [2]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Preprocess text (username and link placeholders)
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

# Tasks:
# emoji, emotion, hate, irony, offensive, sentiment
# stance/abortion, stance/atheism, stance/climate, stance/feminist, stance/hillary

task='hate'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)

text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
scores = output[0][0].detach().numpy()
scores = softmax(scores)

# # TF
# model = TFAutoModelForSequenceClassification.from_pretrained(MODEL)
# model.save_pretrained(MODEL)

# text = "Good night 😊"
# encoded_input = tokenizer(text, return_tensors='tf')
# output = model(encoded_input)
# scores = output[0][0].numpy()
# scores = softmax(scores)

ranking = np.argsort(scores)
ranking = ranking[::-1]
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/700 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/499M [00:00<?, ?B/s]

1) not-hate 0.9168
2) hate 0.0832


# Query BigQuery

In [3]:
from typing import Union

import google.cloud.aiplatform as vertex_ai
import pandas as pd
from google.cloud import bigquery

In [4]:
bq_client = bigquery.Client()

In [None]:
# Wrapper to use BigQuery client to run query/job, return job ID or result as DF
def run_bq_query(sql: str) -> Union[str, pd.DataFrame]:
    """
    Input: SQL query, as a string, to execute in BigQuery
    Returns the query results as a pandas DataFrame, or error, if any
    """

    # Try dry run before executing query to catch any errors
    job_config = bigquery.QueryJobConfig(dry_run=True, use_query_cache=False)
    bq_client.query(sql, job_config=job_config)

    # If dry run succeeds without errors, proceed to run query
    job_config = bigquery.QueryJobConfig()
    client_result = bq_client.query(sql, job_config=job_config)

    job_id = client_result.job_id

    # Wait for query/job to finish running. then get & return data frame
    df = client_result.result().to_arrow().to_pandas()
    print(f"Finished job_id: {job_id}")
    return df

In [10]:
sql_query = """
SELECT text FROM `fake-news-bears.usa_congress_twitter.tweets`
WHERE text != ""
LIMIT 100;
"""

my_df = run_bq_query(sql_query)

Finished job_id: e65425cf-88b7-4ff6-a39c-dfb5d8c95975


In [11]:
my_df

Unnamed: 0,text
0,ICYMI: House Republicans Release Economic Plan...
1,Happy New Year! No matter the obstacles we fac...
2,Inflation continues to ravage the paychecks of...
3,"Happy Thanksgiving to all celebrating, and a s..."
4,"Americans don't deserve record inflation, high..."
...,...
95,This program is going to be a critical compone...
96,This threatened national and economic security...
97,"When Republicans took power in 2010, they defu..."
98,Insurers are required to cover mental health c...


# Score text using model
Given a dataframe with text, return with additional column scoring hate speech

In [38]:
def get_embedding_df(dataframe):
    encoded_series = dataframe['text'].apply(lambda x: tokenizer(x, return_tensors='pt'))
    features = encoded_series.apply(lambda x: model(**x))
    scores = features.apply(lambda x: x[0][0].detach().numpy())
    scores_softmax = scores.apply(lambda x: softmax(x))
    # https://github.com/cardiffnlp/tweeteval/blob/main/datasets/hate/mapping.txt
    dataframe['non-hate'] = scores_softmax.apply(lambda x: x[0])
    return dataframe
    


get_embedding_df(my_df[:10])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,text,non-hate
0,ICYMI: House Republicans Release Economic Plan...,0.940829
1,Happy New Year! No matter the obstacles we fac...,0.986421
2,Inflation continues to ravage the paychecks of...,0.919753
3,"Happy Thanksgiving to all celebrating, and a s...",0.984058
4,"Americans don't deserve record inflation, high...",0.920001
5,The true impact of inflation cannot be underst...,0.960717
6,border security = national security,0.941848
7,energy security = national security\n\nborder ...,0.960721
8,January 6th was a test of our Constitutional t...,0.976341
9,"The\u00a04,126-page, $1.8 trillion spending sp...",0.96731


In [36]:
df_sanity_check = pd.DataFrame({
'text': ['hate. the the duck is torturing the moose. fuck. it wants the world to burn.', 'love. the cat is loving', 'the goose does nothing']
})
get_embedding_df(df_sanity_check)

0      [0.7853068, 0.21469316]
1    [0.97172135, 0.028278705]
2     [0.89333934, 0.10666071]
Name: text, dtype: object
                                                text  non-hate
0  hate. the the duck is torturing the moose. fuc...  0.785307
1                            love. the cat is loving  0.971721
2                             the goose does nothing  0.893339


Unnamed: 0,text,non-hate
0,hate. the the duck is torturing the moose. fuc...,0.785307
1,love. the cat is loving,0.971721
2,the goose does nothing,0.893339


# Political Sentiment

In [37]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax

MODEL = f"cardiffnlp/xlm-twitter-politics-sentiment"

ps_tokenizer = AutoTokenizer.from_pretrained(MODEL)

# PT
ps_model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/643 [00:00<?, ?B/s]

Downloading (…)ncepiece.bpe.model";:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)"tokenizer.json";:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/975 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

In [42]:
def ps_get_embedding_df(dataframe):
    encoded_series = dataframe['text'].apply(lambda x: ps_tokenizer(x, return_tensors='pt'))
    features = encoded_series.apply(lambda x: ps_model(**x))
    scores = features.apply(lambda x: x[0][0].detach().numpy())
    scores_softmax = scores.apply(lambda x: softmax(x))
    # labels mapping https://github.com/cardiffnlp/tweeteval/blob/main/datasets/sentiment/mapping.txt
    dataframe['negative'] = scores_softmax.apply(lambda x: x[0])
    dataframe['neutral'] = scores_softmax.apply(lambda x: x[1])
    dataframe['positive'] = scores_softmax.apply(lambda x: x[2])
    return dataframe

In [43]:
df_political_sentiment = ps_get_embedding_df(my_df[:10])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == "__main__":


In [44]:
df_political_sentiment

Unnamed: 0,text,negative,neutral,positive
0,ICYMI: House Republicans Release Economic Plan...,0.849561,0.115934,0.034505
1,Happy New Year! No matter the obstacles we fac...,0.005235,0.012865,0.9819
2,Inflation continues to ravage the paychecks of...,0.914557,0.062803,0.02264
3,"Happy Thanksgiving to all celebrating, and a s...",0.004026,0.016981,0.978993
4,"Americans don't deserve record inflation, high...",0.510572,0.350333,0.139095
5,The true impact of inflation cannot be underst...,0.789537,0.18522,0.025243
6,border security = national security,0.240088,0.353584,0.406328
7,energy security = national security\n\nborder ...,0.168986,0.248429,0.582585
8,January 6th was a test of our Constitutional t...,0.033223,0.078125,0.888652
9,"The\u00a04,126-page, $1.8 trillion spending sp...",0.747279,0.213039,0.039683


# Write to BigQuery

In [46]:
sql_query = """
SELECT author_id, created_at, text, id FROM `fake-news-bears.usa_congress_twitter.tweets`
WHERE text != ""
"""

df_input = run_bq_query(sql_query)

Finished job_id: 827faf77-6641-44f4-997f-6b5062b77046


In [47]:
df_input.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49570 entries, 0 to 49569
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype              
---  ------      --------------  -----              
 0   author_id   49570 non-null  int64              
 1   created_at  49570 non-null  datetime64[ns, UTC]
 2   text        49570 non-null  object             
 3   id          49570 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(1)
memory usage: 1.5+ MB


In [48]:
df_input.head(3)

Unnamed: 0,author_id,created_at,text,id
0,1002630999052865536,2023-01-25 19:16:37+00:00,ICYMI: House Republicans Release Economic Plan...,1618326996916011008
1,1002630999052865536,2023-01-01 17:49:18+00:00,Happy New Year! No matter the obstacles we fac...,1609607710986608641
2,1004891731,2022-12-22 15:14:48+00:00,Inflation continues to ravage the paychecks of...,1605944953095454721


In [None]:
# Political Sentiment Scoring
df_political_sentiment = ps_get_embedding_df(df_input)

In [None]:
table_id = "fake-news-bears.teamwork.jyang_political_sentiment"


job_config = bigquery.LoadJobConfig(
    # Specify a (partial) schema. All columns are always written to the
    # table. The schema is used to assist in data type definitions.
    schema=[
        # Specify the type of columns whose type cannot be auto-detected. For
        # example the "title" column uses pandas dtype "object", so its
        # data type is ambiguous.
        bigquery.SchemaField("author_id", bigquery.enums.SqlTypeNames.STRING),
        bigquery.SchemaField("text", bigquery.enums.SqlTypeNames.STRING),
        # Indexes are written if included in the schema by name.
        # bigquery.SchemaField("negative", bigquery.enums.SqlTypeNames.FLOAT),
        # bigquery.SchemaField("neutral", bigquery.enums.SqlTypeNames.FLOAT),
        # bigquery.SchemaField("positive", bigquery.enums.SqlTypeNames.FLOAT),
    ],
    # Optionally, set the write disposition. BigQuery appends loaded rows
    # to an existing table by default, but with WRITE_TRUNCATE write
    # disposition it replaces the table with the loaded data.
    write_disposition="WRITE_TRUNCATE",
)

