In [126]:
import os

import torch
from transformers import AutoModel, AutoTokenizer, AutoConfig
import tensorflow as tf
from tensorflow import keras
import numpy as np
import pandas as pd
import regex as re
from sklearn.model_selection import train_test_split

In [127]:
# Load the pretrained "vinai/bertweet-base" checkpoint as a bare encoder. The load log
# below shows it is instantiated as a RobertaModel and that the lm_head weights are unused.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# Matching tokenizer (call form for transformers v4.x+), loaded with use_fast=False and
# normalization=True. The load log below reports that the `emoji` package is missing, so
# emoticons/emojis are not converted to text.
# NOTE(review): presumably normalization=True enables BERTweet's tweet normalization - confirm.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [128]:
# Dataset selector: used as the prefix of both the input CSV file name and the
# saved embedding file names.
mode = 'all'

In [129]:
def remove_arabic_chars(text):
    """Replace each Arabic-script character and each whitespace character with a space.

    The substitution is one-for-one, so the string length is unchanged and runs of
    removed characters become runs of spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # `re` is the third-party `regex` package (see the imports cell); it is what
    # provides the \p{Arabic} Unicode-script class.
    # The pattern is a raw string because "\p" and "\s" are not valid Python string
    # escapes; a plain literal only works by accident and warns on newer Pythons.
    return re.sub(r"[\p{Arabic}\s]", " ", text)

In [130]:
# Load the tweet CSV for the selected mode, drop rows with missing values (some rows
# come in blank), and blank out Arabic-script characters in the text column.
tweets = (
    pd.read_csv(mode + '_china_full_nort.csv')
    .dropna()
    .assign(text=lambda frame: frame["text"].apply(remove_arabic_chars))
)

In [131]:
def batch_tokenize_and_embed(tweets, batch_size = 200, max_length = 130):
    """Embed tweets with the module-level `tokenizer` and `bertweet` model.

    The tweets are tokenized and run through the model `batch_size` at a time
    (without gradient tracking), and each batch's `pooler_output` is concatenated
    in input order.

    Args:
        tweets: sliceable sequence of tweet strings.
        batch_size: number of tweets per forward pass.
        max_length: token length every tweet is padded and truncated to.

    Returns:
        The concatenated `pooler_output` tensors, one row per tweet, or an empty
        `torch.Tensor()` when `tweets` is empty.
    """
    # NOTE(review): add_special_tokens=False means no start token is added, while
    # pooler_output is presumably derived from the first token position - confirm
    # that this is the intended tweet-level embedding.
    # NOTE(review): BERTweet is documented with a 128-token limit; a tweet that
    # really fills max_length=130 tokens may overflow the position embeddings - confirm.
    pooled_batches = []

    for start in range(0, len(tweets), batch_size):
        # Slicing already clamps at the end of the sequence.
        batch = tweets[start : start + batch_size]
        print(f"Processing chunk {start} to {start + len(batch)}")
        tokens = tokenizer(batch, padding='max_length', max_length=max_length, truncation=True, add_special_tokens = False, return_tensors="pt")

        with torch.no_grad():
            outputs = bertweet(**tokens)

        # Collect per-batch results and concatenate once at the end, instead of
        # re-copying the growing tensor on every iteration.
        pooled_batches.append(outputs.pooler_output)

    if not pooled_batches:
        return torch.Tensor()
    return torch.cat(pooled_batches)

In [132]:
def embed_and_store(tweets, chunk_size=1000):
    """Embed tweets chunk by chunk and save each chunk's embeddings to disk.

    Each chunk of `chunk_size` tweets is passed to `batch_tokenize_and_embed` and the
    result is written with `torch.save` to
    `embeddings/<mode>_china_embedding_<start>.pt`, where `<start>` is the position
    of the chunk's first tweet and `<mode>` is the module-level `mode`.

    Args:
        tweets: sliceable sequence of tweet strings.
        chunk_size: number of tweets stored per output file.
    """
    output_dir = "embeddings"
    # torch.save does not create missing parent directories, so make sure the
    # output directory exists before the (expensive) embedding work starts.
    os.makedirs(output_dir, exist_ok=True)

    for start in range(0, len(tweets), chunk_size):
        # Slicing already clamps at the end of the sequence.
        chunk = tweets[start : start + chunk_size]
        filename = os.path.join(output_dir, mode + "_china_embedding_" + str(start) + ".pt")
        torch.save(batch_tokenize_and_embed(chunk), filename)

In [133]:
# Embed every cleaned tweet text and write the chunked embedding files.
embed_and_store(tweets["text"].tolist())

Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
Processing chunk 200 to 400
Processing chunk 400 to 600
Processing chunk 600 to 800
Processing chunk 800 to 1000
Processing chunk 0 to 200
P

In [109]:
# Debug probe: run the single tweet at position 25436 through the tokenizer and model.
# Unlike batch_tokenize_and_embed, this call uses add_special_tokens=True and does not
# truncate. The forward pass is not wrapped in torch.no_grad(), which is why the
# displayed output carries a grad_fn.
a = tokenizer([list(tweets.text)[25436]], padding='max_length', add_special_tokens=True, max_length=130, return_tensors="pt")
bertweet(**a)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.3403,  0.2427,  0.0204,  ...,  0.1723, -0.0912, -0.0112],
         [ 0.4755,  0.0509,  0.3595,  ...,  0.4801,  0.4298, -0.5607],
         [ 0.0320,  0.0301,  0.1053,  ...,  0.3057, -0.2156,  0.2590],
         ...,
         [-0.3403,  0.2427,  0.0204,  ...,  0.1723, -0.0912, -0.0112],
         [-0.3403,  0.2427,  0.0204,  ...,  0.1723, -0.0912, -0.0112],
         [-0.3403,  0.2427,  0.0204,  ...,  0.1723, -0.0912, -0.0112]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[ 3.6026e-01, -2.0943e-01, -6.9925e-02, -1.5168e-01,  3.9827e-02,
          8.7728e-03,  1.8579e-01, -2.2495e-01,  1.7258e-01, -1.2914e-01,
         -1.7045e-01, -1.2448e-01, -1.9424e-01,  4.6071e-02,  2.5059e-01,
         -6.9921e-02, -1.4975e-01,  1.6202e-02, -4.7156e-02,  1.3567e-01,
         -1.3274e-01, -1.1450e-01,  3.6526e-01,  1.2711e-02, -7.4361e-03,
          8.4331e-02, -2.5667e-01,  6.6397e-02,  3.4457e-02, -1.834

In [95]:
# Show the cleaned text of the tweet at position 25436 (the one used in the debug probe).
list(tweets.text)[25436]

'#ISLAM IS RESPECTED IN #CHINA.  71K VIEWS SO FAR.  PLEASE SHARE, THANK YOU!  #BREAKING #Muslim #Muslims #salamaleikum #MuslimsinChina #HumanRights  #     #      #         #        #      _      #       #      _     _     _    _        #         #    _      #    _   _    https://t.co/PHm4zYiJyk'

In [96]:
# Show the cleaned text of the tweet at position 1234.
list(tweets.text)[1234]

'A journey along the Chinese border – Ili https://t.co/7kPkbOAzZ4 https://t.co/zSuFYhWTM4'