In [1]:
#!pip install transformers

In [None]:
import numpy as np
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as ppb
import warnings
warnings.filterwarnings('ignore')

## Importing the dataset
We'll mount Google Drive and use pandas to read the two tweet datasets (one per network) into dataframes.

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
# NOTE(review): the CSV paths used later are relative ('drive/MyDrive/...'), so
# they presumably rely on the working directory being /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the two tweet tables (one per network) in a single pass over their paths.
rus_df, usa_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'drive/MyDrive/colab_input/all_rus_tweets.csv',
        'drive/MyDrive/colab_input/all_usa_tweets.csv',
    )
)

In [None]:
# Number of rows in the USA table whose Country is GERMANY.
len(usa_df.loc[usa_df['Country'] == 'GERMANY'])

102

## Loading the Pre-trained BERT model
Let's now load a pre-trained DistilBERT model (a lighter variant of BERT) together with its tokenizer.

In [None]:
# Backbone selection (DistilBERT): model class, tokenizer class and checkpoint name.
model_class = ppb.DistilBertModel
tokenizer_class = ppb.DistilBertTokenizer
pretrained_weights = 'distilbert-base-uncased'

## To use full BERT instead of DistilBERT, uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Instantiate the tokenizer first, then the model, from the chosen checkpoint.
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=267967963.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Lower-cased, non-null English tweet texts for ARGENTINA from the RUS table, as a plain list.
batch = (
    rus_df.loc[(rus_df['Country'] == 'ARGENTINA') & (rus_df['lang'] == 'en'), 'clean_text']
    .str.lower()
    .dropna()
    .tolist()
)


In [None]:
# Display the list of lower-cased tweet texts held in `batch` (the whole list is echoed).
batch

['what do football ️ + jazz have in common? saransk world cup ambassador igor butman tells us:',
 'lavrov met with heads of diplomatic missions of latin american and caribbean countries accredited in moscow',
 '️ peace for nagorno - karabakh : ceasefire , termination of all hostilities and russian peacekeepers:',
 'lavrov : our talks were very useful and substantive. argentina is one of russia ’s key partners in latin america',
 'within last 24 hours, mod_russia held 4 humanitarian actions. 1170 syrian people received humanitarian_aid ️',
 'take a look at international womens day greetings from russian diplomats️ “8марта | iwd2017”',
 'zakharova : on february 17-19, lavrov will attend the 53rd munich security conference msc2017',
 "take a look at beautiful world through russian diplomats' eyes! diplo photo",
 'file photo: ambassador karlov accompanies president putin at ataturk airport in istanbul , turkey , october 10, 2016',
 'zakharova : on august 9, the post-registration clinical t

In [None]:
def make_BERT_average(df, network_level, file_name, batch_size=100):
    """Average the position-0 ([CLS]) embedding of every tweet per country and save as CSV.

    For each distinct value of ``df.Country`` the lower-cased, non-null
    ``clean_text`` entries are encoded with the module-level ``tokenizer`` and
    run through the module-level ``model`` in mini-batches. The hidden state at
    sequence position 0 of each tweet is collected and the per-country mean
    vector becomes one row of the output table.

    Every usable tweet contributes to the mean: the final mini-batch of a
    country may be shorter than ``batch_size``, so countries with fewer than
    ``batch_size`` tweets still get a real average instead of an all-NaN row.

    Args:
        df: DataFrame with ``Country`` and ``clean_text`` columns.
        network_level: Value written to the ``network`` column of every row.
        file_name: Destination handed to ``DataFrame.to_csv``.
        batch_size: Maximum number of tweets per forward pass (default 100).

    Returns:
        The return value of ``DataFrame.to_csv(file_name)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Width of the NaN placeholder row for countries without any usable text;
    # falls back to 768 when the model does not expose a hidden size.
    hidden_size = getattr(getattr(model, 'config', None), 'hidden_size', 768)

    countries = list(df.Country.unique())
    CLS_means_list = []
    for country in tqdm(countries):
        texts = df.loc[df.Country == country, 'clean_text'].str.lower().dropna()

        # Collect per-batch feature arrays and concatenate once at the end
        # rather than growing one array inside the loop.
        feature_chunks = []

        # Step over *all* rows; the last slice is simply shorter when the row
        # count is not a multiple of batch_size.
        for start_row in tqdm(range(0, len(texts), batch_size)):
            sub_batch = texts.iloc[start_row:start_row + batch_size]
            tokenized = [tokenizer.encode(text, add_special_tokens=True) for text in sub_batch]

            # Right-pad every id sequence with 0 up to the longest in the batch.
            # NOTE(review): assumes the tokenizer's padding id is 0 — confirm
            # before switching to a different checkpoint.
            max_len = max(len(ids) for ids in tokenized)
            padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized])

            # Attend only to positions holding a non-zero (non-padding) id.
            attention_mask = torch.tensor(np.where(padded != 0, 1, 0))
            input_ids = torch.tensor(padded)

            with torch.no_grad():
                last_hidden_states = model(input_ids, attention_mask=attention_mask)

            # First output element, sequence position 0: one vector per tweet.
            feature_chunks.append(last_hidden_states[0][:, 0, :].numpy())

        if feature_chunks:
            # Accumulate in float64 so the mean is not computed in reduced precision.
            country_mean = np.concatenate(feature_chunks).mean(axis=0, dtype=np.float64)
        else:
            # No usable text for this country: emit an explicit NaN row.
            country_mean = np.full(hidden_size, np.nan)
        CLS_means_list.append(country_mean)

    dff = pd.DataFrame(CLS_means_list)
    dff['country'] = countries
    dff['network'] = network_level
    return dff.to_csv(file_name)

In [None]:
#make_BERT_average(rus_df,'RUS','drive/MyDrive/colab_input/BERT_sentence_averages_RUS.csv')

In [None]:
#make_BERT_average(usa_df,'USA','drive/MyDrive/colab_input/BERT_sentence_averages_USA.csv')

In [None]:
# Reload the saved per-country averages for each network, dropping rows that
# contain missing values, then stack both networks into one frame.
dff_usa = pd.read_csv('drive/MyDrive/colab_input/BERT_sentence_averages_USA.csv').dropna()
dff_rus = pd.read_csv('drive/MyDrive/colab_input/BERT_sentence_averages_RUS.csv').dropna()
dff = pd.concat([dff_usa, dff_rus])

In [None]:
# Embed all tweets of one country in a single forward pass, average the
# position-0 vectors, and splice that row into the reloaded per-country tables.
country = 'GERMANY'

# Lower-cased, non-null tweet texts of the chosen country as a one-column frame.
batch = usa_df.loc[usa_df.Country == country, 'clean_text'].str.lower().to_frame().dropna()
tokenized = batch['clean_text'].apply(lambda text: tokenizer.encode(text, add_special_tokens=True))

# Right-pad every id sequence with 0 up to the longest one.
max_len = max((len(ids) for ids in tokenized.values), default=0)
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized.values])

# Positions holding a non-zero id are attended to; zero (padding) positions are masked.
input_ids = torch.tensor(padded)
attention_mask = torch.tensor(np.where(padded != 0, 1, 0))

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)

# First output element, sequence position 0: one vector per tweet, then the mean over tweets.
features = last_hidden_states[0][:, 0, :].numpy()
x = features.mean(axis=0)

# One-row frame holding the averaged vector plus its labels.
dff_just_germany = pd.DataFrame(x).T
dff_just_germany['country'] = 'GERMANY'
dff_just_germany['network'] = 'USA'


def _renumber_and_drop_first_column(frame):
    """Rename columns[1:-2] to the integers 0..767, then drop the first column.

    NOTE(review): the first column is presumably the index written by an
    earlier ``to_csv`` call — confirm against the saved files.
    """
    renumbered = frame.rename(columns=dict(zip(list(frame.columns)[1:-2], range(768))))
    return renumbered[list(renumbered.columns)[1:]]


# The new row is appended to the USA table; rows are not re-sorted by country.
dff_usa = pd.concat([_renumber_and_drop_first_column(dff_usa), dff_just_germany])
dff_rus = _renumber_and_drop_first_column(dff_rus)
bert_df = pd.concat([dff_usa, dff_rus])

In [None]:
# Write the combined USA + RUS table of averaged vectors to a single CSV file.
bert_df.to_csv('drive/MyDrive/colab_input/BERT_sentence_averages.csv')