In [None]:
# In case the environment is not set up properly, uncomment and run the following:
# !pip install --user spacy==3.1.6
# !pip install --user coreferee
# !python -m spacy download en_core_web_lg
# !python -m coreferee install en
# !pip install tensorflow
# !pip install negspacy
# !pip install spacy-langdetect
# !pip install scikit-learn
# !pip install seaborn
# !pip install torch
# !pip install transformers

Let's load all the packages we need:

In [None]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from sklearn.manifold import TSNE
from seaborn import scatterplot

Let's load the dataset for the demo! It contains Twitter posts related to climate change that we collected. These tweets have not been cleaned yet.

In [None]:
# Raw demo tweets, read with a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='./data/demo_test_tweets.csv')

Let's take a look at the first five tweets in the dataframe:

In [None]:
# Print the first five raw tweets, separated by blank lines.
# `.head(5)` is positional; a label slice such as `.loc[:5]` is end-inclusive
# and would yield six rows on the default integer index.
for tweet in df['full_tweet_text'].head(5):
    print(tweet)
    print()

They look kind of messy! Let's clean them up first!

In [None]:
# Execute the preprocessing script. The next cell calls `clean_tweet`, which is
# presumably defined by this script — TODO confirm.
%run "./src/preprocessing.py"

In [None]:
# Clean every raw tweet. `False` is forwarded as clean_tweet's second positional
# argument (its meaning is defined in the preprocessing script).
df['clean_text'] = df['full_tweet_text'].apply(
    lambda raw_text: clean_tweet(raw_text, False)
)

In [None]:
# Print the first five cleaned tweets, separated by blank lines.
# `.head(5)` is positional; a label slice such as `.loc[:5]` is end-inclusive
# and would yield six rows on the default integer index.
for tweet in df['clean_text'].head(5):
    print(tweet)
    print()

Looks better! Let's run spaCy's dependency parser on one of them!

In [None]:
# Plain spaCy pipeline (the belief-extraction component is only added in a later cell).
nlp = spacy.load(name="en_core_web_lg")

# Alternative example sentences to try instead of the first cleaned tweet:
# txt = "The MSM would have you to believe that evidence is overwhelming that manmade climate change is modest and benign, and CO2 emissions are beneficial."
# txt = "World can likely capture and store enough carbon dioxide to meet climate targets but there are many other factors in mitigating climate change"
txt = df.loc[0, 'clean_text']

# Parse the text and render its dependency tree inline in the notebook.
doc = nlp(txt)
displacy.render(doc, style="dep", jupyter=True)


We have created a component that extracts beliefs from a tweet: 

In [None]:
# Execute the belief-extraction script. The next cell calls `add_to_pipe`, which
# is presumably defined by this script — TODO confirm.
%run "./src/belief_extraction_spacy.py"

In [None]:
# Start from a fresh pipeline and hand it to add_to_pipe; process_text below
# reads the resulting `doc._.beliefs` extension.
nlp = spacy.load(name="en_core_web_lg")
add_to_pipe(nlp)

def process_text(text):
    """Extract the cleaned beliefs and subjects found in one piece of text.

    The text is run through the module-level `nlp` pipeline, and every item in
    `doc._.beliefs` is asked for its `clean_belief()` and `clean_subject()`.
    Items whose cleaned belief is empty are skipped entirely, so their subjects
    are skipped too.

    Args:
        text: The (already cleaned) text to analyse.

    Returns:
        A `(beliefs, subjects)` tuple of two lists, each the concatenation of
        the per-item results in document order.
    """
    beliefs_out, subjects_out = [], []
    for candidate in nlp(text)._.beliefs:
        belief_items = candidate.clean_belief()
        subject_items = candidate.clean_subject()
        if len(belief_items) == 0:
            # Nothing usable was extracted for this candidate.
            continue
        beliefs_out.extend(belief_items)
        subjects_out.extend(subject_items)
    return beliefs_out, subjects_out

# One (beliefs, subjects) tuple per tweet, as returned by process_text.
df['beliefs'] = df['clean_text'].map(process_text)

Let's look at the beliefs extracted from the first tweet:

In [None]:
# Row label 0 holds the first tweet's (beliefs, subjects) tuple; element 0 is the belief list.
df.loc[0, 'beliefs'][0]

For each tweet, the belief extraction code will extract subjects and beliefs. 

Next, let's write all beliefs into a dataframe:

In [None]:
# Flatten the per-tweet extraction results into two parallel, lower-cased lists.
belief_dict = {'belief': [], 'subject': []}

for extracted in df['beliefs']:
    # Element 0 of each result is the list of belief entries; each entry is
    # indexed as [0] = belief text, [1] = subject text. An empty list simply
    # contributes nothing.
    for entry in extracted[0]:
        belief_dict['belief'].append(entry[0].lower())
        belief_dict['subject'].append(entry[1].lower())

In [None]:
# Two columns, 'belief' and 'subject', with one row per extracted belief.
belief_df = pd.DataFrame.from_dict(belief_dict)

In [None]:
# Preview the first five belief/subject rows.
belief_df.head(n=5)

Now we have the belief texts and the corresponding subject for each belief! In order to quantify the beliefs and create the belief landscape, we need to generate a sentence embedding for each belief:

In [None]:
# Execute the embedding script. The next cell calls `embed_list`, which is
# presumably defined by this script — TODO confirm.
%run "./src/embedding/embeddings_climatebert.py"

In [None]:
# Embed every belief text; embed_list receives a plain Python list of strings.
embeddings = embed_list(belief_df['belief'].tolist())

Done! Let's look at the belief embeddings:

In [None]:
# Bare expression so the notebook renders the value returned by embed_list.
embeddings

Let's put them into a dataframe together with the belief and subject texts, so we know the corresponding subject for each embedding:

In [None]:
# Side-by-side join: the belief/subject text columns first, then one column
# per embedding dimension.
embeddings_df = pd.concat(
    objs=[pd.DataFrame(belief_dict), pd.DataFrame(embeddings)],
    axis="columns",
)

In [None]:
# Preview the first five rows of the combined text + embedding frame.
embeddings_df.head(n=5)

As you may have noticed, each embedding is 768-dimensional, which is impossible to visualize directly! So we need to reduce the dimensionality, first with PCA and then with t-SNE:

In [None]:
from sklearn.decomposition import PCA

# First, run PCA to reduce the embeddings to a manageable number of dimensions.
# Only the numeric embedding columns are used: the text columns are dropped,
# and so are the boolean 'climate'/'hits' columns that a later cell adds to
# embeddings_df (errors='ignore' keeps this working on a fresh kernel, where
# they do not exist yet). This makes the cell safe to re-run out of order.
embedding_matrix = embeddings_df.drop(
    columns=['belief', 'subject', 'climate', 'hits'], errors='ignore'
).values

# A fixed random_state keeps the projection reproducible in case scikit-learn
# selects a randomized SVD solver for this input size.
pca_reduction = PCA(n_components=30, random_state=42)
pca_results = pca_reduction.fit_transform(embedding_matrix)
pca_results


In [None]:
# Project the PCA output down to 2-D with t-SNE.
# random_state is fixed because init='random' otherwise produces a different
# layout on every run, and a later cell filters points with a hand-picked
# bounding box in these coordinates. Re-check those bounds against the scatter
# plot whenever the seed, the data or the parameters change.
# NOTE(review): newer scikit-learn releases rename `n_iter` to `max_iter` —
# confirm against the installed version.
tsne_embedded = TSNE(
    n_components=2,
    learning_rate='auto',
    init='random',
    perplexity=50,
    n_iter=3000,
    random_state=42,
).fit_transform(pca_results)

We want to visualize these belief embeddings in a two-dimensional plot, so we choose 2 components for t-SNE.

We want to pick a subject that we are interested in and better understand where it sits in the belief landscape:

In [None]:
# How many beliefs were extracted for each distinct subject.
embeddings_df.loc[:, 'subject'].value_counts()

That doesn't give us a lot of information. Let's look at all the subjects:

In [None]:
# All distinct subjects as one comma-separated string, in value_counts order.
",".join(embeddings_df['subject'].value_counts().index)

We may observe that there are a lot of subjects related to "climate", e.g. "climate policy". Let's look at where those subjects are located in the belief landscape:

In [None]:
# 2-D belief landscape, coloured by whether the belief's subject contains "climate".
scatterplot(
    x=tsne_embedded[:, 0],
    y=tsne_embedded[:, 1],
    hue=embeddings_df['subject'].str.contains("climate"),
)

There is a pretty tight cluster here, so let's filter those and have a look.

In [None]:
# Bounding box of the cluster of interest, in t-SNE coordinates.
xmin, xmax = -12, -5
ymin, ymax = -7.5, -2.4

tsne_df = pd.DataFrame(data=tsne_embedded, columns=["x", "y"])

# Flag beliefs whose subject mentions "climate" ...
embeddings_df['climate'] = embeddings_df['subject'].str.contains("climate")
# ... and beliefs whose 2-D position lies strictly inside the box.
embeddings_df['hits'] = (
    tsne_df['x'].gt(xmin) & tsne_df['x'].lt(xmax)
    & tsne_df['y'].gt(ymin) & tsne_df['y'].lt(ymax)
)

In [None]:
# Same landscape, now coloured by membership in the selected bounding box.
scatterplot(
    x=tsne_df['x'],
    y=tsne_df['y'],
    hue=embeddings_df['hits'],
)

Now we can look at various cuts of the data...

In [None]:
# Beliefs with a climate-related subject that fall inside the identified region
embeddings_df.loc[embeddings_df['climate'] & embeddings_df['hits']]

In [None]:
# Beliefs without a climate-related subject that fall inside the identified region
embeddings_df.loc[~embeddings_df['climate'] & embeddings_df['hits']]

In [None]:
# Beliefs with a climate-related subject that fall outside the identified region
embeddings_df.loc[embeddings_df['climate'] & ~embeddings_df['hits']]