# Anxiety/Confidence Analysis

Steps:
1. Using seed words for the anxiety class and confidence class, use word2vec to find the 1000 most "anxious" and 1000 most "confident" words
2. For each class of documents (either topic, province, or topic/province), calculate the proportion of anxious words used in that class relative to the proportion of anxious words used in total
3. Plot the resulting anxiety and confidence scores for each class

In [1]:
from utils import DTYPE, PARSE_DATES, PROV_CONSOLIDATION, CONSOLIDATED_PROVINCES, CONVERTERS, ANCHOR_NAMES, PROVINCE_COLOR_MAP
from tqdm.auto import tqdm
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
import glob
tqdm.pandas()


def prov_map(x):
    """Return the consolidated province for ``x``.

    Values without an entry in PROV_CONSOLIDATION are returned unchanged.
    """
    return PROV_CONSOLIDATION.get(x, x)


total_df = pd.read_csv(
    "../data/processed_data/total_tweet_dataset.csv",
    header=0,
    dtype=DTYPE,
    converters=CONVERTERS,
    parse_dates=PARSE_DATES,
)

# Index by tweet id and order chronologically *before* building the duplicate
# mask. The mask has to come from the new "id" index: computing it from the
# frame as read from disk (default integer index) marks nothing as duplicated,
# so repeated ids would silently survive.
total_df = total_df.set_index("id").sort_values("created_at")
total_df = total_df[~total_df.index.duplicated(keep="first")]

# Truncate timestamps to the day they fall on.
total_df["created_at"] = total_df["created_at"].dt.to_period("D").dt.to_timestamp('s')
# Consolidate provinces once, then keep only rows with text and a known province.
total_df["province"] = total_df["province"].apply(prov_map)
total_df = total_df[total_df.clean_text.notnull()]
total_df = total_df[total_df["province"].isin(CONSOLIDATED_PROVINCES)]
print(len(total_df))

  from pandas import Panel


402217


In [56]:
from kaleido.scopes.plotly import PlotlyScope
# Kaleido rendering scope; in the cells shown here it is only referenced by the
# commented-out PDF export code next to each figure.
scope = PlotlyScope()
# Layout options shared by the figures below (unpacked into fig.update_layout).
vis_args = dict(
    template="simple_white",
    font=dict(size=23),
    width=1000,
)

## Bootstrapping

We start with a small set of seed words for each of the anxious and confident classes.

> We retain the 1,000 terms that are semantically closest, on average, to the anxiety seed words, and the 1,000 terms closest to the confidence seed words.

We then use pretrained word embeddings and cosine similarity to expand each seed list into a large vocabulary of anxious/confident words.

word embeddings are derived from: https://github.com/RaRe-Technologies/gensim-data

In [7]:
from gensim.models import KeyedVectors
from text_cleaning import clean_text
import gensim.downloader as api

# Name of the pretrained embedding and the CSV files that hold the expanded lexicons.
model = "glove-twitter-25"
anxious_fp = f"../data/external_datasets/{model}-anxious_words.csv"
# NOTE(review): unlike anxious_fp, this name repeats "-twitter-" after the model
# name; left untouched because existing files may already use it - confirm.
confident_fp = f"../data/external_datasets/{model}-twitter-confident_words.csv"

# One-off download of the vectors, kept for reference:
# word_vectors = api.load(model)
# word_vectors.save(f"../models/{model}.kv")
word_vectors = KeyedVectors.load(f"../models/{model}.kv", mmap='r')

# Seed words for the two classes.
# NOTE(review): "worry" is listed twice in anxious_seed, so it counts double in
# the mean similarity computed by seed_similarity - confirm whether a different
# word was intended.
anxious_seed = [
    "risk", "threat", "concerned", "doubt", "worry", "fear",
    "danger", "tension", "stress", "anxious", "upset", "alarming",
    "worry", "hazard", "uncertain", "scare", "unknown", "alarm",
    "tense", "anxiety", "distress", "nervous", "risky", "troubled",
    "threatening", "panic", "fearful", "frighten", "unrest", "doubtful",
]
confident_seed = [
    "ease", "protect", "sure", "security", "safety", "protection",
    "confidence", "safe", "trust", "hope", "guarantee", "assurance",
    "certainty", "secure", "confident", "known", "guaranteed", "reassure",
    "predictable", "quiet", "hopeful", "optimistic", "bold", "convinced",
    "optimism", "content", "reassurance", "calm", "faithful", "comfort",
]
def seed_similarity(word, seed, vectors=None):
    """Return the mean similarity between ``word`` and every word in ``seed``.

    Parameters
    ----------
    word : str
        Word to score.
    seed : iterable of str
        Seed words of one class.
    vectors : object, optional
        Anything exposing ``similarity(word_a, word_b)``. Defaults to the
        notebook-level ``word_vectors``.

    Returns
    -------
    numpy floating scalar
        Arithmetic mean of the pairwise similarities.
    """
    if vectors is None:
        vectors = word_vectors
    # The former per-seed debug print was removed: it referenced a name that
    # only exists inside the comprehension below and therefore raised NameError.
    return np.array([vectors.similarity(word, seed_word) for seed_word in seed]).mean()
# Score every vocabulary entry of the embedding against both seed lists.
# NOTE(review): `.vocab` looks like the pre-4.0 gensim attribute - confirm the
# pinned gensim version before upgrading.
vocab = pd.DataFrame({"word": word_vectors.vocab.keys()})
raw_words = vocab["word"]
vocab["anxious_similarity"] = raw_words.progress_apply(
    lambda w: seed_similarity(w, anxious_seed)
)
vocab["confident_similarity"] = raw_words.progress_apply(
    lambda w: seed_similarity(w, confident_seed)
)

# Run the words through clean_text and drop entries whose cleaned form is falsy.
vocab["word"] = raw_words.progress_apply(clean_text)
vocab = vocab[vocab["word"].astype(bool)]

# Persist the 1000 highest-scoring words of each class.
top_anxious = vocab[["word", "anxious_similarity"]].sort_values(
    "anxious_similarity", ascending=False
).head(1000)
top_anxious.to_csv(anxious_fp)
top_confident = vocab[["word", "confident_similarity"]].sort_values(
    "confident_similarity", ascending=False
).head(1000)
top_confident.to_csv(confident_fp)


In [3]:
# Reload the saved lexicons; only the "word" column of each file is kept.
anxious_values = pd.read_csv(anxious_fp, index_col=0)["word"]
confident_values = pd.read_csv(confident_fp, index_col=0)["word"]


Likewise, we have a lexicon of words representing the inverse emotional state, confidence/security.
One of the many benefits of the lexicon expansion approach is that we account for the various ways with which people may express anxiety in natural language. Moreover, using a model trained on examples from real-life social media means that we account for the particular register of discussions taking place on the web.

To devise lexical measures of anxiety, we define the likelihood of a lexicon word appearing in a category of reviews as:

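$P(L|c)=\frac{\sum_{w\in L}count(w,c)}{\sum_{w\in c}count(w,c)}$

where $count(w,c)$ is the number of times word $w$ occurs in the documents of category $c$; the numerator sums over the lexicon words and the denominator over every word used in $c$. These likelihoods are then normalised across categories, $\frac{P(L|c)}{\sum_{c'\in C}P(L|c')}$, to give the scores reported below.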

In [47]:
from utils import ANCHOR_NAMES
# Take an explicit copy so the tokenisation below writes to an independent
# frame instead of a slice of total_df (this is what raised
# SettingWithCopyWarning).
text_data = total_df[["clean_text", "province", "cluster"]].copy()
# Whitespace-tokenise each cleaned tweet into a list of words.
text_data["clean_text"] = text_data["clean_text"].str.split()

def potts_score(text, seed):
    """Return the share of tokens in ``text`` that belong to the lexicon ``seed``.

    Parameters
    ----------
    text : pandas.DataFrame
        Frame with a ``clean_text`` column holding one list of tokens per row.
    seed : list-like of str
        Lexicon words (a list or a pandas Series both work).

    Returns
    -------
    float
        Lexicon-token count divided by total token count, or ``nan`` when
        ``text`` contains no tokens at all.
    """
    # One row per token; rows with an empty token list explode to NaN and are
    # not tokens, so they are dropped.
    tokens = text["clean_text"].explode().dropna()
    total = len(tokens)
    if total == 0:
        # Undefined ratio: keep the 0/0 -> nan result explicit.
        return float("nan")
    return float(tokens.isin(seed).sum() / total)
    
# Score every topic cluster against both lexicons.
clusters = sorted(total_df["cluster"].unique())
cluster_anxiety_score, cluster_confidence_score = [], []
for clus in clusters:
    iso = text_data[text_data["cluster"] == clus][["clean_text"]]
    cluster_anxiety_score.append(potts_score(iso, anxious_values))
    cluster_confidence_score.append(potts_score(iso, confident_values))

# Normalise so that the scores of each lexicon sum to 1 across clusters.
cluster_anxiety_score = np.array(cluster_anxiety_score) / sum(cluster_anxiety_score)
cluster_confidence_score = np.array(cluster_confidence_score) / sum(cluster_confidence_score)

# Clusters beyond the named anchors are labelled "Overflow 1", "Overflow 2", ...
# The count is derived from the data instead of being fixed at 5, so the frame
# still builds when the number of overflow clusters changes.
# Assumes cluster ids in sorted order line up with ANCHOR_NAMES - TODO confirm.
overflow_names = [f"Overflow {i + 1}" for i in range(len(clusters) - len(ANCHOR_NAMES))]
cluster_scores = pd.DataFrame({
    "cluster": clusters,
    "cluster_name": list(ANCHOR_NAMES) + overflow_names,
    "anxiety_score": cluster_anxiety_score,
    "confidence_score": cluster_confidence_score,
})

cluster_scores



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[0.25533952179906666, 0.24026300052126806, 0.2751979442704569, 0.31095987024315197, 0.2594132047560671, 0.2543413130938454, 0.2585025642055986, 0.2776377865859827, 0.28973580722637177, 0.2698754955832232, 0.2703150699999233, 0.2664625875620134]


Unnamed: 0,cluster,cluster_name,anxiety_score,confidence_score
0,0,Health Measures,0.0791,0.078613
1,1,Public/Private Schools,0.07443,0.086523
2,2,Childcare,0.085252,0.084353
3,3,Remote Work,0.096331,0.093671
4,4,Remote Learning,0.080362,0.080084
5,5,School Reopenings,0.078791,0.074805
6,6,School Closures,0.08008,0.078423
7,7,Overflow 1,0.086008,0.084605
8,8,Overflow 2,0.089756,0.088185
9,9,Overflow 3,0.083603,0.087847


In [42]:

# seed_counts = text[text["clean_text"].isin(anxious_values)]["clean_text"].unique()
# seed_counts
# text.where(text["clean_text"].isin(anxious_values)).dropna().value_counts()# seed_counts = text.where(text.index.isin(anxious_values)).dropna().value_counts().sum()
# total_counts = text.sum()

(0    27476
 dtype: int64,
 0    99097
 dtype: int64,
 0    0.277264
 dtype: float64)

In [60]:
# Grouped bar chart: one anxiety bar and one confidence bar per topic cluster.
topic_bars = [
    go.Bar(name=label, x=cluster_scores["cluster_name"], y=cluster_scores[column])
    for label, column in (
        ('Anxiety Potts Score', "anxiety_score"),
        ('Confidence Potts Score', "confidence_score"),
    )
]
fig = go.Figure(data=topic_bars)
# Place the two series side by side and apply the shared layout options.
fig.update_layout(barmode='group', **vis_args)
# Optional PDF export, currently disabled:
# fp = "../visualizations/sentiment_analysis/anxiety_confidence-topic"
# with open(f"{fp}.pdf", "wb") as f:
#     f.write(scope.transform(fig, format="pdf"))
fig.show()


In [52]:
# Score every province against both lexicons (anxiety first, then confidence).
province_names = sorted(total_df["province"].unique())
raw_scores = []
for prov in province_names:
    docs = text_data.loc[text_data["province"] == prov, ["clean_text"]]
    raw_scores.append(
        (potts_score(docs, anxious_values), potts_score(docs, confident_values))
    )

raw_anxiety = [anx for anx, _ in raw_scores]
raw_confidence = [conf for _, conf in raw_scores]

# Normalise so that the scores of each lexicon sum to 1 across provinces.
province_anxiety_score = np.array(raw_anxiety) / sum(raw_anxiety)
province_confidence_score = np.array(raw_confidence) / sum(raw_confidence)
province_scores = pd.DataFrame({
    "province": province_names,
    "anxiety_score": province_anxiety_score,
    "confidence_score": province_confidence_score,
})

In [59]:
# Grouped bar chart: one anxiety bar and one confidence bar per province.
province_bars = [
    go.Bar(name=label, x=province_scores["province"], y=province_scores[column])
    for label, column in (
        ('Anxiety Potts Score', "anxiety_score"),
        ('Confidence Potts Score', "confidence_score"),
    )
]
fig = go.Figure(data=province_bars)
# Place the two series side by side and apply the shared layout options.
fig.update_layout(barmode='group', **vis_args)
# Optional PDF export, currently disabled:
# fp = "../visualizations/sentiment_analysis/anxiety_confidence-province"
# with open(f"{fp}.pdf", "wb") as f:
#     f.write(scope.transform(fig, format="pdf"))
fig.show()
