In [None]:
from dotenv import load_dotenv
import os
import pandas as pd
load_dotenv()
import pickle as pk
import numpy as np
from pathlib import Path
base_dir = os.getenv('BASEDIR')

from tqdm import tqdm
tqdm.pandas()

import scattertext as st
import re

import spacy

In [None]:
# Dataset identifier; used further down to build the labels CSV filename.
dataset='qanda'

In [None]:
dataset_name = 'qanda'

# The raw CSV is read entirely as strings, with column names supplied here.
raw_columns = ['tid', 'rid' ,'qid', 'uid','name', 'bio', 'date', 'text_extended', 'text', 'hashtags', 'hashtags_extended', 'mentions', 'mentions_extended', 'urls_extended', 'urls']
raw_path = os.path.join(base_dir, 'data', '01_raw_data', 'qanda', 'qanda_emoji.csv')
data = pd.read_csv(raw_path, dtype=str, names=raw_columns, index_col=False)


def _strip_links_tags_mentions(t):
    """Remove URLs, then #hashtags, then @mentions from a string (in that order)."""
    t = re.sub(r'http\S+', '', t)
    t = re.sub("#[A-Za-z0-9_]+", "", t)
    return re.sub(r'@\S+', '', t)


# Missing values become empty strings before cleaning so the regexes always see a str.
for text_col in ('text', 'bio'):
    data[text_col] = data[text_col].fillna('').apply(_strip_links_tags_mentions)
data['name'] = data['name'].fillna('')

In [None]:
# data = data[(data['rid'].isna() | (data['rid'] == '0'))]

In [None]:
def _collapse_user_rows(d):
    """Collapse one user's rows into a single-row frame.

    Keeps the first name and first bio seen for the user and joins all of the
    user's texts with single spaces.
    """
    return pd.DataFrame({
        'name': [d['name'].values[0]],
        'bio': [d['bio'].values[0]],
        'text': [' '.join(d['text'])],
    })


# One document per uid, with a tqdm progress bar.
data_per_user = data.groupby('uid').progress_apply(_collapse_user_rows)

In [None]:
# Move the index levels into columns, re-index on uid only and keep the three text columns.
data_per_user = (
    data_per_user
    .reset_index(drop=False)
    .set_index('uid')[['name', 'bio', 'text']]
)

In [None]:
# Path to the per-user labels file: <BASEDIR>/data/04_results/labels/<dataset>_extended.csv
inpath = os.path.join(base_dir,'data','04_results','labels',dataset+'_extended.csv')

In [None]:
# Labels are read as strings, indexed by uid; only the two label columns are kept.
labels_df = (
    pd.read_csv(inpath, dtype=str)
    .set_index('uid')[['lr_label', 'fr_label']]
)

In [None]:
# Boolean view of fr_label: the strings 'True'/'False' become True/False;
# any other value (including missing) maps to NaN.
mask = labels_df.fr_label.map({'True':True, 'False':False})

In [None]:
# Recode the far-right flag into category names: 'FR' where the flag is True,
# 'M' where it is False.
# A single .loc assignment replaces the chained form labels_df['fr_label'][mask] = ...,
# which triggers SettingWithCopyWarning and silently does nothing under pandas
# Copy-on-Write. Comparing with .eq() also keeps rows whose mask is NaN
# (label neither 'True' nor 'False') untouched instead of raising on a
# non-boolean mask.
labels_df.loc[mask.eq(True), 'fr_label'] = 'FR'
labels_df.loc[mask.eq(False), 'fr_label'] = 'M'

In [None]:
# Left join on the uid index: every user document is kept, labelled where a label exists.
df = data_per_user.merge(labels_df, how='left', left_index=True, right_index=True)

In [None]:
# Plot configuration: which label column to contrast and how to name the two sides.
category_col = 'lr_label'
category = 'L'
category_name='Left'
not_category_name='Right'

# Alternative configuration on fr_label; swap with the block above to use it.
# category_col = 'fr_label'
# category = 'M'
# category_name='Moderate'
# not_category_name='Far-Right'

# Plot width in pixels, passed as width_in_pixels to the explorer calls below.
width=1000

In [None]:
# Select the documents to plot for the active label column.
if category_col == 'lr_label':
    # sampled_df = df[df.lr_label != 'N'].groupby('lr_label').sample(n=700, random_state=123)
    # .copy() gives sampled_df its own data: later cells overwrite 'text' and add
    # a 'parse' column, which on a bare slice of df raises SettingWithCopyWarning
    # and may not behave as intended.
    # NOTE(review): rows with a missing lr_label (users without a match in the
    # left merge) also pass the `!= 'N'` filter -- confirm that is intended.
    sampled_df = df[df.lr_label != 'N'].copy()
else:
    # Balanced random sample of 660 users per fr_label value, with a fixed seed.
    sampled_df = df.groupby('fr_label').sample(n=660, random_state=123)

In [None]:
import scattertext as st

# Moral Foundations view: documents are split with scattertext's whitespace
# tokenizer and featurised with its Moral Foundations dictionary extractor.
moral_foundations_feats = st.FeatsFromMoralFoundationsDictionary()

corpus = st.CorpusFromPandas(sampled_df,
                             category_col=category_col,
                             text_col='text',
                             nlp=st.whitespace_nlp_with_sentences,
                             feats_from_spacy_doc=moral_foundations_feats).build()

# Terms are scored with Cohen's d over the metadata (non-text) features.
html = st.produce_frequency_explorer(
    corpus,
    category=category,
    category_name=category_name,
    not_category_name=not_category_name,
    # metadata=sampled_df.index,
    use_non_text_features=True,
    use_full_doc=True,
    term_scorer=st.CohensD(corpus).use_metadata(),
    grey_threshold=0,
    width_in_pixels=width,
    topic_model_term_lists=moral_foundations_feats.get_top_model_term_lists(),
    metadata_descriptions=moral_foundations_feats.get_definitions(),
    # return_scatterplot_structure=True,
    save_svg_button=True,
)

# Write the page as UTF-8, creating ./html/<category_col>/ if needed.
# The previous open(...).write(html) never closed the file, relied on the
# platform default encoding and failed when the directory was missing.
mft_path = Path('./html') / category_col / 'mft_scattertext.html'
mft_path.parent.mkdir(parents=True, exist_ok=True)
mft_path.write_text(html, encoding='utf-8')

In [None]:
# Work-around for spaCy: cap every document at 1,000,000 characters
# (spaCy's default nlp.max_length) so very long per-user texts can be parsed.
# assign() rebinds sampled_df to a new frame instead of writing a column into
# what may be a slice of df (SettingWithCopyWarning).
sampled_df = sampled_df.assign(text=sampled_df['text'].apply(lambda t: t[:1000000]))

In [None]:
# Load the spaCy 'en_core_web_sm' pipeline; it is passed as `nlp` to the corpus builder below.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Term-level corpus: the 'text' column parsed with the spaCy pipeline,
# categorised by the configured label column.
corpus = st.CorpusFromPandas(sampled_df,
                             category_col=category_col,
                             text_col='text',
                             nlp=nlp).build()

In [None]:
# Build the term scatterplot page for the configured category; `html` is the
# page content that a later cell writes to disk.
html = st.produce_scattertext_explorer(corpus,
    category=category,
    category_name=category_name,
    not_category_name=not_category_name,
    # metadata=sampled_df.index,
    width_in_pixels=width,
    save_svg_button=True,
)

In [None]:
# Write the page as UTF-8, creating ./html/<category_col>/ if needed.
# The previous open(...).write(html) never closed the file, relied on the
# platform default encoding and failed when the directory was missing.
term_path = Path('./html') / category_col / 'term_scattertext.html'
term_path.parent.mkdir(parents=True, exist_ok=True)
term_path.write_text(html, encoding='utf-8')

In [None]:
# import pytextrank, spacy
# import scattertext as st

# nlp = spacy.load('en_core_web_sm')
# nlp.add_pipe("textrank", last=True)

# sampled_df['parse'] = sampled_df.text.progress_apply(nlp)

# corpus = st.CorpusFromParsedDocuments(
#     sampled_df,
#     category_col=category_col,
#     parsed_col='parse',
#     feats_from_spacy_doc=st.PyTextRankPhrases()
# ).build(
# ).compact(
#     st.AssociationCompactor(2000, use_non_text_features=True)
# )

# term_category_scores = corpus.get_metadata_freq_df('')

# term_ranks = np.argsort(np.argsort(-term_category_scores, axis=0), axis=0) + 1

# category_specific_prominence = term_category_scores.apply(
#     lambda r: r.L if r.L > r.R else -r.R,
#     axis=1
# )

# html = st.produce_scattertext_explorer(
#     corpus,
#     category='L',
#     not_category_name='R',
#     minimum_term_frequency=0,
#     pmi_threshold_coefficient=0,
#     width_in_pixels=1000,
#     transform=st.dense_rank,
#     metadata=corpus.get_df().index,
#     scores=category_specific_prominence,
#     sort_by_dist=False,
#     use_non_text_features=True,
#     topic_model_term_lists={term: [term] for term in corpus.get_metadata()},
#     topic_model_preview_size=0,
#     # metadata_descriptions=metadata_descriptions,
#     use_full_doc=True,
#     save_svg_button=True,
# )

# open('./phrase_scattertext.html', 'w').write(html)

In [None]:
import nltk

import scattertext as st
from scattertext.termranking import OncePerDocFrequencyRanker

In [None]:
# Tokenise with NLTK's TweetTokenizer wrapped by scattertext's factory.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())

# One document per user: display name, bio and tweets.
# The fields are joined with spaces so the last token of one field is not fused
# to the first token of the next (the original concatenated them directly).
# Columns are accessed with ['name'] rather than attribute syntax, and assign()
# avoids writing a new column into what may be a slice of df.
profile_text = sampled_df['name'] + ' ' + sampled_df['bio'] + ' ' + sampled_df['text']
sampled_df = sampled_df.assign(parse=profile_text.progress_apply(nlp))

In [None]:
# Corpus over the pre-tokenised 'parse' column, using scattertext's
# emoji-only feature extractor (FeatsFromSpacyDocOnlyEmoji).
corpus = st.CorpusFromParsedDocuments(
    sampled_df,
    category_col=category_col,
    parsed_col='parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

In [None]:
# Emoji scatterplot page. Each document is tagged with its uid via
# metadata=sampled_df.index.
# NOTE(review): OncePerDocFrequencyRanker presumably counts a term at most once
# per document (i.e. per user) -- confirm against the scattertext docs.
html = st.produce_scattertext_explorer(
    corpus,
    category=category,
    category_name=category_name,
    not_category_name=not_category_name,
    use_full_doc=True,
    term_ranker=OncePerDocFrequencyRanker,
    sort_by_dist=False,
    metadata=sampled_df.index,
    width_in_pixels=width,
    save_svg_button=True,
)

In [None]:
# Write the page as UTF-8 (it contains emoji), creating ./html/<category_col>/
# if needed. The previous open(...).write(html) never closed the file, relied
# on the platform default encoding and failed when the directory was missing.
emoji_path = Path('./html') / category_col / 'emoji_scattertext_v2.html'
emoji_path.parent.mkdir(parents=True, exist_ok=True)
emoji_path.write_text(html, encoding='utf-8')

In [None]:
# scattertext's Empath-only feature extractor; also supplies the topic term
# lists passed to the explorer below.
feat_builder = st.FeatsFromOnlyEmpath()

In [None]:
# Documents for the Empath corpus are taken from df directly rather than from
# sampled_df, so the raw (unparsed) 'text' strings are handed to the extractor.
# NOTE(review): this filter is hard-coded to lr_label, while the explorer cell
# below passes metadata=sampled_df.index; with category_col = 'fr_label' the two
# would not cover the same rows -- confirm before switching configuration.
empath_docs = df[df.lr_label != 'N']
corpus = st.CorpusFromParsedDocuments(
    empath_docs,
    category_col=category_col,
    parsed_col='text',
    feats_from_spacy_doc=feat_builder,
).build()

In [None]:
# Empath topic scatterplot page, plotted over the non-text (Empath) features
# with the extractor's topic term lists.
# NOTE(review): metadata comes from sampled_df.index while the corpus above is
# built from df[df.lr_label != 'N']; the two match row-for-row only under the
# lr_label configuration -- confirm.
html = st.produce_scattertext_explorer(
    corpus,
    category=category,
    category_name=category_name,
    not_category_name=not_category_name,
    width_in_pixels=width,
    metadata=sampled_df.index,
    use_non_text_features=True,
    use_full_doc=True,
    topic_model_term_lists=feat_builder.get_top_model_term_lists(),
    save_svg_button=True,
)

In [None]:
# Write the page as UTF-8, creating ./html/<category_col>/ if needed.
# The previous open(...).write(html) never closed the file, relied on the
# platform default encoding and failed when the directory was missing.
empath_path = Path('./html') / category_col / 'empath_scattertext.html'
empath_path.parent.mkdir(parents=True, exist_ok=True)
empath_path.write_text(html, encoding='utf-8')

In [None]:
import nltk

import scattertext as st
from scattertext.termranking import OncePerDocFrequencyRanker

In [None]:
# Tokenise with NLTK's TweetTokenizer wrapped by scattertext's factory.
nlp = st.tweet_tokenizier_factory(nltk.tokenize.TweetTokenizer())

# One document per user: display name, bio and tweets.
# The fields are joined with spaces so the last token of one field is not fused
# to the first token of the next (the original concatenated them directly).
# Columns are accessed with ['name'] rather than attribute syntax, and assign()
# avoids writing a new column into what may be a slice of df.
profile_text = sampled_df['name'] + ' ' + sampled_df['bio'] + ' ' + sampled_df['text']
sampled_df = sampled_df.assign(parse=profile_text.progress_apply(nlp))

In [None]:
# Rebuild the emoji-only corpus over the 'parse' column; `corpus` was
# reassigned to the Empath corpus in an earlier cell.
corpus = st.CorpusFromParsedDocuments(
    sampled_df,
    category_col=category_col,
    parsed_col='parse',
    feats_from_spacy_doc=st.FeatsFromSpacyDocOnlyEmoji()
).build()

In [None]:
from scipy.stats import rankdata, hmean, norm

In [None]:
# Per-term frequency table for the current corpus; the following cell reads its
# '<category> freq' columns (e.g. 'L freq').
term_freq_df = corpus.get_term_freq_df()

In [None]:
primary, secondary = 'L', 'R'
primary_freq_name = primary + ' freq'
secondary_freq_name = secondary + ' freq'

precision_col = primary + '_precision'
freq_pct_col = primary + '_freq_pct'
hmean_col = primary + '_hmean'

primary_counts = term_freq_df[primary_freq_name]
combined_counts = primary_counts + term_freq_df[secondary_freq_name]

# Precision: share of a term's occurrences that fall in the primary category.
term_freq_df[precision_col] = primary_counts * 1. / combined_counts
# Frequency share: the term's fraction of all primary-category occurrences.
term_freq_df[freq_pct_col] = primary_counts * 1. / primary_counts.sum()


def _harmonic_mean_or_zero(row):
    """Harmonic mean of precision and frequency share, or 0 unless both are positive."""
    precision, freq_pct = row[precision_col], row[freq_pct_col]
    if precision > 0 and freq_pct > 0:
        return hmean([precision, freq_pct])
    return 0


term_freq_df[hmean_col] = term_freq_df.apply(_harmonic_mean_or_zero, axis=1)

# Show the ten terms with the highest harmonic mean.
term_freq_df.sort_values(by=hmean_col, ascending=False).iloc[:10]

In [None]:
def scale(ar):
    """Min-max scale ``ar`` to the range [0, 1].

    Parameters
    ----------
    ar : numpy.ndarray (or pandas Series) of numbers.

    Returns
    -------
    An object of the same shape with the minimum mapped to 0 and the maximum
    mapped to 1. Empty input is returned unchanged, and constant input (all
    values equal) maps to all zeros; previously these cases raised on the
    empty reduction or produced NaN from 0/0.
    """
    if len(ar) == 0:
        return ar
    lo = ar.min()
    span = ar.max() - lo
    if span == 0:
        # Degenerate range: no spread to scale, so return float zeros.
        return (ar - lo) * 0.0
    return (ar - lo) / span


def zero_centered_scale(ar):
    """Scale signed scores into [0, 1] with zero pinned at 0.5.

    Positive values are min-max scaled into [0.5, 1] and negative values into
    [0, 0.5]; each side is scaled independently. The input is not modified.
    Inputs with no positive or no negative values are supported.

    Parameters
    ----------
    ar : array-like of numbers (e.g. model coefficients).

    Returns
    -------
    numpy.ndarray of floats with the same length as ``ar``.
    """
    ar = np.asarray(ar)
    scores = np.zeros(len(ar))
    positive, negative = ar > 0, ar < 0
    scores[positive] = scale(ar[positive])
    # Negatives are flipped, scaled, and flipped back so larger magnitudes move towards -1.
    scores[negative] = -scale(-ar[negative])
    return (scores + 1) / 2.

# x-axis values: min-max scaled log of each term's total count.
# Only the raw '<category> freq' columns are summed. An earlier cell appends
# the *_precision / *_freq_pct / *_hmean columns to term_freq_df, and summing
# across every column would add those ratios to the counts.
freq_cols = [c for c in term_freq_df.columns if str(c).endswith(' freq')]
frequencies_scaled = scale(np.log(term_freq_df[freq_cols].sum(axis=1).values))

In [None]:
from sklearn.linear_model import LogisticRegression
# y-axis values: coefficients of an L2-penalised logistic regression that
# predicts the configured category, rescaled so a zero coefficient sits at 0.5.
scores = corpus.get_logreg_coefs(
    category,
    LogisticRegression(penalty='l2', C=10, max_iter=10000, n_jobs=-1),
)
scores_scaled = zero_centered_scale(scores)

# Called through the `st` namespace: the notebook imports `scattertext as st`
# only, so the bare name produce_scattertext_explorer raised NameError.
html = st.produce_scattertext_explorer(corpus,
                                       category=category,
                                       category_name=category_name,
                                       not_category_name=not_category_name,
                                       minimum_term_frequency=5,
                                       width_in_pixels=width,
                                       x_coords=frequencies_scaled,
                                       y_coords=scores_scaled,
                                       scores=scores,
                                       sort_by_dist=False,
                                       metadata=sampled_df.index,
                                       x_label='Log frequency',
                                       y_label='L2-Penalized Log Reg Coef')
file_name = './html/'+category_col+'/emoji_scattertext_v3.html'
# Create the output directory if needed and close the file deterministically.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
with open(file_name, 'wb') as fh:
    fh.write(html.encode('utf-8'))
# IFrame(src=file_name, width = 1200, height=700)

In [None]:
# Scaled F-scores for the configured category. The original passed the literal
# 'democrat', which is not one of this notebook's categories, and computed the
# scores twice; they are now computed once for `category`.
f_scores = corpus.get_scaled_f_scores(category, beta=0.5)

# Called through the `st` namespace: the notebook imports `scattertext as st`
# only, so the bare name produce_scattertext_explorer raised NameError.
html = st.produce_scattertext_explorer(corpus,
                                       category=category,
                                       category_name=category_name,
                                       not_category_name=not_category_name,
                                       minimum_term_frequency=5,
                                       width_in_pixels=width,
                                       x_coords=frequencies_scaled,
                                       y_coords=f_scores,
                                       scores=f_scores,
                                       sort_by_dist=False,
                                       metadata=sampled_df.index,
                                       x_label='Log Frequency',
                                       y_label='Scaled F-Score')
file_name = './html/'+category_col+'/emoji_scattertext_v4.html'
# Create the output directory if needed and close the file deterministically.
Path(file_name).parent.mkdir(parents=True, exist_ok=True)
with open(file_name, 'wb') as fh:
    fh.write(html.encode('utf-8'))
# IFrame(src=file_name, width = 1200, height=700)