# Content analysis
## Set-up, clean-up

# TO DO
- Import data
- Write cleaning function, apply to every df
- Add circle.png as a mask for the word cloud, and use the pyLDAvis layout and topic sizes to plot the topics together
- Make word cloud topic model for Chinese
- Prettify the frequency plot for most used tokens
- Make a pretty frequency plot of language distribution in data 


In [1]:
# import data


In [None]:
# Plot styling for the whole notebook.
# Order matters: import first, then apply the seaborn theme, then install the
# custom colour cycle.
import matplotlib.pyplot as plt
import seaborn as sns

# Custom palette (hex colours) used as the default colour cycle for plots.
CB91_Blue = '#2CBDFE'
CB91_Green = '#47DBCD'
CB91_Pink = '#F3A0F2'
CB91_Purple = '#9D2EC5'
CB91_Violet = '#661D98'
CB91_Amber = '#F5B14C'

color_list = [CB91_Blue, CB91_Pink, CB91_Green, CB91_Amber,
              CB91_Purple, CB91_Violet]

# Base theme: transparent axes background, light grey edges, no grid, no top
# or right spines, no tick marks, dim grey text.
sns.set(font='DIN Condensed Bold',
        rc={
            'axes.axisbelow': False,
            'axes.edgecolor': 'lightgrey',
            'axes.facecolor': 'None',
            'axes.grid': False,
            'axes.labelcolor': 'dimgrey',
            'axes.spines.right': False,
            'axes.spines.top': False,
            'figure.facecolor': 'white',
            'lines.solid_capstyle': 'round',
            'patch.edgecolor': 'w',
            'patch.force_edgecolor': True,
            'text.color': 'dimgrey',
            'xtick.bottom': False,
            'xtick.color': 'dimgrey',
            'xtick.direction': 'out',
            'xtick.top': False,
            'ytick.color': 'dimgrey',
            'ytick.direction': 'out',
            'ytick.left': False,
            'ytick.right': False})

# Font sizes for the "notebook" plotting context.
sns.set_context("notebook", rc={"font.size":16,
                                "axes.titlesize":20,
                                "axes.labelsize":18})

# Install the colour cycle last: sns.set() applies seaborn's default palette
# to 'axes.prop_cycle', so a cycle configured before it would be overwritten.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=color_list)

In [None]:
# cleaning up
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
%pip install yellowbrick
from yellowbrick.text import FreqDistVisualizer, TSNEVisualizer

# Index the tweets by their id string.
en_df_n = en_df.set_index('id_str')
# Expand each value of the `tokens` column into its own columns and append
# them to the frame (presumably this yields the `token` and `lemma` columns
# used below -- confirm against the imported data).
token_df = en_df_n.tokens.apply(pd.Series)
en_df_n = pd.concat([en_df_n, token_df], axis=1)
en_df_n.head()

# Peek at the first row's token values. Use positional .iloc here: after
# set_index('id_str') a bare [0] is treated as an index label, which raises
# KeyError or relies on a deprecated positional fallback.
en_df_n.token.iloc[0].values()
# Extract tokens as a list
def tokens_to_list(row):
    """Return the values of the row's ``token`` mapping as a list.

    Args:
        row: Any object supporting ``row['token']`` whose value has a
            ``.values()`` method (e.g. a dict).

    Returns:
        A new ``list`` of the mapping's values, in the mapping's own order.
    """
    # Materialise the view: a bare dict_values object cannot be indexed or
    # compared to a list, and lemmas_to_list already returns a real list.
    return list(row['token'].values())

def lemmas_to_list(row):
    """Collect the values of the row's ``lemma`` mapping into a new list.

    Args:
        row: Any object supporting ``row['lemma']`` whose value has a
            ``.values()`` method (e.g. a dict).

    Returns:
        A ``list`` of the mapping's values, in the mapping's own order.
    """
    lemma_map = row['lemma']
    return [*lemma_map.values()]

def clean_tweet(row):
    """Return the row's ``text`` lower-cased with URLs removed.

    Args:
        row: Any object supporting ``row['text']``.

    Returns:
        The lower-cased text with every ``http...`` run of non-whitespace
        characters deleted. Surrounding whitespace is left as is, so a URL
        between two words leaves a double space. A non-string value
        (e.g. ``None`` or NaN) is returned unchanged.
    """
    # Imported locally: no module-level ``import re`` is visible in this
    # notebook, and the function would otherwise fail with NameError.
    import re

    text = row['text']
    if not isinstance(text, str):
        # Missing text has no .lower(); hand it back untouched so that a later
        # .dropna() on the resulting column can discard it.
        return text
    return re.sub(r"http\S+", "", text.lower())

# Derive the new columns one at a time by applying each helper row-wise
# (axis=1). Each apply sees the columns added by the ones before it.
for new_column, row_func in (
    ('token_new', tokens_to_list),
    ('lemma_new', lemmas_to_list),
    ('text_clean', clean_tweet),
):
    en_df_n[new_column] = en_df_n.apply(row_func, axis=1)

## Visualisations
### Frequent terms

In [None]:
# Visualize frequent terms
# clean text
# Count unigrams and bigrams that occur in at least 10 documents, ignoring
# scikit-learn's built-in English stop words. Rows without cleaned text are
# dropped first.
vect = CountVectorizer(stop_words='english', min_df=10, ngram_range=(1, 2))
docs = vect.fit_transform(en_df_n['text_clean'].dropna())
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
features = vect.get_feature_names_out()

plt.figure(figsize=(16, 10))
visualiser = FreqDistVisualizer(features=features)
visualiser.fit(docs)

# show() replaces poof(), which Yellowbrick deprecated in 1.0.
visualiser.show()
plt.show()

### Topic modelling

In [None]:
# pyLDAvis
import gensim
import pyLDAvis.gensim
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer
from gensim import corpora, models, similarities


# Lookup sets and the lemmatizer used by clean():
#   stop    - NLTK's English stop words
#   exclude - every ASCII punctuation character
#   lemma   - WordNet lemmatizer instance
stop = {*stopwords.words('english')}
exclude = {*string.punctuation}
lemma = WordNetLemmatizer()

def clean(doc):
    """Normalise one document: drop stop words, strip punctuation, lemmatize.

    The text is lower-cased and split on whitespace. Tokens found in ``stop``
    are removed, every character in ``exclude`` is then deleted from what
    remains, and each resulting word is passed through ``lemma.lemmatize``.

    Args:
        doc: The document text as a string.

    Returns:
        The processed words joined by single spaces.
    """
    kept_tokens = (tok for tok in doc.lower().split() if tok not in stop)
    without_stops = ' '.join(kept_tokens)
    without_punct = ''.join(c for c in without_stops if c not in exclude)
    lemmas = [lemma.lemmatize(w) for w in without_punct.split()]
    return ' '.join(lemmas)


# Build the LDA input from the cleaned tweet text. clean() adds the extra
# pre-processing (stop words, punctuation, lemmatisation). Rows without text
# are dropped, as in the frequent-terms cell.
sample_text = en_df_n['text_clean'].dropna()
state_text = list(sample_text.values)

# Tokenised documents -> gensim dictionary -> bag-of-words corpus.
text_clean = [clean(doc).split() for doc in state_text]
dictionary = corpora.Dictionary(text_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in text_clean]

# I'm keeping the number of topics small so that they would be easier to discern
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=8, id2word=dictionary, passes=100)

# Topic assignments for the TF-IDF-weighted corpus. Kept for inspection only;
# the visualisation below does not use them.
tfidf = models.TfidfModel(doc_term_matrix)
doc_tfidf = tfidf[doc_term_matrix]
doc_lda = ldamodel[doc_tfidf]

pyLDAvis.enable_notebook()
# prepare() must receive the bag-of-words corpus the model was trained on: it
# derives document lengths and term frequencies from it. doc_lda holds
# (topic id, probability) pairs, which would be misread as term counts.
# NOTE(review): newer pyLDAvis releases renamed this module to
# pyLDAvis.gensim_models -- confirm against the installed version.
panel = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary, mds='tsne', sort_topics=True)
panel

In [None]:
# save
#pyLDAvis.save_html(panel, '8_topics_english.html')

#### Word clouds

In [None]:
# ldamodel is assumed to be the variable holding the fitted LdaModel object
%pip install palettable
from wordcloud import WordCloud, STOPWORDS
from palettable.colorbrewer.qualitative import Dark2_8
from palettable.lightbartlein.diverging import BlueDarkOrange18_5
from palettable.matplotlib import Inferno_20
import random
from PIL import Image
import numpy as np

def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Pick a random colour from the Inferno_20 palette for one word.

    The signature follows the callback form WordCloud uses for ``color_func``;
    ``word``, ``font_size``, ``position`` and ``orientation`` are accepted but
    not used.

    Args:
        random_state: Optional ``random.Random`` instance. When given, the
            colour index is drawn from it, so a seeded ``recolor`` call gives
            the same colours on every run. Otherwise the global ``random``
            module is used.

    Returns:
        The chosen palette entry as a tuple, taken from indices 1 to 15
        (inclusive) of ``Inferno_20.colors``.
    """
    rng = random_state if isinstance(random_state, random.Random) else random
    return tuple(Inferno_20.colors[rng.randint(1, 15)])

# Font file and mask image used for every topic's word cloud.
font_path = "/Library/Fonts/DIN Condensed Bold.ttf"
icon_path = "utils/circle.png"

# The mask image is loaded once and handed to WordCloud as an array.
mask = np.array(Image.open(icon_path))

# One figure per topic, titled with a 1-based topic number.
for t in range(ldamodel.num_topics):
    plt.figure()
    wc = WordCloud(
        font_path=font_path,
        background_color="white",
        max_words=2000,
        mask=mask,
        stopwords=STOPWORDS,
        max_font_size=100,
        random_state=42,
        width=800,
        height=400,
    )
    # Top 200 (word, weight) pairs of the topic, as a word -> weight mapping.
    topic_weights = dict(ldamodel.show_topic(t, 200))
    cloud = wc.fit_words(topic_weights)
    cloud = cloud.recolor(color_func=color_func, random_state=3)
    plt.imshow(cloud)
    plt.axis("off")
    plt.title(f"Topic #{t + 1}")
    plt.show()
    # To save instead of (or before) showing:
    #plt.savefig(f"plots/english/en_wordcloud_topic_{t}.png", dpi = 300)
    


## Drafts
### Cluster analysis

NB: Switch to the cleaned text variables and tune the HDBSCAN parameters!

In [None]:
# general prep
# Draft: this still vectorises the raw `text` column of en_df rather than a
# cleaned column.
real_clean = en_df['text']
corpus_clean = [*real_clean.values]
# TF-IDF over unigrams and bigrams that occur in at least 10 documents,
# ignoring scikit-learn's built-in English stop words.
# Note: this rebinds the name `tfidf` to a scikit-learn TfidfVectorizer.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=10,
    ngram_range=(1, 2),
)
docs_clean = tfidf.fit_transform(corpus_clean)

In [None]:
# K-Means
from sklearn.cluster import KMeans

# Fit K-Means with 10 clusters on the TF-IDF document matrix.
clusters = KMeans(n_clusters=10)
clusters.fit(docs_clean)

# Plot the documents with t-SNE, labelling each one "c<cluster label>".
plt.figure(figsize=(16, 10))
tsne = TSNEVisualizer()
tsne.fit(docs_clean, [f"c{c}" for c in clusters.labels_])
# show() replaces poof(), which Yellowbrick deprecated in 1.0.
tsne.show()
plt.show()


In [None]:
# HDBScan
import hdbscan

# Fit HDBSCAN on the TF-IDF document matrix; clusters need at least 5 members,
# and the minimum spanning tree is generated as well.
clusterer = hdbscan.HDBSCAN(min_cluster_size=5, gen_min_span_tree=True)
clusterer.fit(docs_clean)

# Plot the documents with t-SNE, labelling each one "c<cluster label>".
plt.figure(figsize=(16, 10))
tsne = TSNEVisualizer()
tsne.fit(docs_clean, [f"c{c}" for c in clusterer.labels_])
# show() replaces poof(), which Yellowbrick deprecated in 1.0.
tsne.show()
plt.show()