## Preparation

In [5]:
import sys
from lib.utils import write_html

!pip install pyarrow pandas arrow swifter plotly gensim spacy openpyxl
!{sys.executable} -m spacy download en_core_web_lg

## 1. Load raw tweet dataset

In [3]:
import pyarrow.feather as ft
# Read the raw tweet export and report how many of its rows are English.
data_raw = ft.read_feather('data/raw/raw.feather')
total_tweets = len(data_raw)
# Evaluate the language mask once; the share printed in brackets is a
# fraction of all tweets, not a percentage.
english_tweets = int((data_raw.lang == "en").sum())
print(f'Total tweets: {total_tweets}')
print(f'English tweets: {english_tweets} [{english_tweets / total_tweets}]')

Total tweets: 1315836
English tweets: 1119015 [0.850421329101803]


## 2. Integrate supplementary retrieval

In [17]:
import pyarrow.feather as pf
import arrow
import pandas
import swifter
import plotly.express as px

# Monthly number of tweets per TweetNLP sentiment label, one line per label.
data_tweetnlp = pf.read_feather('data/raw/en_sent_label_tweetnlp.feather')
# Parse every timestamp with arrow and strip the tzinfo so the column holds
# naive datetimes.
data_tweetnlp['created_at'] = data_tweetnlp['created_at'].swifter.apply(
    lambda stamp: arrow.get(stamp).datetime.replace(tzinfo=None)
)
# Grouper(freq='M') buckets rows by month on the datetime index; count()
# yields non-null counts per column, so `status_id` is used as the tweet count.
data_tweetnlp = (
    data_tweetnlp
    .set_index('created_at')
    .groupby(['tweetnlp_sentiment_label', pandas.Grouper(freq='M')])
    .count()
    .reset_index()
)
fig = px.line(
    data_tweetnlp,
    x='created_at',
    y='status_id',
    color='tweetnlp_sentiment_label',
    labels={
        'tweetnlp_sentiment_label': '<b>Sentiment</b>',
        'status_id': '<b>Number of tweets</b>',
        'created_at': '<b>Time</b>',
    },
    template='plotly_white',
)
# Pin the legend inside the plot area, top-left corner.
fig.update_layout(legend={'yanchor': 'top', 'y': 0.99, 'xanchor': 'left', 'x': 0.01})
# Hand the figure to the project's write_html helper under this name.
write_html(fig, 'en-sentiment-with-tweetnlp')
fig

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

In [8]:
# Total number of tweets per sentiment label.
# Select `status_id` before aggregating so only that column is summed. Summing
# the whole frame also attempts the datetime `created_at` column, which newer
# pandas versions reject instead of silently dropping.
data_tweetnlp.groupby('tweetnlp_sentiment_label')['status_id'].sum()

tweetnlp_sentiment_label
negative    238457
neutral     621257
positive    259301
Name: status_id, dtype: int64

In [9]:
# Tweet counts per label, in the order negative, neutral, positive
# (values copied from the per-label totals shown above).
sentiment_counts = (238457, 621257, 259301)
total = sum(sentiment_counts)
# Print each label's share of all labelled tweets as a percentage.
for count in sentiment_counts:
    print(count / total * 100)

21.309544554809364
55.51820127522866
23.172254169961974


In [2]:
import pyarrow.feather as pf
from gensim.models import LdaModel
from lib.dtc import get_topics, nlp_preprocess
import spacy
import pandas
import swifter
from tqdm.notebook import tqdm
# Register `progress_apply` on pandas objects so the per-tweet pass below
# reports progress.
tqdm.pandas()

spacy_nlp = spacy.load('en_core_web_lg')
lda_model = LdaModel.load('data/models/n8/n8.model')
data_tweetnlp = pf.read_feather('data/raw/en_sent_label_tweetnlp.feather')

stop_words = spacy_nlp.Defaults.stop_words
# Only `dictionary` from the preprocessing result is used further down.
corpus, dictionary, _ = nlp_preprocess(data_tweetnlp["text"].tolist(), stop_words)

# Target columns: `topic` followed by topic1..topic8.
# NOTE(review): presumably the dominant topic plus one weight per LDA topic,
# as returned by lib.dtc.get_topics - confirm against that helper.
topic_columns = ["topic"] + [f"topic{n}" for n in range(1, 9)]
data_tweetnlp[topic_columns] = data_tweetnlp["text"].progress_apply(
    lambda tweet_text: get_topics(tweet_text, lda_model, dictionary, stop_words)
)
# Persist the enriched frame; the following cells read it back from disk.
data_tweetnlp.to_feather('data/processed/en_sent_label_tweetnlp_topic.feather')

  0%|          | 0/1119015 [00:00<?, ?it/s]

In [7]:
import pyarrow.feather as pf
import arrow
# Monthly tweet counts for every (sentiment label, topic) combination.
data_tweetnlp = pf.read_feather('data/processed/en_sent_label_tweetnlp_topic.feather')
# Parse every timestamp with arrow and strip the tzinfo so the column holds
# naive datetimes.
data_tweetnlp['created_at'] = data_tweetnlp['created_at'].swifter.apply(
    lambda stamp: arrow.get(stamp).datetime.replace(tzinfo=None)
)
# Month buckets come from the datetime index; count() gives the non-null
# count of each remaining column per group.
data_tweetnlp = (
    data_tweetnlp
    .set_index('created_at')
    .groupby(['tweetnlp_sentiment_label', 'topic', pandas.Grouper(freq='M')])
    .count()
    .reset_index()
)
data_tweetnlp

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Unnamed: 0,tweetnlp_sentiment_label,topic,created_at,user_id,status_id,screen_name,text,source,display_text_width,reply_to_status_id,...,tweetnlp_sentiment_negative,tweetnlp_sentiment_neutral,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8
0,negative,1.0,2007-07-31,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
1,negative,1.0,2007-11-30,1,1,1,1,1,0,0,...,1,1,1,1,1,1,1,1,1,1
2,negative,1.0,2008-01-31,1,1,1,1,1,0,1,...,1,1,1,1,1,1,1,1,1,1
3,negative,1.0,2008-02-29,2,2,2,2,2,0,1,...,2,2,2,2,2,2,2,2,2,2
4,negative,1.0,2008-03-31,2,2,2,2,2,0,0,...,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3044,positive,8.0,2020-01-31,9,9,9,9,9,4,1,...,9,9,9,9,9,9,9,9,9,9
3045,positive,8.0,2020-02-29,7,7,7,7,7,3,2,...,7,7,7,7,7,7,7,7,7,7
3046,positive,8.0,2020-03-31,26,26,26,26,26,9,1,...,26,26,26,26,26,26,26,26,26,26
3047,positive,8.0,2020-04-30,10,10,10,10,10,1,1,...,10,10,10,10,10,10,10,10,10,10


In [10]:
# Inspect the monthly rows for positive tweets assigned to topic 1.
data_tweetnlp.loc[
    data_tweetnlp.tweetnlp_sentiment_label.eq('positive') & data_tweetnlp.topic.eq(1)
]

Unnamed: 0,tweetnlp_sentiment_label,topic,created_at,user_id,status_id,screen_name,text,source,display_text_width,reply_to_status_id,...,tweetnlp_sentiment_negative,tweetnlp_sentiment_neutral,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8
2009,positive,1.0,2007-06-30,1,1,1,1,1,0,0,...,1,1,1,1,1,1,1,1,1,1
2010,positive,1.0,2007-07-31,1,1,1,1,1,0,0,...,1,1,1,1,1,1,1,1,1,1
2011,positive,1.0,2007-11-30,10,10,10,10,10,0,0,...,10,10,10,10,10,10,10,10,10,10
2012,positive,1.0,2007-12-31,5,5,5,5,5,0,1,...,5,5,5,5,5,5,5,5,5,5
2013,positive,1.0,2008-01-31,6,6,6,6,6,0,1,...,6,6,6,6,6,6,6,6,6,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2156,positive,1.0,2020-01-31,1191,1191,1191,1191,1191,312,177,...,1191,1191,1191,1191,1191,1191,1191,1191,1191,1191
2157,positive,1.0,2020-02-29,453,453,453,453,453,202,125,...,453,453,453,453,453,453,453,453,453,453
2158,positive,1.0,2020-03-31,532,532,532,532,532,208,106,...,532,532,532,532,532,532,532,532,532,532
2159,positive,1.0,2020-04-30,381,381,381,381,381,169,115,...,381,381,381,381,381,381,381,381,381,381


In [26]:
import plotly.graph_objects as go
# One line per topic (1..8) with the monthly count of positive tweets.
fig = go.Figure()
positive_rows = data_tweetnlp[data_tweetnlp.tweetnlp_sentiment_label == 'positive']
for topic_id in range(1, 9):
    # Slice once per topic and reuse the slice for both axes.
    topic_rows = positive_rows[positive_rows.topic == topic_id]
    fig.add_trace(
        go.Scatter(x=topic_rows.created_at, y=topic_rows.status_id, name=f'Topic {topic_id}')
    )
# Pin the legend inside the plot area, top-left corner.
fig.update_layout(
    legend={'yanchor': 'top', 'y': 0.99, 'xanchor': 'left', 'x': 0.01},
    template='plotly_white',
    title='<b>Number of positive tweets per month</b>',
)
fig.update_yaxes(title='<b>Number of tweets</b>')
write_html(fig, 'en-sentiment-topic-positive')
fig

In [27]:
import plotly.graph_objects as go
# One line per topic (1..8) with the monthly count of negative tweets.
fig = go.Figure()
negative_rows = data_tweetnlp[data_tweetnlp.tweetnlp_sentiment_label == 'negative']
for topic_id in range(1, 9):
    # Slice once per topic and reuse the slice for both axes.
    topic_rows = negative_rows[negative_rows.topic == topic_id]
    fig.add_trace(
        go.Scatter(x=topic_rows.created_at, y=topic_rows.status_id, name=f'Topic {topic_id}')
    )
# Pin the legend inside the plot area, top-left corner.
fig.update_layout(
    legend={'yanchor': 'top', 'y': 0.99, 'xanchor': 'left', 'x': 0.01},
    template='plotly_white',
    title='<b>Number of negative tweets per month</b>',
)
fig.update_yaxes(title='<b>Number of tweets</b>')
write_html(fig, 'en-sentiment-topic-negative')
fig

In [9]:
import pyarrow.feather as pf
import arrow
import pandas
import swifter
# Mean sentiment score per topic and month.
data_tweetnlp = pf.read_feather('data/processed/en_sent_label_tweetnlp_topic.feather')
# Net score per tweet: positive score minus negative score.
data_tweetnlp['tweetnlp_sentiment_score'] = (
    data_tweetnlp['tweetnlp_sentiment_positive'] - data_tweetnlp['tweetnlp_sentiment_negative']
)
# Parse every timestamp with arrow and strip the tzinfo so the column holds
# naive datetimes.
data_tweetnlp.created_at = data_tweetnlp.created_at.swifter.apply(
            lambda x: arrow.get(x).datetime.replace(tzinfo=None)
        )
data_tweetnlp.index = data_tweetnlp.created_at
data_tweetnlp = data_tweetnlp.drop(columns=["created_at"])
# The frame still carries text columns (screen_name, text, source, ...).
# Restrict the mean to numeric/boolean columns explicitly: older pandas
# dropped the others silently, pandas 2 raises a TypeError instead.
data_tweetnlp = data_tweetnlp.groupby(['topic', pandas.Grouper(freq='M')]).mean(numeric_only=True)
data_tweetnlp = data_tweetnlp.reset_index()
data_tweetnlp

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Unnamed: 0,topic,created_at,user_id,status_id,display_text_width,reply_to_status_id,reply_to_user_id,is_quote,is_retweet,favorite_count,...,tweetnlp_sentiment_neutral,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,tweetnlp_sentiment_score
0,1.0,2007-05-31,3.379697e+06,7.519302e+07,,,,0.00000,0.000000,0.000000,...,0.722586,0.236072,0.118224,0.076845,0.078521,0.140595,0.146413,0.097288,0.106042,0.243245
1,1.0,2007-06-30,1.013981e+06,9.064882e+07,,,,0.00000,0.000000,0.000000,...,0.285077,0.187254,0.181082,0.053295,0.121652,0.108785,0.136133,0.099108,0.112690,0.690127
2,1.0,2007-07-31,8.041925e+05,1.437403e+08,,1.437494e+08,8.195220e+05,0.00000,0.000000,0.000000,...,0.290286,0.217794,0.176254,0.087059,0.135919,0.095955,0.167254,0.048688,0.071076,-0.212814
3,1.0,2007-09-30,6.219462e+06,2.676662e+08,,,,0.00000,0.000000,0.000000,...,0.803949,0.233795,0.141727,0.100386,0.085906,0.100409,0.181461,0.073278,0.083039,0.051200
4,1.0,2007-10-31,8.454820e+05,3.426918e+08,,,,0.00000,0.000000,0.333333,...,0.815241,0.262913,0.138521,0.075600,0.119473,0.074650,0.159787,0.076852,0.092204,0.059421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1148,8.0,2020-01-31,4.157469e+17,1.217532e+18,107.800000,1.217844e+18,3.344083e+17,0.04878,0.365854,1.170732,...,0.518699,0.163721,0.126756,0.102128,0.090876,0.070040,0.164921,0.051296,0.230262,0.030306
1149,8.0,2020-02-29,3.498468e+17,1.228155e+18,86.142857,1.225475e+18,4.142219e+17,0.00000,0.153846,1.230769,...,0.482717,0.173516,0.129916,0.113092,0.113048,0.055845,0.139188,0.044470,0.230924,0.053772
1150,8.0,2020-03-31,3.406291e+17,1.238411e+18,118.666667,1.237357e+18,3.313137e+17,0.00000,0.657895,2.789474,...,0.521835,0.158069,0.112559,0.104142,0.082837,0.058972,0.189176,0.052480,0.241765,0.209105
1151,8.0,2020-04-30,4.723100e+17,1.250527e+18,106.750000,1.251185e+18,3.149293e+17,0.00000,0.571429,0.380952,...,0.525942,0.143716,0.122751,0.141895,0.105220,0.066013,0.147298,0.041107,0.232001,0.270356


In [15]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Eight stacked bar charts, one per topic, of the monthly mean sentiment score.
fig = make_subplots(rows=8, cols=1)
for topic_id in range(1, 9):
    # Slice once per topic and reuse the slice for both axes.
    topic_rows = data_tweetnlp[data_tweetnlp.topic == topic_id]
    score_bars = go.Bar(
        x=topic_rows.created_at,
        y=topic_rows.tweetnlp_sentiment_score,
        name=f'<b>Topic {topic_id}</b>',
    )
    # Topic n goes into subplot row n.
    fig.add_trace(score_bars, row=topic_id, col=1)
fig.update_layout(
    template='plotly_white',
    title='<b>Sentiment score of English tweets by topic</b><br>-1: most negative, 1: most positive',
)
write_html(fig, 'en-sentiment-score-topic')
fig

In [16]:
import pyarrow.feather as pf
import arrow
import pandas
import swifter
# Monthly tweet counts per topic.
data_tweetnlp = pf.read_feather('data/processed/en_sent_label_tweetnlp_topic.feather')
# Parse every timestamp with arrow and strip the tzinfo so the column holds
# naive datetimes.
data_tweetnlp['created_at'] = data_tweetnlp['created_at'].swifter.apply(
    lambda stamp: arrow.get(stamp).datetime.replace(tzinfo=None)
)
# Month buckets come from the datetime index; count() gives the non-null
# count of each remaining column per group.
data_tweetnlp = (
    data_tweetnlp
    .set_index('created_at')
    .groupby(['topic', pandas.Grouper(freq='M')])
    .count()
    .reset_index()
)
data_tweetnlp

Dask Apply:   0%|          | 0/48 [00:00<?, ?it/s]

Unnamed: 0,topic,created_at,user_id,status_id,screen_name,text,source,display_text_width,reply_to_status_id,reply_to_user_id,...,tweetnlp_sentiment_negative,tweetnlp_sentiment_neutral,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8
0,1.0,2007-05-31,29,29,29,29,29,0,0,0,...,29,29,29,29,29,29,29,29,29,29
1,1.0,2007-06-30,1,1,1,1,1,0,0,0,...,1,1,1,1,1,1,1,1,1,1
2,1.0,2007-07-31,2,2,2,2,2,0,1,1,...,2,2,2,2,2,2,2,2,2,2
3,1.0,2007-09-30,4,4,4,4,4,0,0,0,...,4,4,4,4,4,4,4,4,4,4
4,1.0,2007-10-31,3,3,3,3,3,0,0,0,...,3,3,3,3,3,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1148,8.0,2020-01-31,41,41,41,41,41,20,12,16,...,41,41,41,41,41,41,41,41,41,41
1149,8.0,2020-02-29,26,26,26,26,26,14,13,13,...,26,26,26,26,26,26,26,26,26,26
1150,8.0,2020-03-31,76,76,76,76,76,15,7,8,...,76,76,76,76,76,76,76,76,76,76
1151,8.0,2020-04-30,42,42,42,42,42,12,10,13,...,42,42,42,42,42,42,42,42,42,42


In [18]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Eight stacked bar charts, one per topic, of the monthly number of tweets.
fig = make_subplots(rows=8, cols=1)
for topic_id in range(1, 9):
    # Slice once per topic and reuse the slice for both axes.
    topic_rows = data_tweetnlp[data_tweetnlp.topic == topic_id]
    count_bars = go.Bar(
        x=topic_rows.created_at,
        y=topic_rows.status_id,
        name=f'<b>Topic {topic_id}</b>',
    )
    # Topic n goes into subplot row n.
    fig.add_trace(count_bars, row=topic_id, col=1)
fig.update_layout(template='plotly_white', title='<b>Number of English tweets per month by topic</b>')
write_html(fig, 'en-topic-per-month')
fig

In [22]:
import spacy
import pandas
import nltk
from nltk.stem import WordNetLemmatizer
from lib.dtc import tokenizer

from typing import Tuple
from collections import Counter
import itertools
spacy_nlp = spacy.load('en_core_web_lg')

# WordNetLemmatizer reads the `wordnet` corpus; downloading only `omw-1.4`
# leaves a fresh environment failing with a LookupError on the first
# lemmatize() call, so fetch both.
nltk.download('wordnet')
nltk.download('omw-1.4')
import pyarrow.parquet as pq
data = pq.read_table('data/raw/en_sent_with_topic8.parquet').to_pandas()
lemmatizer = WordNetLemmatizer()
tweets = data.text.tolist()
# Strip '#' so hashtags are tokenised like ordinary words, then tokenise each
# tweet with the spaCy stop-word list. The brand names '23andme' and
# 'ancestrydna' are deliberately kept here; they are filtered out later when
# the bigram and single-word frequencies are built.
words = [
    tokenizer(tweet.replace("#", ""), spacy_nlp.Defaults.stop_words)
    for tweet in tweets
]

def combine_two(word_list):
    """Return the adjacent word pairs of ``word_list`` as space-joined strings.

    Empty/falsy tokens and the brand names '23andme' and 'ancestrydna' are
    removed first, so pairs are formed across the gaps they leave behind.
    Fewer than two remaining tokens yield an empty list.
    """
    excluded = ['23andme', 'ancestrydna']
    kept = [token for token in word_list if token and token not in excluded]
    # Pair every kept token with its successor.
    return [' '.join(pair) for pair in zip(kept, kept[1:])]

# Bigram frequencies: build the adjacent pairs per tweet, flatten, and count.
grouped_words = list(
    itertools.chain.from_iterable(combine_two(tokens) for tokens in words)
)
group_freq = Counter(grouped_words)
top25_grouped = [
    {"term": str(term), "count": occurrences}
    for term, occurrences in group_freq.most_common(25)
]
df_top25_grouped = pandas.DataFrame(top25_grouped).sort_values(by='count', ascending=False)

# Single-word frequencies: lemmatise every token first, then drop the brand
# names so they do not dominate the ranking.
filtered_words = [
    lemma
    for lemma in (
        lemmatizer.lemmatize(token) for token in itertools.chain.from_iterable(words)
    )
    if lemma not in ['23andme', 'ancestrydna']
]

word_freq = Counter(filtered_words)
top25_words = [
    {"term": term, "count": occurrences}
    for term, occurrences in word_freq.most_common(25)
]
df_top25_words = pandas.DataFrame(top25_words).sort_values(by='count', ascending=False)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Bo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [23]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Side-by-side horizontal bar charts of the 25 most frequent single words
# (left) and the 25 most frequent two-word combinations (right).
fig_top25 = make_subplots(rows=1, cols=2, horizontal_spacing=0.15)
# Left panel: single-word frequencies. The legend names were previously
# swapped, labelling this trace as "word combinations".
fig_top25.add_trace(
    go.Bar(x=df_top25_words['count'], y=df_top25_words['term'], name='<b>Top 25 most frequent words</b>', orientation='h'), row=1, col=1
)
# Right panel: bigram frequencies.
fig_top25.add_trace(
    go.Bar(x=df_top25_grouped['count'], y=df_top25_grouped['term'], name='<b>Top 25 most frequent word combinations</b>', orientation='h'), row=1, col=2
)
# Reverse the y-axes so the most frequent term is drawn at the top.
fig_top25.update_yaxes(autorange='reversed', tickfont_family="Arial Black")
fig_top25.update_layout(template='seaborn', legend=dict(xanchor='right', yanchor='bottom'))
fig_top25.show()
fig_top25.write_html('output/Fig4_top25terms.html')

## 4. Visualising categorised 'I am' tweets

In [2]:
import pandas
# Load the manually categorised 'I am' tweets from the processed workbook and
# display them.
IAM_WORKBOOK = 'data/processed/en_iam.xlsx'
df_iam = pandas.read_excel(IAM_WORKBOOK)
df_iam

Unnamed: 0,status_id,user_id,created_at,text,category1,category2,sub-category1,sub-category2,spacy_sentiment_score,nationality,region
0,15900000000000000,15799569,2010-12-17T23:01:51Z,"don't know what you're doing for #christmas, b...",taking a test,,,,0.000000,False,False
1,11900000000000000,915141,2010-12-06T19:23:41Z,Kind of having 2nd thoughts about my @23andme ...,concerns,,data safety fear,,0.033333,False,False
2,8070000000000000,60014980,2010-11-26T07:54:14Z,@23andMe ok i am sold! Excited & curious now t...,taking a test,excitement,,,0.300000,False,False
3,12200000000000000,16726756,2010-12-07T18:25:53Z,would it not be terrifying to do 23andMe? i'd ...,concerns,,stress,,0.037500,False,False
4,28046353207,13127,2010-10-21T17:34:24Z,"Good to see ppl sharing their 23andme result, ...",concerns,,stress,,0.700000,False,False
...,...,...,...,...,...,...,...,...,...,...,...
1273,1260000000000000000,243050551,2020-05-05T20:40:30Z,figured out how to run my ancestrydna through ...,humour,,,,0.000000,False,False
1274,1260000000000000000,3417424606,2020-05-03T23:12:11Z,if yall were interested in seeing how mixed ra...,taking a test,,curiosity,,0.125000,False,False
1275,1260000000000000000,3417424606,2020-05-02T14:13:11Z,exposing myself for being predominantly white....,ancestry,,mixed,,0.000000,False,False
1276,1260000000000000000,1110000000000000000,2020-05-01T05:31:46Z,out of context this sounds like some rachel do...,unclassified,,,,0.025000,False,False
