In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [2]:
import matplotlib.pyplot as plt
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 20

plt.rc('font', size=BIGGER_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=BIGGER_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=BIGGER_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=BIGGER_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=BIGGER_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [3]:
import statsmodels.formula.api as smf
import scipy.stats
from scipy.stats import ttest_ind
from scipy.stats import f_oneway

In [4]:
df_x = pd.read_csv("Data/df_x_nb0a-groupMerge.csv", index_col = 0)
print(df_x.shape)

(171634, 26)


In [5]:
df_x['message_dt'] = pd.to_datetime(df_x["message_dt"], format='%Y-%m-%d %H:%M:00')
df_x['message_date'] = pd.to_datetime(df_x["message_date"], format='%Y-%m-%d')

In [6]:
df_x['text'] = df_x['text'].fillna('')
df_x['textlower'] = df_x['textlower'].fillna('')
df_x['image'] = df_x['image'].fillna('')
df_x['video_thumb'] = df_x['video_thumb'].fillna('')
df_x['emojis'] = df_x['emojis'].fillna('')
df_x['video_length'] = df_x['video_length'].fillna('')
df_x['audio_length'] = df_x['audio_length'].fillna('')

### Text similarity imports/functions

In [7]:
import string, re, unidecode
from wordcloud import WordCloud
from sklearn.decomposition import LatentDirichletAllocation as LDA

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer

In [8]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans(string.punctuation, len(string.punctuation) * " ")
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [9]:
def dummy(x):
    return x

vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

# Descriptive statistics

In [10]:
print("Text: %s" % df_x[df_x['text'] != ''].shape[0])
print((df_x['text'] != '').mean())

print("Image: %s" % df_x[df_x['image'] != ''].shape[0])
print((df_x['image'] != '').mean())

print("Audio : %s" % df_x[df_x['audio_length'] != ''].shape[0])
print((df_x['audio_length'] != '').mean())

print("Video: %s" % df_x[df_x['video_thumb'] != ''].shape[0])
print((df_x['video_thumb'] != '').mean())

print("Emojis: %s" % df_x[df_x['emojis'] != ''].shape[0])
print((df_x['emojis'] != '').mean())

print(df_x.shape[0])

Text: 101414
0.5908736031322466
Image: 38455
0.22405234394117715
Audio : 8918
0.05195940198329002
Video: 15596
0.0908677767808243
Emojis: 28886
0.16829998718202688
171634


# Text

### Text length

In [11]:
word_counts = df_x[df_x['textlower'] != '']['textlower'].str.count(' ') + 1

In [12]:
df_x.loc[df_x['textlower'] != '', 'word_count'] = df_x[df_x['textlower'] != '']['textlower'].str.count(' ') + 1

In [13]:
plt.figure(figsize = (9, 5))
word_counts.hist(range = (0, 100), bins = 100)
plt.xlabel("Word Count")
plt.ylabel("# Messages")
plt.title("Text Messages")
plt.savefig('images/ch-messages/word_count.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [14]:
print(np.median(word_counts))
print(np.quantile(word_counts, .25))
print(np.quantile(word_counts, .75))
print(np.mean(word_counts == 1))
print(np.mean(word_counts < 20))
print(np.mean(word_counts > 100))
print(np.mean(word_counts > 500))

6.0
3.0
16.0
0.1413118504348512
0.7984006152996628
0.051482043899264404
0.011448123533239986


#### Group/user word count

In [15]:
group_word_count = df_x[df_x['word_count'] > 0].groupby('uid')['word_count'].mean()
user_word_count = df_x[df_x['word_count'] > 0].groupby('tel')['word_count'].mean()

In [16]:
group_word_count.hist(bins = 100)
plt.xlabel("Avg. Word Count")
plt.ylabel("# of Groups")
plt.savefig('images/ch-messages/group_word_count.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [17]:
user_word_count.hist(range = (100, 400), bins = 30)
plt.xlabel("Avg. Word Count")
plt.ylabel("# of Users")
plt.savefig('images/ch-messages/user_word_count.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

#### Char counts

In [18]:
char_counts = df_x[df_x['textlower'] != '']['text'].str.len()

In [19]:
plt.figure(figsize = (9, 5))
char_counts.hist(range = (0, 200), bins = 100)
plt.xlabel("Character Count")
plt.ylabel("# Messages")
plt.title("Text Messages")
plt.savefig('images/ch-messages/char_count.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [20]:
print(np.mean(char_counts))
print(np.median(char_counts))
print(np.quantile(char_counts, .25))
print(np.quantile(char_counts, .75))
print(np.mean(char_counts <= 10))
print(np.mean(char_counts > 522))

153.83602855621513
34.0
15.0
93.0
0.17002583469737906
0.056905358234563275


### LDA / WordCloud

In [21]:
stemmer = SnowballStemmer('spanish')
remove_punc = str.maketrans('', '', string.punctuation)
stopwords_ascii = [unidecode.unidecode(w) for w in stopwords.words('spanish')]

def tokenize(s):
    s = unidecode.unidecode(s)
    s = s.translate(remove_punc)
    tokens = nltk.word_tokenize(s)
    filtered = [w for w in tokens if w not in stopwords_ascii]
    return [stemmer.stem(w) for w in filtered]

In [22]:
def dummy(x):
    return x

count_vectorizer = CountVectorizer(analyzer='word', tokenizer=dummy, preprocessor=dummy,
                            token_pattern=None)

In [23]:
df_t = df_x[df_x['textlower'].str.len() > 0]
print(df_t.shape)

df_t.loc[:,'token'] = df_t.loc[:,'textlower'].apply(tokenize)

(101414, 27)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [24]:
df_t['token_string'] = df_t['token'].apply(lambda x: ','.join(x))
all_texts = ','.join(list(df_t['token_string'].values))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [25]:
wordcloud = WordCloud(background_color="white", max_words=5000, contour_width=3,
                      contour_color='steelblue', width=1400, height=800).generate(all_texts)
plt.figure(figsize = (14, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig('images/ch-messages/wordcloud.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [26]:
X = count_vectorizer.fit_transform(df_t['token'])

In [27]:
# Tweak the two parameters below
number_topics = 10
number_words = 10

# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=10, n_jobs=-1,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [28]:
# Helper function (sourced from somewhere online)
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic %d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Print the topics found by the LDA model
print_topics(lda, count_vectorizer, number_words)


Topic 0:
bs hol grup venezuel whatsapp tas pes hoy pag 1

Topic 1:
dios senor dia amen vid mund amor mand vide cre

Topic 2:
man virus 591 mil pued clar pes sal seman dos

Topic 3:
graci pas bien ok buen dia feliz cambi dias grup

Topic 4:
coronavirus venezuel cas fuent inform carac nacional covid19 pais servici

Topic 5:
venezuel madur gua venezolan pais eeuu nacional gobiern president regim

Topic 6:
grup jajaj fals verd vide envi asi notici fot informacion

Topic 7:
experient trabaj am jajajaj pm vid mes envi interes priv

Topic 8:
coronavirus cas ultim noti covid19 chin hor nuev confirm pais

Topic 9:
q buen hac pued pas dias sol sab amig gent


# Audio/video

In [29]:
def get_sec(t):
    if t.count(':') == 1:
        m, s = t.split(':')
        h = 0
    else:
        h, m, s = t.split(':')
    return int(h) * 3600 + int(m) * 60 + int(s)

In [30]:
audio_lengths_sec = df_x[df_x['audio_length'] != '']['audio_length'].apply(get_sec)

In [31]:
audio_lengths_sec.hist(bins = 100, range = (0, 400))
plt.xlabel("Length (seconds)")
plt.ylabel("# Messages")
plt.title("Audio Messages")
plt.savefig('images/ch-messages/audio_length.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [32]:
print(np.mean(audio_lengths_sec))
print(np.median(audio_lengths_sec))
print(np.quantile(audio_lengths_sec, .25))
print(np.quantile(audio_lengths_sec, .75))
print()
print(np.mean(audio_lengths_sec >= 30))
print(np.mean(audio_lengths_sec >= 60))
print(np.mean(audio_lengths_sec >= 300))
print()
print(np.mean((audio_lengths_sec >= 100) & (audio_lengths_sec < 200)))
print(np.mean((audio_lengths_sec >= 200) & (audio_lengths_sec < 300)))

121.37844808252972
36.0
9.0
213.0

0.5287059878896614
0.434626597891904
0.12076698811392689

0.11224489795918367
0.14622112581296254


In [33]:
video_lengths_sec = df_x[df_x['video_length'] != '']['video_length'].apply(get_sec)
video_lengths_sec_orig = \
    df_x[(df_x['video_length'] != '') & ~df_x['forwarded'] & ~df_x['forwarded_highly']]['video_length'].apply(get_sec)

In [34]:
video_lengths_sec.hist(bins = 100, range = (0, 400))
plt.xlabel("Length (seconds)")
plt.ylabel("# Messages")
plt.title("Video Messages")
plt.savefig('images/ch-messages/video_length.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [35]:
print(np.mean(video_lengths_sec))
print(np.median(video_lengths_sec))
print(np.quantile(video_lengths_sec, .25))
print(np.quantile(video_lengths_sec, .75))
print(np.mean(video_lengths_sec <= 10))
print(np.mean(video_lengths_sec >= 60))
print(np.mean(video_lengths_sec >= 300))
print()
print(np.mean(video_lengths_sec == 29))
print(np.mean(video_lengths_sec == 30))
print(np.mean(video_lengths_sec == 31))


96.50923313670172
33.0
29.0
113.0
0.04270325724544755
0.38028981790202615
0.0746986406770967

0.0631572197999487
0.1745319312644268
0.007245447550654014


In [36]:
print(np.mean(video_lengths_sec_orig == 29))
print(np.mean(video_lengths_sec_orig == 30))
print(np.mean(video_lengths_sec_orig == 31))

0.07395498392282958
0.205037513397642
0.0082529474812433


In [37]:
(df_x['forwarded'] | df_x['forwarded_highly']).mean()

0.15246396401645362

In [38]:
print(((df_x['video_length'] != '') & (df_x['forwarded'] | df_x['forwarded_highly'])).sum())
print((df_x['video_length'] != '').sum())

6266
15596


In [39]:
print(((df_x['audio_length'] != '') & (df_x['forwarded'] | df_x['forwarded_highly'])).sum())
print((df_x['audio_length'] != '').sum())

2349
8918


# Activity/day

In [40]:
df_groups = pd.read_csv('Data/df_groups_nb1-members-byMessaging.csv', index_col = 0)

In [41]:
df_groups['n_days'] = \
    df_x[['uid', 'message_date']].groupby('uid').last() \
    - df_x[['uid', 'message_date']].groupby('uid').first() + timedelta(days = 1)
df_groups['n_days'] = df_groups['n_days'].dt.days

In [42]:
df_groups['activity'] = df_x.groupby('uid').size() / df_groups['n_days']

In [43]:
df_groups['activity'].hist(bins = 40)
plt.xlabel("Activity (Messages / Day)")
plt.ylabel("# of Groups")
plt.savefig('images/ch-messages/group_activity.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [44]:
df_groups[df_groups['activity'] < 20]['n_days'].hist(bins = 20, alpha = 0.5, label = 'Activity $< 20$')
df_groups[df_groups['activity'] >= 20]['n_days'].hist(bins = 20, alpha = 0.5, label = 'Activity $\geq 20$')
plt.xlabel("# of Days in Group")
plt.ylabel("# of Groups")
plt.title("Is being kicked out of a group\nendogenous to its activity?")
plt.legend()
plt.savefig('images/ch-messages/group_kicked.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [45]:
ttest_ind(df_groups[df_groups['activity'] < 20]['n_days'],
             df_groups[df_groups['activity'] >= 20]['n_days'])

Ttest_indResult(statistic=-0.6314837256817464, pvalue=0.5285626296779937)

### Correlates of activity

In [46]:
for col in ['Size', 'pVZ', 'pCO', 'pUS', 'pPE', \
       'pCL', 'pEC', 'p3rdCountry', 'entropy']:
    
    print("============= %s =============" % col)
    print(scipy.stats.pearsonr(df_groups[col], df_groups['activity']))

(0.35904000062159214, 1.1437799861298817e-06)
(0.018357991680008255, 0.8099943009709414)
(-0.15346155301251313, 0.043207580714752496)
(-0.03691908709634258, 0.6286335796187533)
(0.09210378493756302, 0.22676025502625477)
(0.014403201933467984, 0.8503815223754218)
(0.0291687177446556, 0.7024095489022323)
(0.16050448581255083, 0.03437240914546392)
(0.3894736047919205, 1.0858789926028466e-07)


In [47]:
reg = smf.ols('activity ~ Size + entropy', data = df_groups).fit()
print(reg.summary())

                            OLS Regression Results                            
Dep. Variable:               activity   R-squared:                       0.213
Model:                            OLS   Adj. R-squared:                  0.203
Method:                 Least Squares   F-statistic:                     23.09
Date:                Tue, 14 Apr 2020   Prob (F-statistic):           1.33e-09
Time:                        19:43:59   Log-Likelihood:                -1074.6
No. Observations:                 174   AIC:                             2155.
Df Residuals:                     171   BIC:                             2165.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    -18.9513     14.228     -1.332      0.1

In [48]:
plt.scatter(df_groups['Size'], df_groups['activity'], alpha = 0.3)
plt.xlabel("Group Size")
plt.ylabel("Group Activity")
plt.title("All Groups")
plt.savefig('images/ch-messages/scatter_size_activity.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [51]:
plt.scatter(df_groups['entropy'], df_groups['activity'], alpha = 0.3)
plt.xlabel("Group Entropy")
plt.ylabel("Group Activity")
plt.title("All Groups")
plt.savefig('images/ch-messages/scatter_entropy_activity.png', bbox_inches = 'tight', pad_inches = 0.05)
plt.close()

In [52]:
df_groups.to_csv('Data/df_groups_nb2-messages.csv')