Loading the necessary packages

In [None]:
import preprocessor as pp
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import numpy as np

import string
import spacy
import ast
import sys
import re

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from collections import Counter
from itertools import islice
from scipy.stats import sem

nlp = spacy.load("en_core_web_sm")

Loading users info

In [None]:
ui = pd.read_excel('user_info.xlsx')
print(ui)

Loading the scraped data (from the past 6 months) and showing the number of posts (= tweets + quote_tweets + replies_to_self) for each user

In [None]:
# One DataFrame of posts per username, keyed by username.
df = {}
for username in ui['username']:
    posts = pd.read_excel('data/' + username + '.xlsx')
    print(username,posts.shape[0],'posts')
    # shuffle reproducibly, then keep at most the first 100 posts
    posts = posts.sample(frac=1,random_state=0).head(100)
    # basic text normalisation: lower-case, then blank out newlines and HTML entities
    content = posts['content'].str.lower()
    for unwanted in ('\n','&amp;','&lt;','&gt;'):
        content = content.str.replace(unwanted,' ')
    posts['content'] = content
    df[username] = posts

Various functions to process the data

In [None]:
# This is what we define as "punctuation" here, i.e., the symbols that are either
# replaced by a space or padded with whitespace in process_text.
# Hyphens and apostrophes are deliberately kept so that words like "re-do" and "don't" stay intact.
p = string.punctuation.replace('-','').replace('\'','')
# The symbols must be escaped before being placed in a character class: string.punctuation
# contains the sequence `\]`, which an unescaped class reads as "escaped ]", so a literal
# backslash in the text would otherwise never be matched.
punctuation_re = '([' + re.escape(p) + '])'

def process_text(text,keep_punctuation=False,lemmatize=True):
    """Clean a string of tweet text and optionally lemmatize it.

    text: the raw text to process.
    keep_punctuation: if False, every symbol matched by punctuation_re is replaced
        by a space; if True, the symbol is kept but surrounded by spaces.
    lemmatize: if True, the text is run through the spaCy pipeline `nlp` and the
        lemmas of all tokens are joined with single spaces.

    Returns the processed string.
    """
    # strip URLs, emojis, mentions, smileys and hashtags
    pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
    cleaned = pp.clean(text)
    # either pad the punctuation symbols with spaces or drop them altogether
    replacement = r' \1 ' if keep_punctuation else ' '
    cleaned = re.sub(punctuation_re,replacement,cleaned)
    if not lemmatize:
        return cleaned
    return " ".join([token.lemma_ for token in nlp(cleaned)])

def count_ngrams(text,n=1):
    """Count the n-grams of a whitespace-separated string.

    For n == 1 the Counter keys are the tokens themselves (strings); for any other n
    the keys are tuples of n consecutive tokens.
    """
    tokens = text.split()
    if n == 1:
        return Counter(tokens)
    # zip the token list against its own shifted copies to form consecutive n-tuples
    shifted = [tokens[offset:] for offset in range(n)]
    return Counter(zip(*shifted))

def get_top_n_grams(text,top=10,ngram=1):
    """Return the most frequent n-grams of a text.

    Each entry of the returned list is a tuple (n-gram as a string, count, percentage),
    where the percentage is relative to the total number of n-grams in the text.
    Entries are ordered by decreasing count and the list is cut with [:top].
    """
    counts = count_ngrams(text,ngram)
    total = sum(counts.values())
    ranked = []
    for gram, count in counts.most_common():
        # n-grams with n>1 are tuples of tokens; show them as one space-separated string
        label = ' '.join(gram) if ngram>1 else gram
        ranked.append((label,count,100*count/total))
    return ranked[:top]

def get_keyness(corpus1,corpus2,n1,n2,ngram=1):
    """Compute a keyness score for every n-gram used by the authors of corpus1.

    corpus1: list of strings, one per author of the group of interest; each string is
        the concatenation of that author's tweets.
    corpus2: list of strings in the same format for the reference authors.
    n1, n2: sizes used to compute the expected counts (callers in this notebook pass
        the number of authors behind corpus1 and corpus2).
    ngram: length of the n-grams (1 = single tokens, keys are strings; >1 = keys are
        tuples of tokens).

    Returns a dict mapping each n-gram to a tuple (keyness, f1), where
        o1, o2  = number of authors of corpus1 / corpus2 using the n-gram at least once,
        e1, e2  = n1*(o1+o2)/(n1+n2) and n2*(o1+o2)/(n1+n2),
        keyness = 2*(o1*ln(o1/e1) + o2*ln(o2/e2)), a term being 0 when its o is 0,
        f1      = total number of occurrences of the n-gram in corpus1.
    """
    def author_counts(author):
        # n-gram frequencies of ONE author's text
        tokens = author.split()
        if ngram>1:
            tokens = list(zip(*[tokens[i:] for i in range(ngram)]))
        return Counter(tokens)

    word_freq_1 = [author_counts(author) for author in corpus1]
    word_freq_2 = [author_counts(author) for author in corpus2]

    # The vocabulary is the union of the per-author n-grams. Building it from the
    # concatenation of all authors would, for ngram>1, invent n-grams that span the
    # end of one author's text and the start of the next one.
    total_freq_1 = Counter()
    for freq in word_freq_1:
        total_freq_1.update(freq)

    keyness = {}
    for word, f1 in total_freq_1.items():
        o1 = sum(1 for freq in word_freq_1 if word in freq)
        o2 = sum(1 for freq in word_freq_2 if word in freq)
        # check number of tweets instead of number of authors for which it appears?
        e1 = n1*(o1+o2)/(n1+n2)
        e2 = n2*(o1+o2)/(n1+n2)
        p1 = o1*np.log(o1/e1) if o1>0 else 0
        p2 = o2*np.log(o2/e2) if o2>0 else 0
        keyness[word] = (2*(p1+p2),f1)
    return keyness

Compute keyness of words for each group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'
# n-gram length used for the keyness computation
ng = 1

# corpus1[group]: one cleaned (not lemmatized) string per author of the group
# n1[group]: number of authors in the group
corpus1 = {}
n1 = {}
for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    corpus1[group] = [
        process_text(df[username].content.str.cat(sep=' '),keep_punctuation=False,lemmatize=False)
        for username in usernames
    ]
    n1[group] = len(usernames)

# compare every group against the union of all the other groups
keyness = {}
for discipline in corpus1:
    others = [other for other in corpus1 if other != discipline]
    corpus2 = [author_text for other in others for author_text in corpus1[other]]
    n2 = sum(n1[other] for other in others)
    keyness[discipline] = get_keyness(corpus1[discipline],corpus2,n1[discipline],n2,ngram=ng)

# show the 10 highest-ranked entries per group; the 'keyness' column holds the
# (keyness, frequency) tuples returned by get_keyness
for group in ui[grouping].unique():
    print("==== " + group + " ====")
    dfk = pd.DataFrame(keyness[group].items(),columns=['word', 'keyness'])
    top_rows = dfk.sort_values(by=['keyness'],ascending=False).head(10)
    dfk = dfk.sort_values(by=['keyness'],ascending=False)
    print(top_rows.to_string(index=False),'\n')
    # optional export:
    #dfk.to_excel('results/keyness_' + str(ng) + '-gram_' + group + '.xlsx', index=None)

Merge all content (text without urls, emojis, mentions, smileys, hashtags) of all users within one group and export it to a txt file + export all tweets of each user to a txt file

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'pureApplied'
# the preprocessor options are global, so they only need to be set once
pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
for group in ui[grouping].unique():
    user_texts = []
    for username in ui[ui[grouping]==group].username.tolist():
        user_text = pp.clean(df[username].content.str.cat(sep=' '))
        user_texts.append(user_text)
        # `with` closes the file even if the write fails; the explicit encoding keeps
        # non-ASCII characters writable regardless of the platform default
        with open('results/' + username + '.txt','w',encoding='utf-8') as txt_file:
            txt_file.write(user_text)
    # join with a space so the last word of one user is not fused with the first
    # word of the next one
    tot_text = ' '.join(user_texts)
    with open('results/' + group + '.txt','w',encoding='utf-8') as txt_file:
        txt_file.write(tot_text)

Compute and display unigrams/bigrams/trigrams for each group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'pureApplied'

for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    # join the users' texts with a space so that the last word of one user and the
    # first word of the next one are not merged into a single token
    text = ' '.join(df[username].content.str.cat(sep=' ') for username in usernames)
    # unigrams are counted without punctuation ...
    processed_text = process_text(text,keep_punctuation=False,lemmatize=True)
    unigrams = get_top_n_grams(processed_text,50,1)
    # ... while bigrams/trigrams keep the (space-padded) punctuation symbols as tokens
    processed_text = process_text(text,keep_punctuation=True,lemmatize=True)
    bigrams = get_top_n_grams(processed_text,50,2)
    trigrams = get_top_n_grams(processed_text,50,3)
    # side-by-side table; concat pads with NaN instead of failing when one of the
    # lists is empty or shorter than the others
    stats = pd.concat([pd.DataFrame(unigrams,columns = ['unigram','#uni','%uni']),
                       pd.DataFrame(bigrams,columns = ['bigrams','#bi','%bi']),
                       pd.DataFrame(trigrams,columns = ['trigrams','#tri','%tri'])],axis=1)
    print('==== \033[1m' + group.upper() + '\033[0m ====')
    with pd.option_context('display.float_format','{:0.2f}'.format,'expand_frame_repr', False):
        print(stats)
    print('\n\n')
    # optional export:
    #stats.to_excel('results/ngrams_' + group + '.xlsx')

Compute the engagement rate for each group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'

for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    engagements = []
    follower_counts = []
    for username in usernames:
        posts = df[username]
        # follower count is read from the first row of the user's posts
        num_followers = posts['user.followersCount'].values[0]
        avg_engagement = posts['engagementRate'].mean()
        follower_counts.append(num_followers)
        engagements.append(avg_engagement)
        print(username + ' {:.2f}'.format(avg_engagement) + '% avg eng and ' + str(num_followers) + ' followers')

    # group summary: mean of the per-user averages ± standard error of the mean
    group_line = group.upper() + ' = {:.2f}'.format(np.mean(engagements)) + '% ± {:.2f}'.format(sem(engagements))
    print('\033[1m' + group_line + '\033[0m')
    print('Avg num of followers: {:.1f}'.format(sum(follower_counts)/len(usernames)))
    print('====================')

Average length of tweets (in characters) per user and group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'

for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    # lengths are measured after removing urls, emojis, mentions, smileys and hashtags
    pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
    avg_lengths = []
    for username in usernames:
        tweets = df[username]['content']
        cleaned_lengths = [len(pp.clean(tweet)) for tweet in tweets]
        avg_length_user = sum(cleaned_lengths)/len(tweets)
        print(username,'{:.1f}'.format(avg_length_user))
        avg_lengths.append(avg_length_user)
    # group summary: mean of the per-user averages ± standard error of the mean
    print('\033[1m' + group.upper(),'{:.1f}'.format(np.mean(avg_lengths)),
          ' ± {:.2f}'.format(sem(avg_lengths)) + '\033[0m\n')

How many single tweets/threads/quote per user and group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'pureApplied'
result_columns = ['-','#single','#thread','avg len thread','#quotes']

for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    # numeric per-user values, kept per column for the group summary row
    group_values = {column:[] for column in result_columns[1:]}
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is deprecated (removed in pandas 2.0) and copies the frame on every call
    rows = []
    for username in usernames:
        num_quote_tweets_user = sum(df[username]['quotedTweet.url']!='None')
        # number of posts per conversation: 1 = single tweet, >1 = thread
        counter = df[username]['conversationId'].value_counts()
        threads = counter[counter>1]
        num_single_tweets_user = sum(counter==1)
        num_threads_user = sum(counter>1)
        avg_threads_user = sum(threads)/len(threads) if len(threads) else 0

        group_values['#single'].append(num_single_tweets_user)
        group_values['#thread'].append(num_threads_user)
        group_values['avg len thread'].append(avg_threads_user)
        group_values['#quotes'].append(num_quote_tweets_user)
        rows.append({'-':username,'#single':num_single_tweets_user,'#thread':num_threads_user,
                     'avg len thread':'{:.1f}'.format(avg_threads_user),'#quotes':num_quote_tweets_user})

    # last row: group mean ± standard error of the mean for every column
    summary = {'-':group.upper()}
    for column, values in group_values.items():
        summary[column] = '{:.1f}'.format(sum(values)/len(values)) + '± {:.1f}'.format(sem(values))
    rows.append(summary)

    df_results = pd.DataFrame(rows,columns = result_columns)
    # optional export:
    #df_results.to_excel('results/tweet-type_' + group + '.xlsx',header=True,index=None)
    print(df_results,'\n')

How many emojis/hashtags/urls/mentions/photos/videos/gifs per user and group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'
result_columns = ['-','#emojis','#hashtags','#urls','#mentions','#photos','#videos','#gifs']

for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is deprecated (removed in pandas 2.0) and copies the frame on every call
    rows = []
    for username in usernames:
        counts = dict.fromkeys(result_columns[1:], 0)

        # emojis / hashtags / mentions: count the placeholders inserted by the tokenizer
        for tweet in df[username]['content']:
            tokens = pp.tokenize(tweet)
            counts['#emojis'] += tokens.count('$EMOJI')
            counts['#hashtags'] += tokens.count('$HASHTAG')
            counts['#mentions'] += tokens.count('$MENTION')

        # urls: the 'outlinks' cell holds the string form of a list of links (or 'None').
        # Links to twitter.com are ignored one by one; testing the raw string instead
        # would discard every link of a post as soon as one of them points to Twitter.
        for ols in df[username]['outlinks']:
            if ols=='None': continue
            links = ast.literal_eval(ols)
            counts['#urls'] += sum(1 for link in links if 'twitter.com' not in str(link))

        # photos / videos / gifs: the 'media' cell holds the string form of a list of
        # dicts whose '_type' entry names the media class (or 'None')
        for medias in df[username]['media']:
            if medias=='None': continue
            for media in ast.literal_eval(medias):
                if 'snscrape.modules.twitter.Photo' in media['_type']: counts['#photos'] += 1
                if 'snscrape.modules.twitter.Video' in media['_type']: counts['#videos'] += 1
                if 'snscrape.modules.twitter.Gif' in media['_type']: counts['#gifs'] += 1

        row = {'-':username}
        row.update(counts)
        rows.append(row)

    # last row: group mean ± standard error of the mean for every column
    summary = {'-':group.upper()}
    for column in result_columns[1:]:
        values = [row[column] for row in rows]
        summary[column] = '{:.1f}'.format(sum(values)/len(values)) + '± {:.1f}'.format(sem(values))
    rows.append(summary)

    df_results = pd.DataFrame(rows,columns = result_columns)
    # optional export:
    #df_results.to_excel('results/modalities_' + group + '.xlsx',header=True,index=None)
    print(df_results,'\n')

Measure the frequency (per 1000 words) of engagement types for each group

In [None]:
engagement_types = ['boosters','hedges','reader','self_mention','directives']
for engagement_type in engagement_types:
    print("======" + engagement_type + "======")
    # one search item per line of the file
    with open('engagement_search_items/' + engagement_type + '.txt') as file:
        engagement_items = [line.rstrip('\n') for line in file]

    #disciplinaryGroup,softHard,pureApplied
    grouping = 'disciplinaryGroup'
    for group in ui[grouping].unique():
        pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
        # all whitespace-separated tokens of the group's cleaned tweets
        group_tokens = []
        for username in ui[ui[grouping]==group].username.tolist():
            user_text = pp.clean(df[username].content.str.cat(sep=' '))
            group_tokens.extend(user_text.split())

        # an item only matches a token that is exactly equal to it
        num_hits = sum(group_tokens.count(word) for word in engagement_items)
        print(group,num_hits/len(group_tokens)*1000)

Measure the frequency (per 1000 words) of engagement types for each user and the averages per group

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'

engagement_types = ['boosters','hedges','reader','self_mention','directives']
# search items of every engagement type, one item per line of the corresponding file
engagement_items = {key:[] for key in engagement_types}
for engagement_type in engagement_types:
    with open('engagement_search_items/' + engagement_type + '.txt') as file:
        engagement_items[engagement_type] = [line.rstrip('\n') for line in file]

result_columns = ['-'] + ['#' + engagement_type for engagement_type in engagement_types]

for group in ui[grouping].unique():
    discipline_count =  {key:[] for key in engagement_types}
    usernames = ui[ui[grouping]==group].username.tolist()
    pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
    # rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append is deprecated (removed in pandas 2.0) and copies the frame on every call
    rows = []
    for username in usernames:
        user_text = pp.clean(df[username].content.str.cat(sep=' ')).split()

        row = {'-':username}
        for engagement_type in engagement_types:
            # an item only matches a token that is exactly equal to it
            num_hits = sum(user_text.count(word) for word in engagement_items[engagement_type])
            # frequency per 1000 tokens of this user
            rate = 1000*num_hits/len(user_text)
            discipline_count[engagement_type].append(rate)
            row['#' + engagement_type] = rate
        rows.append(row)

    # last row: group mean ± standard error of the mean of the per-user frequencies
    summary = {'-':group.upper()}
    for engagement_type in engagement_types:
        values = discipline_count[engagement_type]
        summary['#' + engagement_type] = ('{:.1f}'.format(sum(values)/len(values)) +
                                          '± {:.1f}'.format(sem(values)))
    rows.append(summary)

    df_results = pd.DataFrame(rows,columns = result_columns)
    # optional export:
    #df_results.to_excel('results/engagement_' + group + '.xlsx',header=True,index=None)
    print(df_results,'\n')
            

How many 'words' per group and overall (without counting url, emoji, mentions, smileys, hashtags)

In [None]:
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'

# tot: number of tokens of the current group, tot_all: number of tokens over all groups
tot_all = 0
for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    tot = 0
    for username in usernames:
        pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
        user_text = pp.clean(df[username].content.str.cat(sep=' '))
        # punctuation is kept and space-padded, so each punctuation symbol is a token of its own
        user_text = process_text(user_text,keep_punctuation=True,lemmatize=False)
        tot = tot + len(user_text.split())
    tot_all = tot_all + tot
    print(group,tot)
# grand total over all groups (not just the last group's count)
print('ALL =', tot_all)

How many occurrences of the symbol `s` (e.g. question marks or exclamation marks) per 1000 words for each user and group

In [None]:
# the symbol whose frequency (per 1000 whitespace-separated tokens) is measured
s = "!"
#disciplinaryGroup,softHard,pureApplied
grouping = 'disciplinaryGroup'
for group in ui[grouping].unique():
    pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
    group_texts = []
    num_tokens_group = 0
    for username in ui[ui[grouping]==group].username.tolist():
        user_text = pp.clean(df[username].content.str.cat(sep=' '))
        num_tokens_user = len(user_text.split())
        group_texts.append(user_text)
        num_tokens_group = num_tokens_group + num_tokens_user
        print(username,str(round(user_text.count(s)/num_tokens_user*1000,2)))
    # the group figure is computed on the users' texts glued together without separator
    group_frequency = ''.join(group_texts).count(s)/num_tokens_group*1000
    print(group,str(round(group_frequency,2)))
    print("========")

Average sentiment (vader-compound) for each group

In [None]:
analyzer = SentimentIntensityAnalyzer()

#disciplinaryGroup,softHard,pureApplied
grouping = 'pureApplied'
for group in ui[grouping].unique():
    usernames = ui[ui[grouping]==group].username.tolist()
    pp.set_options(pp.OPT.URL, pp.OPT.EMOJI, pp.OPT.MENTION, pp.OPT.SMILEY, pp.OPT.HASHTAG)
    # one compound score per cleaned tweet, pooled over all users of the group
    compounds = [analyzer.polarity_scores(pp.clean(tweet))['compound']
                 for username in usernames
                 for tweet in df[username]['content']]
    mean = np.mean(compounds)
    # group mean ± standard error of the mean over the tweets
    print(group + " " + str(round(mean,4)) + "±" + str(round(sem(compounds),4)))

(Some unfinished code that tests the measurement of p-values)

In [None]:
from scipy.stats import t as ttable, ttest_ind

# three hard-coded samples used to try out the statistics
a = [38,16,0,2,21,51,66,19,5,7]
b = [44,85,62,3,11,0,0,22,9,2]
c = [3,0,0,8,15,0,1,0,20,1]

# standard error of the mean of each sample
sea, seb, sec = [sem(sample) for sample in (a, b, c)]
print(sea,seb,sec)

# two-sample t-test between b and c, assuming equal variances
result = ttest_ind(b,c,equal_var=True)
# '+++' for p <= 0.02, '+' for p > 0.1, '++' for anything else
if result.pvalue <= 0.02:
    marker = '+++'
elif result.pvalue > 0.1:
    marker = '+'
else:
    marker = '++'
print(marker)
print(result.pvalue)