In [1]:
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
import seaborn as sns
import pandas as pd
pd.set_option('display.max_rows', 500)

In [1]:
# Separate tw/fb annotation paths, currently disabled (presumably Twitter and
# Facebook exports — confirm); only the combined file below is active.
# TW_LANG_PATH = 'bbcm-lang-analysis_tw-all-tweets-lang-annotations.csv' 
# FB_LANG_PATH = 'bbcm-lang-analysis_fb-all-posts-lang-annotations.csv' 
# Combined annotation CSV (file name carries the date stamp 20201014).
COMBINED_LANG_PATH = 'bbcm-lang-analysis_combined-all-posts-lang-annotations-20201014.csv' 


In [3]:
# NOTE(review): these two loads are commented out, yet later cells read
# tw_lang_df and fb_lang_df. No visible cell assigns them, so a fresh
# "Restart & Run All" would presumably stop with a NameError — confirm and
# either restore these loads or derive both frames from comb_lang_df.
# tw_lang_df = pd.read_csv(TW_LANG_PATH)
# fb_lang_df = pd.read_csv(FB_LANG_PATH)
# Load the combined annotation CSV (not referenced again in the visible cells).
comb_lang_df = pd.read_csv(COMBINED_LANG_PATH)


# Language Proportions

In [4]:
# Keep only the posts that carry a language annotation
tw_lang_df = tw_lang_df[tw_lang_df['language'].notna()]
fb_lang_df = fb_lang_df[fb_lang_df['language'].notna()]


In [5]:
# Get all used language tags (one list of distinct values per frame).
# Missing values are replaced by the 'EMPTY' placeholder before de-duplication;
# the resulting list order is whatever the intermediate set yields.
tw_langs = list(set(tw_lang_df['language'].fillna('EMPTY').values))
fb_langs = list(set(fb_lang_df['language'].fillna('EMPTY').values))

In [14]:
# Languages across posts
def lang_from_posts(df, lang_col):
    """Count how many rows (posts) fall under each value of `lang_col`.

    Returns a DataFrame with two columns: `lang_col` itself and
    `post_count`, the number of rows of `df` holding that value.
    """
    per_language = df.value_counts(lang_col)
    per_language = per_language.rename_axis(lang_col)
    return per_language.reset_index(name='post_count')
# For percentages instead of raw counts:
# round(((tw_lang_df.value_counts('language')/len(tw_lang_df))*100),5)

# Share of users (in percent) using each language at least once
def get_users_using_lang(df, languages, lang_col, user_col):
    """Compute, per language, the percentage of users who posted in it at least once.

    Args:
        df: DataFrame with one row per post.
        languages: iterable of language tags to report on.
        lang_col: name of the column holding the language tag.
        user_col: name of the column identifying the posting user.

    Returns:
        DataFrame with the columns `language` and `used_by`, where `used_by`
        is the percentage of all distinct users in `df` that have at least
        one row with that language (rounded to 3 decimals). Rows are sorted
        by `used_by` in descending order. An empty `languages` yields an
        empty frame with those two columns; a `df` without any users yields
        0.0 for every requested language.
    """
    # The denominator does not depend on the language, so compute it once.
    total_num_users = len(df.drop_duplicates(user_col))

    data = []
    for lang in languages:
        num_lang_users = len(df[df[lang_col] == lang].drop_duplicates(user_col))
        # Guard the division: an empty frame has no users at all.
        if total_num_users:
            share = round(num_lang_users / total_num_users * 100, 3)
        else:
            share = 0.0
        data.append(dict(language=lang, used_by=share))

    # Explicit columns keep sort_values working when `languages` is empty.
    result = pd.DataFrame(data, columns=['language', 'used_by'])
    return result.sort_values('used_by', ascending=False)

In [15]:
# Post counts per language, one table per frame
tw_lang_per_posts = lang_from_posts(df=tw_lang_df, lang_col='language')
fb_lang_per_posts = lang_from_posts(df=fb_lang_df, lang_col='language')

In [16]:
# Per-language user statistics; each frame has its own user-id column
tw_lang_per_user = get_users_using_lang(
    df=tw_lang_df,
    languages=tw_langs,
    lang_col='language',
    user_col='twitter.user/id',
)

fb_lang_per_user = get_users_using_lang(
    df=fb_lang_df,
    languages=fb_langs,
    lang_col='language',
    user_col='crowdtangle.account/platformId',
)

In [17]:
# Put the per-post and per-user tables side by side, indexed by language
combined_tw_lang_propotion = (
    tw_lang_per_posts
    .set_index('language')
    .join(tw_lang_per_user.set_index('language'))
)
combined_fb_lang_propotion = (
    fb_lang_per_posts
    .set_index('language')
    .join(fb_lang_per_user.set_index('language'))
)

In [25]:
# Write both combined tables to CSV files in the current working directory
# (the index is written too, since to_csv is called with its defaults).
combined_tw_lang_propotion.to_csv('combined_tw_lang_propotion.csv')
combined_fb_lang_propotion.to_csv('combined_fb_lang_propotion.csv')

# User Language Proportion

In [20]:
def top_n_to_dict(most_common, prefix='top_lang'):
    """Flatten ranked (language, count) pairs into one flat dict.

    The pair at 1-based rank r contributes two entries, in this order:
    `{prefix}_{r}` -> language and `{prefix}_count_{r}` -> count.
    """
    flat = {}
    for rank, pair in enumerate(most_common, start=1):
        lang, count = pair
        flat.update({
            f'{prefix}_{rank}': lang,
            f'{prefix}_count_{rank}': count,
        })
    return flat

In [21]:
def lang_user_propotion(df, user_col, lang_col, join_col):
    """Build a per-user table of the (up to) three most frequent languages.

    Args:
        df: DataFrame with one row per post.
        user_col: name of the column identifying the posting user.
        lang_col: name of the column holding the language tag.
        join_col: name of a per-user descriptive column (e.g. a handle) to
            attach to every user row.

    Returns:
        DataFrame with the columns `join_col`, `user_col`, `top_lang_1`,
        `top_lang_count_1`, ..., `top_lang_3`, `top_lang_count_3`, ordered by
        `top_lang_count_1` descending and carrying a fresh 0..n-1 index.
        A user gets one row per distinct `join_col` value seen for them.
        Ranks a user does not reach are left as NaN. Rows whose `user_col`
        is missing are skipped (groupby drops missing keys by default).
        An empty `df` yields an empty frame with the same columns.
    """
    top_cols = [
        'top_lang_1',
        'top_lang_count_1',
        'top_lang_2',
        'top_lang_count_2',
        'top_lang_3',
        'top_lang_count_3',
    ]
    out_cols = [join_col, user_col] + top_cols

    u_data = []
    # groupby hands over each user's rows once instead of re-filtering the
    # whole frame for every user id.
    for u_id, u_df in tqdm(df.groupby(user_col, sort=False)):
        lang_counts = Counter(u_df[lang_col].values).most_common(3)
        u_data.append({user_col: u_id, **top_n_to_dict(lang_counts)})

    if not u_data:
        return pd.DataFrame(columns=out_cols)

    # reindex guarantees all six top-language columns exist even when no user
    # posted in three different languages.
    per_user = (
        pd.DataFrame(u_data)
        .reindex(columns=[user_col] + top_cols)
        .sort_values('top_lang_count_1', ascending=False)
    )

    # De-duplicate on the (user, join_col) pair: de-duplicating on join_col
    # alone would merge different users that share a value (or all lack one).
    user_labels = df[[user_col, join_col]].drop_duplicates()

    # Merging on an explicit key column avoids depending on how an index join
    # names the resulting index.
    result = per_user.merge(user_labels, on=user_col, how='left')
    return result[out_cols]


In [22]:
# Top languages per user; each frame has its own id and handle columns
tw_lang_user_propotion = lang_user_propotion(
    df=tw_lang_df,
    user_col='twitter.user/id',
    lang_col='language',
    join_col='twitter.user/screenName',
)

fb_lang_user_propotion = lang_user_propotion(
    df=fb_lang_df,
    user_col='crowdtangle.account/platformId',
    lang_col='language',
    join_col='crowdtangle.account/handle',
)



100%|██████████| 224/224 [00:00<00:00, 816.68it/s]
100%|██████████| 78/78 [00:00<00:00, 1532.41it/s]


In [24]:
# Write both per-user tables to CSV files in the current working directory
# (the index is written too, since to_csv is called with its defaults).
tw_lang_user_propotion.to_csv('tw_lang_user_propotion.csv')
fb_lang_user_propotion.to_csv('fb_lang_user_propotion.csv')

In [None]:
# user_ids = list(set(tw_lang_df['twitter.user/id'].values))

# u_data = []

# for u_id in tqdm(user_ids):
#     u_df = tw_lang_df[tw_lang_df['twitter.user/id'] == u_id].copy()
    
#     lang_counts = Counter(u_df['language'].values).most_common(3)
#     top_langs = top_n_to_dict(lang_counts)
    
#     row_dict = {**dict(user_id=u_id), **top_langs}
#     u_data.append(row_dict)

# _df = pd.DataFrame(u_data).sort_values('top_lang_count_1', ascending=False)
# _df = _df.set_index('user_id').join(tw_lang_df.set_index('twitter.user/id')['twitter.user/screenName'], how='left')

In [None]:
# _df = _df.reset_index().drop_duplicates(subset=['twitter.user/screenName'])
# _df = _df.rename(columns={'index': 'twitter.user/id'})
# _df = _df[[
#     'twitter.user/screenName',
#     'twitter.user/id',
#     'top_lang_1',
#     'top_lang_count_1',
#     'top_lang_2',
#     'top_lang_count_2',
#     'top_lang_3',
#     'top_lang_count_3',
# ]]

In [None]:
# `_df` is never assigned at notebook level (its only assignments are commented
# out), so display the per-user Twitter table computed by lang_user_propotion.
tw_lang_user_propotion

In [None]:
sns.set(font_scale=1.5)

# Bar chart of the number of posts per language; keep the Axes so the labels
# are set on this figure explicitly rather than on pyplot's "current" axes.
ax = tw_lang_df.value_counts('language').plot(kind='bar', figsize=(12, 6), rot=90)

ax.set_xlabel("Language", labelpad=14)
# The bar heights are post counts, so the y axis is labelled as such.
ax.set_ylabel("Number of Posts", labelpad=14)
ax.set_title("Twitter Post Language", y=1.02);

In [None]:
sns.set(font_scale=1.5)

# Same chart with the 'en' rows removed so the remaining languages are readable.
ax = tw_lang_df[tw_lang_df['language'] != 'en'].value_counts('language').plot(kind='bar', figsize=(12, 6), rot=90)

ax.set_xlabel("Language", labelpad=14)
# The bar heights are post counts, so the y axis is labelled as such.
ax.set_ylabel("Number of Posts", labelpad=14)
ax.set_title("Twitter Post Language (Without EN)", y=1.02);