In [1]:
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm
import seaborn as sns
import pandas as pd
pd.set_option('display.max_rows', 500)

In [3]:
# Input CSVs: the per-post language annotations and the accounts master file.
COMBINED_LANG_PATH = 'bbcm-lang-analysis_combined-all-posts-lang-annotations-20201014.csv'
MASTER_FILE = 'all_accounts_master_file-v2.csv'

# Load both files into DataFrames in one pass.
comb_lang_df, master_file = (
    pd.read_csv(csv_path) for csv_path in (COMBINED_LANG_PATH, MASTER_FILE)
)


In [4]:
# Number of rows in the master file.
master_file.shape[0]

384

In [5]:
# Drop the 'combined/' prefix from the columns used below, then parse the
# creation timestamps; values that cannot be parsed become NaT.
df = comb_lang_df.rename(
    columns={
        'combined/createdDate': 'createdDate',
        'combined/uniqueUserId': 'uniqueUserId',
        'combined/userHandle': 'userHandle',
        'combined/text': 'text',
    }
)
df = df.assign(createdDate=pd.to_datetime(df['createdDate'], errors='coerce'))


### Choose a time frame 

In [6]:
# Restrict the analysis to posts created in September 2020.
df_year = df.loc[df['createdDate'].dt.year == 2020]
df_month = df_year.loc[df_year['createdDate'].dt.month == 9]
# From here on comb_lang_df refers to the filtered frame.
comb_lang_df = df_month

In [7]:
# Number of posts left after the time-frame filter.
print(comb_lang_df.shape[0])

54328


In [8]:
# Attach the master-file columns to every post by joining on the user id.
# NOTE(review): this join runs before rows with a missing 'language' are
# dropped, so the joined frame still contains them.
posts_by_user = comb_lang_df.set_index('uniqueUserId')
accounts_by_user = master_file.set_index('combined/uniqueUserId')
combined_tw_fb_added_columns = posts_by_user.join(
    accounts_by_user,
    lsuffix='_left',
    rsuffix='_right',
)


# Language Proportions

In [9]:

# Keep only the posts that carry a language annotation.
comb_lang_df = comb_lang_df[comb_lang_df['language'].notna()]

In [10]:
# Get all used language tags (each tag once; a missing tag would appear as 'EMPTY')
comb_langs = comb_lang_df['language'].fillna('EMPTY').unique().tolist()

In [11]:
# Languages across posts
def lang_from_posts(df, lang_col):
    """Count how many posts carry each language tag.

    Args:
        df: DataFrame with one row per post.
        lang_col: Name of the column holding the language tag.

    Returns:
        DataFrame with two columns, ``lang_col`` and 'tweets-post_count',
        ordered from the most to the least frequent language.

    To report percentages instead of raw counts, divide the counts by
    ``len(df)`` and multiply by 100.
    """
    post_counts = df.value_counts(lang_col)
    post_counts = post_counts.rename_axis(lang_col)
    return post_counts.reset_index(name='tweets-post_count')

# Share of users using each language (at least once)
def get_users_using_lang(df, languages, lang_col, user_col):
    """Compute, per language, the percentage of users who used it at least once.

    Args:
        df: DataFrame with one row per post.
        languages: Iterable of language tags to report on. A tag that never
            occurs in ``df`` is reported with 0.0.
        lang_col: Name of the column holding the language tag.
        user_col: Name of the column identifying the posting user.

    Returns:
        DataFrame with columns 'language' and 'used_by_percent' (rounded to
        three decimals), sorted by 'used_by_percent' descending. When
        ``languages`` is empty the result is an empty frame with the same two
        columns; when ``df`` has no users every percentage is 0.0.
    """
    # The denominator does not depend on the language, so compute it once.
    total_num_users = len(df.drop_duplicates(user_col))

    data = []
    for lang in languages:
        users_of_lang = df.loc[df[lang_col] == lang].drop_duplicates(user_col)
        if total_num_users:
            percent = round(len(users_of_lang) / total_num_users * 100, 3)
        else:
            # No users at all: avoid dividing by zero.
            percent = 0.0
        data.append(dict(language=lang, used_by_percent=percent))

    # Fixing the columns keeps sort_values working when `data` is empty.
    result = pd.DataFrame(data, columns=['language', 'used_by_percent'])
    return result.sort_values('used_by_percent', ascending=False)

In [14]:
# Post counts per language and share of users per language.
comb_lang_per_posts = lang_from_posts(df=comb_lang_df,
                                      lang_col='language')
comb_lang_per_user = get_users_using_lang(df=comb_lang_df,
                                          languages=comb_langs,
                                          lang_col='language',
                                          user_col='uniqueUserId')

# Both tables are keyed by language; after the join the language is the index.
combined_all_lang_propotion = comb_lang_per_posts.set_index('language').join(comb_lang_per_user.set_index('language'))
# The index holds the language codes, so it must be written out: passing
# index=False here would produce a CSV with counts and percentages but no
# language column.
combined_all_lang_propotion.to_csv('lang_analysis-2020-10-22/language break down /language_summary_statistics.csv')


In [16]:
# Preview the first five languages of the summary table.
combined_all_lang_propotion.head(5)

Unnamed: 0_level_0,tweets-post_count,used_by_percent
language,Unnamed: 1_level_1,Unnamed: 2_level_1
en,33247,77.586
ar,5527,8.966
es,4137,14.138
zh-tw,2659,9.655
fr,2343,16.552


# User Language Proportion

In [17]:
def top_n_to_dict(most_common, prefix='top_lang'):
    """Flatten ranked (language, count) pairs into one flat dict.

    The pair at 1-based rank ``r`` contributes two entries, in this order:
    ``{prefix}_{r}`` -> language and ``{prefix}_count_{r}`` -> count.

    Args:
        most_common: Iterable of (language, count) pairs, e.g. the result of
            ``Counter.most_common``.
        prefix: Prefix used for the generated keys.

    Returns:
        dict with two entries per input pair; empty when there are no pairs.
    """
    flat = {}
    for rank, pair in enumerate(most_common, start=1):
        language, occurrences = pair
        flat.update({
            f'{prefix}_{rank}': language,
            f'{prefix}_count_{rank}': occurrences,
        })
    return flat

In [18]:
def lang_user_propotion(df, user_col, lang_col, join_col):
    """Summarise the three most-used languages of every user.

    Args:
        df: DataFrame with one row per post.
        user_col: Name of the column identifying the posting user.
        lang_col: Name of the column holding the language tag. Missing
            values are ignored when counting.
        join_col: Name of a column of ``df`` (e.g. the user handle) to attach
            to each user's row.

    Returns:
        DataFrame with the columns ``join_col``, ``user_col``, 'top_lang_1',
        'top_lang_count_1', 'top_lang_2', 'top_lang_count_2', 'top_lang_3',
        'top_lang_count_3', ordered by 'top_lang_count_1' descending and
        holding one row per distinct ``join_col`` value. Slots for which a
        user has no language are NaN; all columns are present even when no
        user has a second or third language, or when ``df`` is empty.
    """
    top_cols = [
        'top_lang_1',
        'top_lang_count_1',
        'top_lang_2',
        'top_lang_count_2',
        'top_lang_3',
        'top_lang_count_3',
    ]

    u_data = []
    for u_id in tqdm(df[user_col].unique()):
        # A missing annotation is not a language, so it is not counted.
        user_langs = df.loc[df[user_col] == u_id, lang_col].dropna()
        top_langs = top_n_to_dict(Counter(user_langs.values).most_common(3))
        u_data.append({'user_id': u_id, **top_langs})

    # Fixing the columns guarantees the full schema (and a sortable
    # 'top_lang_count_1') regardless of how many languages were observed.
    _df = pd.DataFrame(u_data, columns=['user_id', *top_cols])
    _df = _df.sort_values('top_lang_count_1', ascending=False)

    # The right side has one row per post, so this join repeats every user
    # once per post; the duplicates are removed below.
    _df = _df.set_index('user_id').join(df.set_index(user_col)[join_col], how='left')

    # Name the index explicitly instead of relying on reset_index() emitting
    # a column called 'index', which only happens when the joined index is
    # unnamed.
    _df = _df.rename_axis(user_col).reset_index()

    # NOTE(review): de-duplicating on join_col also merges distinct users
    # that share the same join_col value (or have it missing) - confirm this
    # is intended.
    _df = _df.drop_duplicates(subset=[join_col])

    return _df[[join_col, user_col, *top_cols]]


In [19]:
def lang_category_propotion(df, cat_col, lang_col, join_col):
    """Summarise the three most-used languages of every category.

    Args:
        df: DataFrame with one row per post.
        cat_col: Name of the column holding the category to group by.
        lang_col: Name of the column holding the language tag. Missing
            values are ignored when counting.
        join_col: Not used by this function; kept so existing calls keep
            working.

    Returns:
        DataFrame with the columns 'user_id' (which holds the category
        value; the name is kept for output compatibility), 'top_lang_1',
        'top_lang_count_1', 'top_lang_2', 'top_lang_count_2', 'top_lang_3',
        'top_lang_count_3', ordered by 'top_lang_count_1' descending. Slots
        for which a category has no language are NaN; all columns are
        present even when ``df`` is empty.
    """
    top_cols = [
        'top_lang_1',
        'top_lang_count_1',
        'top_lang_2',
        'top_lang_count_2',
        'top_lang_3',
        'top_lang_count_3',
    ]

    rows = []
    for category in tqdm(df[cat_col].unique()):
        # A missing category never compares equal to itself, so it selects
        # no rows and yields a row whose language slots are all empty.
        # A missing annotation is not a language, so it is not counted.
        cat_langs = df.loc[df[cat_col] == category, lang_col].dropna()
        top_langs = top_n_to_dict(Counter(cat_langs.values).most_common(3))
        rows.append({'user_id': category, **top_langs})

    # Fixing the columns guarantees the full schema (and a sortable
    # 'top_lang_count_1') regardless of how many languages were observed.
    _df = pd.DataFrame(rows, columns=['user_id', *top_cols])
    return _df.sort_values('top_lang_count_1', ascending=False)


In [20]:
# Top three languages per user, with the user handle attached.
comb_lang_user_propotion = lang_user_propotion(
    df=comb_lang_df,
    user_col='uniqueUserId',
    lang_col='language',
    join_col='userHandle',
)

100%|██████████| 290/290 [00:01<00:00, 272.11it/s]


In [None]:
# Enrich the per-user language table with the master-file columns, mark every
# missing value as 'N/A', and export it (the user id index is written too).
accounts_by_user = master_file.set_index('combined/uniqueUserId')
combined_language_break_down_based_on_user = (
    comb_lang_user_propotion
    .set_index('uniqueUserId')
    .join(accounts_by_user, lsuffix='_left', rsuffix='_right')
    .fillna('N/A')
)
combined_language_break_down_based_on_user.to_csv('lang_analysis-2020-10-22/language break down /user_language_break_down.csv')



In [None]:

# Top three languages per account category, computed on the posts joined
# with the master file.
comb_lang_cat_propotion = lang_category_propotion(
    df=combined_tw_fb_added_columns,
    cat_col='account_category',
    lang_col='language',
    join_col='account_category',
)


In [None]:
# Mark empty slots as 'N/A' and export; the row index carries no information
# here, so it is left out of the file.
comb_lang_cat_propotion = comb_lang_cat_propotion.fillna(value='N/A')
comb_lang_cat_propotion.to_csv(
    'lang_analysis-2020-10-22/language break down /category_language_break_down.csv',
    index=False,
)

