In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
from collections import Counter

# Local imports
from utils import get_top_users, user_mask, count_top_labels

# Show full tweet text in DataFrame output instead of truncating it.
# `None` is the supported "no limit" value; the old `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by newer releases.
pd.set_option('display.max_colwidth', None)
# Cap the number of rows rendered for any displayed DataFrame.
pd.set_option('display.max_rows', 20)

# Load Zeerak NAACL tweets

In [2]:
# Location of the pickled tweet table (directory keeps its trailing slash
# because later paths are built by plain string concatenation/formatting).
data_path = 'data/zeerak_naacl/'
fname = data_path + 'zeerak_naacl_tweets.pkl'

# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(fname)

# Preview: .loc slices are label-based and end-inclusive, so this shows
# every row up to and including index label 5.
df.loc[:5, ['text', 'label']]

Unnamed: 0,text,label
0,"@ummayman90 @wood_brwood333 For example, in Medina Mohammed said, kindly cut off the heads of these 600 Jewish men and enslave their women.",racism
1,"@ummayman90 Again, your entire concept of god corresponds to a tyrannical earthly egomaniac because you are simple and stupid. #Islam",racism
2,"@anjemchoudary Your prophet was a rapist, murderer, pedophile, caravan robber, slave trader, bigot and sexist. God would never use the scum.",racism
3,"RT @DilanaKurdi: Yazidi children who are taken from their parents, forcibly converted to Islam! #Sinjar #Shengal http://t.co/wwzplHld7f",racism
4,@anjemchoudary Accepting the idea that Allah obsesses over how we take a dump shows just how utterly insane this inbred religion is.,racism
5,"These girls are the equivalent of the irritating Asian girls a couple years ago. Well done, 7. #MKR",racism


# Poking Around

In [3]:
# Total number of tweets (one per row of `df`); used as the denominator
# for the percentages computed in later cells.
n_tweets = df.shape[0]
# Display name -> value it is matched against in the 'label' column.
attrs = dict(racist='racism', sexist='sexism', neither='none')

# Display name -> color name (presumably for plotting; not used in the
# cells visible here).
colors = dict(racist='red', sexist='cyan', neither='grey')

In [4]:
# Percentage of the corpus carrying each label, keyed by display name.
# Tally every label in one vectorized pass instead of re-scanning the
# column with Python's builtin sum() once per label.
label_totals = df['label'].value_counts()

attr_percs = {}
for name, attr in attrs.items():
    # A label that never occurs in the data counts as zero.
    n_attr = int(label_totals.get(attr, 0))
    # Guard the empty-corpus case so the cell reports nan instead of raising.
    perc = 100. * n_attr / n_tweets if n_tweets else float('nan')
    print("{0:5} of {1:5} tweets are {2:7} ({3:5.2f}%)".format(
        n_attr, n_tweets, name, perc))
    attr_percs[name] = perc

 1918 of 15813 tweets are racist  (12.13%)
 3042 of 15813 tweets are sexist  (19.24%)
10853 of 15813 tweets are neither (68.63%)


### Looking at user distributions

In [5]:
# Combined share of the corpus written by the four most active users.
# get_top_users is unpacked as (user, tweet count) pairs.
topn = 4
n_top_posts = sum(count for _, count in get_top_users(df, topn))
perc = 100. * n_top_posts / n_tweets
print(
    "{0:.2f}% of {1} tweets are from the top {2} users\n".format(
        perc, n_tweets, topn)
)

# Per-user breakdown; one extra user is listed to show how sharply the
# counts fall off after the top four.
topn = 5
topn_users = get_top_users(df, topn)
for user, n in topn_users:
    perc = 100. * n / n_tweets
    print(
        "User {0:^14} has {1:4} tweets ({2:5.2f}% of total)".format(
            user, n, perc)
    )

72.85% of 15813 tweets are from the top 4 users

User   VileIslam    has 4423 tweets (27.97% of total)
User randileeharper has 3813 tweets (24.11% of total)
User YesYoureSexist has 2255 tweets (14.26% of total)
User     MT8_9      has 1029 tweets ( 6.51% of total)
User     SKR_16     has   45 tweets ( 0.28% of total)


In [6]:
# Build a per-user table of tweet counts: total plus one column per label.
#
# Rows are collected in plain lists and the frame is constructed once.
# Growing a DataFrame one row at a time through .loc re-allocates on every
# insert and leaves the counts in object-dtype columns.
cols = pd.Index(['total', 'racist', 'sexist', 'neither'], name='tweet_count')

users = []
rows = []
# .unique() yields users in order of first appearance, so the row order is
# reproducible across runs (iterating a set of strings is not).
for user in df['user_screen_name'].unique():
    # presumably the subset of `df` written by `user` -- confirm in utils
    user_tweets = user_mask(df, user)
    # Counter returns 0 for labels this user never received.
    label_counts = Counter(user_tweets['label'])
    users.append(user)
    rows.append({'total': len(user_tweets),
                 'racist': label_counts['racism'],
                 'sexist': label_counts['sexism'],
                 'neither': label_counts['none']})

idx = pd.Index(users, name='user')
users_df = pd.DataFrame(rows, index=idx, columns=cols)

In [7]:
# Persist the per-user counts next to the source data for reuse.
users_df.to_pickle(data_path + 'user_post_distrs.pkl')

In [8]:
# For each label, report how many of its tweets come from the top N users
# (the third argument is N, judging by the printed output).
# NOTE(review): count_top_labels lives in utils and is not shown here; it
# appears to print its summary rather than return it -- confirm.
count_top_labels(users_df, 'racist', 1)
count_top_labels(users_df, 'sexist', 2)
count_top_labels(users_df, 'neither', 3)

1904 of  1918 racist  tweets are from the top  1 users (99.27%)
2244 of  3042 sexist  tweets are from the top  2 users (73.77%)
7218 of 10853 neither tweets are from the top  3 users (66.51%)
