In [29]:
# Tweet sample to load; the loader cell parses it as one JSON object per line.
FILENAME = 'tweet_sample_20190522c.json'
# Directory for the input file and the generated CSVs. Paths are built by plain
# string concatenation, so the trailing slash is required.
DATAPATH = '../data/'
XFERPATH = '~/downloads'  # To transfer files to Google Colab

In [2]:
import json
import pandas as pd
import gender_guesser.detector as gender

In [3]:
# From https://gist.github.com/timothyrenner/dd487b9fd8081530509c

#Gets the text, sans links, hashtags, mentions, media, and symbols.
def get_text_cleaned(tweet):
    """Return the tweet's text with every annotated entity span removed.

    The spans come from ``tweet['entities']`` under the keys ``urls``,
    ``hashtags``, ``user_mentions``, ``media`` and ``symbols``. Each entity's
    ``indices`` pair is used as a ``[start, stop)`` slice into ``tweet['text']``.
    Only the spans themselves are cut, so whitespace that surrounded an entity
    remains in the result.

    Args:
        tweet: Mapping with a ``text`` string and, optionally, an ``entities``
            mapping of entity lists. A missing or empty ``entities`` (or entity
            list) means nothing is stripped.

    Returns:
        The text with all entity spans removed.

    Raises:
        KeyError: If ``text`` is missing, or an entity has no ``indices``.
    """
    text = tweet['text']
    entities = tweet.get('entities') or {}

    # Gather the (start, stop) span of every entity we want to drop.
    spans = []
    for kind in ('urls', 'hashtags', 'user_mentions', 'media', 'symbols'):
        for entity in entities.get(kind) or []:
            indices = entity['indices']
            spans.append((indices[0], indices[1]))

    # Merge duplicate/overlapping/adjacent spans first. Cutting two overlapping
    # spans one after the other would shift the text under the second cut and
    # delete characters that were never part of an entity.
    merged = []
    for start, stop in sorted(spans):
        if merged and start <= merged[-1][1]:
            merged[-1][1] = max(merged[-1][1], stop)
        else:
            merged.append([start, stop])

    # Cut from the end of the text backwards so earlier offsets stay valid.
    for start, stop in reversed(merged):
        text = text[:start] + text[stop:]

    return text

In [4]:
# Parallel lists, one entry per tweet in the sample file.
names = []
texts = []
userids = []
# Read explicitly as UTF-8 rather than the platform default encoding, which can
# fail on or mangle emoji and other non-ASCII characters.
with open(DATAPATH+FILENAME, encoding='utf-8') as twitfile:
    for line in twitfile:
        # json.loads rejects empty input, so skip blank lines instead of crashing.
        if not line.strip():
            continue
        r = json.loads(line)
        names.append(r['user']['name'])
        userids.append(r['user']['id'])
        texts.append(get_text_cleaned(r))

In [5]:
# The three parallel lists must be the same length before building the frame.
tuple(len(seq) for seq in (names, texts, userids))

(3054, 3054, 3054)

In [17]:
# Index by user id and keep only the first tweet seen for each id.
df = (
    pd.DataFrame({'name': names, 'text': texts}, index=userids)
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)
df.head(10)

Unnamed: 0,name,text
1125464470074142720,Jordan 🐝,We stan a flawless queen
1045196569056669704,Amanda ❤💙,Can they please get LeBron to travel back the...
774717684009369600,Yvonne Conte,So what happens when a Republican elected Pre...
27596224,Steve McMillan,He is simply parroting something he read whi...
1094628188335026176,Joe Murphy MAGA,…
334381911,Sunshine,No get rid of Mr Glass aka G. Stanton
334565594,Keith Lemon,Good job ya din’t blink!
17491245,Dina,"Well, I think that was ONE of the plots. G..."
43751161,Zeke D,i don’t understand tik tok
360577036,Elizabeth Booker,Ive just sat and watched them all in one go. R...


In [18]:
# Last ten rows of the de-duplicated frame, as a spot check of the end of the sample.
df.tail(10)

Unnamed: 0,name,text
4173790121,Ron Fulton,The only thing McConnell is bringing to the fl...
1216262840,"Matt Joass, CFA",What's your estimate of the real number as a...
441839956,Chloe Smith,"I don't know about you, but I'm feeling 22 ✌✌ ..."
1341184718,Noa,My life would be so much easier if my work spa...
21704373,Josh Goldman,"So, liked my tweet about asking about my invi..."
363202108,Yusuke Urameshi,She definitely needs to be in somebody's comme...
30445664,Sarah Cowan,yes. that's when other people should worry an...
1045294418184556544,Juan the Kid 🌃,Marvin’s room
558899882,Heather,Oh! TY.
44170399,Tim,I agree


In [19]:
# Spot-check one cleaned tweet: only the entity spans are cut, so the whitespace
# that surrounded them is still present in the text.
texts[3]

'  He is simply parroting something he read while on the toilet. Where most of his tweets are penned.'

In [20]:
# Single shared detector instance, reused by both predicates.
d = gender.Detector()

def _first_name_gender(name):
    """Return the detector's label for the first whitespace-separated token.

    Returns None when there is no token to classify (a non-string, empty or
    whitespace-only name), so callers never index into an empty list.
    """
    if not isinstance(name, str):
        return None
    tokens = name.split()
    if not tokens:
        return None
    return d.get_gender(tokens[0])

def is_male(name):
    """True only if the first token of `name` is labelled exactly 'male'.

    Any other label the detector returns, and names with no usable first
    token, yield False.
    """
    return _first_name_gender(name) == 'male'

def is_female(name):
    """True only if the first token of `name` is labelled exactly 'female'.

    Any other label the detector returns, and names with no usable first
    token, yield False.
    """
    return _first_name_gender(name) == 'female'

In [21]:
# Rows whose display name starts with a token classified as male.
male_mask = df['name'].apply(is_male)
df_males = df[male_mask]
print(df_males.shape[0])
df_males.head()

1816


Unnamed: 0,name,text
1125464470074142720,Jordan 🐝,We stan a flawless queen
27596224,Steve McMillan,He is simply parroting something he read whi...
1094628188335026176,Joe Murphy MAGA,…
334565594,Keith Lemon,Good job ya din’t blink!
43751161,Zeke D,i don’t understand tik tok


In [22]:
# Display names shared by more than one account among the male rows.
shared_name = df_males['name'].duplicated(keep=False)
df_males[shared_name].sort_values(by='name').head(10)

Unnamed: 0,name,text
1633856298,Adam,Wow look at my pure and wholesome sons
1630960316,Adam,It seems reasonable.
38929780,Alejandro,This song always hits
1129632855708131328,Alejandro,Down 💯🙌🏽🤙🏽
1073158514716147713,Alex,I just seen my 15 year old cousin put my 10 ye...
940692218096087047,Alex,"Legit man, we should play on d2! Oh wait.."
63766261,Alex,I thought she wanted to break the wheel...sou...
3386161936,Alex,Omg before it’s too late happy birthday queen
1095159331920715776,Alex,I’m fuckin cooler than a polar bears toe nails
17925470,Alex,So pleased to have my print framed and ready ...


In [24]:
# Rows whose display name starts with a token classified as female.
female_mask = df['name'].apply(is_female)
df_females = df[female_mask]
print(df_females.shape[0])
df_females.head()

1189


Unnamed: 0,name,text
1045196569056669704,Amanda ❤💙,Can they please get LeBron to travel back the...
774717684009369600,Yvonne Conte,So what happens when a Republican elected Pre...
334381911,Sunshine,No get rid of Mr Glass aka G. Stanton
17491245,Dina,"Well, I think that was ONE of the plots. G..."
360577036,Elizabeth Booker,Ive just sat and watched them all in one go. R...


In [25]:
# Untruncated text of the first five female rows.
df_females['text'].head(5).values

array([' Can they please get LeBron to travel back there, then? ',
       ' So what happens when a Republican elected Presidents wins both? The Dems will still scream foul!',
       ' No get rid of Mr Glass aka G. Stanton',
       '    Well, I think that was ONE of the plots. Game of Thrones has a lot of plots.',
       'Ive just sat and watched them all in one go. Really interesting.   '],
      dtype=object)

In [26]:
# Untruncated text of the first five male rows.
df_males['text'].head(5).values

array([' We stan a flawless queen',
       '  He is simply parroting something he read while on the toilet. Where most of his tweets are penned.',
       '       … ', ' Good job ya din’t blink!',
       'i don’t understand tik tok'], dtype=object)

In [33]:
# Write each subset next to the input data, with the user-id index saved as an 'id' column.
mfile = DATAPATH + 'males.csv'
ffile = DATAPATH + 'females.csv'
for frame, target in ((df_males, mfile), (df_females, ffile)):
    frame.to_csv(target, index_label='id')

In [34]:
# Copy both CSVs to XFERPATH with the shell's cp. IPython substitutes $mfile,
# $ffile and $XFERPATH from the notebook's Python variables before running it.
!cp $mfile $XFERPATH 
!cp $ffile $XFERPATH 