# Can we predict a person's gender based on their tweet text?

Steps:
1. Pull twitter data 
2. Store into dataframe
3. Classify male/female
4. Run model
5. Test with real tweet

In [1]:
import numpy as np
import pandas as pd
import tweepy as tw
import json
import pprint

In [2]:
path_auth = '/Users/allenj/Documents/Keys/auth_twitter.json'
auth = json.loads(open(path_auth).read())
pp = pprint.PrettyPrinter(indent=4)

my_consumer_key = auth['my_consumer_key']
my_consumer_secret = auth['my_consumer_secret']
my_access_token = auth['your_access_token']
my_access_token_secret = auth['my_access_token_secret']

In [3]:
auth = tw.OAuthHandler(my_consumer_key, my_consumer_secret)
auth.set_access_token(my_access_token, my_access_token_secret)
api = tw.API(auth)

In [4]:
type(api)

tweepy.api.API

In [14]:
# Get tweets
tweets = api.user_timeline(screen_name="HillaryClinton", count=5, include_rts=False)
for tweet in tweets:
    print(tweet.text)

Both vulnerable communities and those in positions of power are indebted to the activists who push for progress. La… https://t.co/a1acpcEiin
We can protect public health and our democracy. 

We'll do it by filing lawsuits now to expand and safeguard vote b… https://t.co/2dRd3hu5bK
Voting by mail has long been the norm in states like Oregon, Washington, &amp; Utah. It's easy and secure. 

Trump hims… https://t.co/D5GmENkroI
1. Provide free or prepaid postage so no one has to pay to vote.

2. Count ballots postmarked on or before Election… https://t.co/AsZesb80Ex
We have the data to guide us not only to implement voting by mail, but to make it secure and effective.

It comes d… https://t.co/MAgCyvT5Mv


In [16]:
tweets = api.user_timeline(screen_name="BarackObama", count=5, include_rts=False)
for tweet in tweets:
    print(tweet.text)

If you believe in a more just, more generous, more democratic America, now is the time to fight for it. Here’s a te… https://t.co/wjDlgys1Uc
On Memorial Day, we honor those who gave all for us. That takes different forms this year, but it’s even more vital… https://t.co/wUBXJWWoKs
And here’s more on the approach Sweden has taken, which differs from some of its neighbors: 

https://t.co/Qw5R0O5RhM
South Korea has focused on testing to guard against outbreaks:

https://t.co/51h11Pb3HZ
As all 50 states begin the process of reopening, here are three articles that offer some lessons from other countri… https://t.co/k6xeoaH9zm


In [62]:
# Define search term
search_words = "#wildfires"
date_since = "2018-11-16"

# Collect tweets
tweets = tw.Cursor(api.search,
              q=search_words,
              lang="en",
              since=date_since).items(5)
tweets

<tweepy.cursor.ItemIterator at 0x1213f7690>

In [63]:
# Iterate and print tweets
for tweet in tweets:
    print(tweet.text)

RT @Univ_inenglish: The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican author…
Looking for family projects as you #StayAtHome? Here are 30 #EmergencyPreparedness tips for #hurricanes, #tornadoes… https://t.co/4hOHdo2OL2
The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican au… https://t.co/2DNIbzQrk6
RT @tamraraven: #10km3x2 #COVID19 DEAD 100,000 #exponentialGROWTH #K12 #climateChange #wildfires destroyslocal #biodiversity YOU #WorkFromH…
#10km3x2 #COVID19 DEAD 100,000 #exponentialGROWTH #K12 #climateChange #wildfires destroyslocal #biodiversity YOU… https://t.co/TeMQIwnpP1


In [65]:
# Collect tweets
tweets = tw.Cursor(api.search,
                       q=search_words,
                       lang="en",
                       since=date_since).items(5)

# Collect a list of tweets
[tweet.text for tweet in tweets]

['RT @Univ_inenglish: The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican author…',
 'Looking for family projects as you #StayAtHome? Here are 30 #EmergencyPreparedness tips for #hurricanes, #tornadoes… https://t.co/4hOHdo2OL2',
 'The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican au… https://t.co/2DNIbzQrk6',
 'RT @tamraraven: #10km3x2 #COVID19 DEAD 100,000 #exponentialGROWTH #K12 #climateChange #wildfires destroyslocal #biodiversity YOU #WorkFromH…',
 '#10km3x2 #COVID19 DEAD 100,000 #exponentialGROWTH #K12 #climateChange #wildfires destroyslocal #biodiversity YOU… https://t.co/TeMQIwnpP1']

In [80]:
# Collect into dataframe
tweets = tw.Cursor(api.search, 
                           q=search_words,
                           lang="en",
                           since=date_since).items(5)

users_text = [[tweet.user.screen_name, tweet.text] for tweet in tweets]
users_text

[['AntheralHazel',
  'RT @Univ_inenglish: The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican author…'],
 ['El_Universal_Mx',
  'RT @Univ_inenglish: The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican author…'],
 ['cassandra17lina',
  'Looking for family projects as you #StayAtHome? Here are 30 #EmergencyPreparedness tips for #hurricanes, #tornadoes… https://t.co/4hOHdo2OL2'],
 ['Univ_inenglish',
  'The best way to fight a #fire is with prevention since 98% of them are caused by human activity. Today, #Mexican au… https://t.co/2DNIbzQrk6'],
 ['jc_james_clark',
  'RT @tamraraven: #10km3x2 #COVID19 DEAD 100,000 #exponentialGROWTH #K12 #climateChange #wildfires destroyslocal #biodiversity YOU #WorkFromH…']]

In [81]:
tweet_text = pd.DataFrame(data=users_text, 
                    columns=['user', "text"])
tweet_text

Unnamed: 0,user,text
0,AntheralHazel,RT @Univ_inenglish: The best way to fight a #f...
1,El_Universal_Mx,RT @Univ_inenglish: The best way to fight a #f...
2,cassandra17lina,Looking for family projects as you #StayAtHome...
3,Univ_inenglish,The best way to fight a #fire is with preventi...
4,jc_james_clark,"RT @tamraraven: #10km3x2 #COVID19 DEAD 100,000..."


In [73]:
# Now do it based off of users

In [79]:
# Collect tweets
tweets = api.user_timeline(screen_name="BarackObama", count=5, include_rts=False)
tweets

[Status(_api=<tweepy.api.API object at 0x120c8c590>, _json={'created_at': 'Wed May 27 15:00:15 +0000 2020', 'id': 1265659084524728321, 'id_str': '1265659084524728321', 'text': 'If you believe in a more just, more generous, more democratic America, now is the time to fight for it. Here’s a te… https://t.co/wjDlgys1Uc', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/wjDlgys1Uc', 'expanded_url': 'https://twitter.com/i/web/status/1265659084524728321', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 813286, 'id_str': '813286', 'name': 'Barack Obama', 'screen_name': 'BarackObama', 'location': 'Washington, DC', 'description': 'Dad, husband, Presi

In [77]:
# Iterate and print tweets
for tweet in tweets:
    print(tweet.text)

If you believe in a more just, more generous, more democratic America, now is the time to fight for it. Here’s a te… https://t.co/wjDlgys1Uc
On Memorial Day, we honor those who gave all for us. That takes different forms this year, but it’s even more vital… https://t.co/wUBXJWWoKs
And here’s more on the approach Sweden has taken, which differs from some of its neighbors: 

https://t.co/Qw5R0O5RhM
South Korea has focused on testing to guard against outbreaks:

https://t.co/51h11Pb3HZ
As all 50 states begin the process of reopening, here are three articles that offer some lessons from other countri… https://t.co/k6xeoaH9zm


In [78]:
# Collect a list of tweets
tweets = api.user_timeline(screen_name="BarackObama", count=5, include_rts=False)

[tweet.text for tweet in tweets]

['If you believe in a more just, more generous, more democratic America, now is the time to fight for it. Here’s a te… https://t.co/wjDlgys1Uc',
 'On Memorial Day, we honor those who gave all for us. That takes different forms this year, but it’s even more vital… https://t.co/wUBXJWWoKs',
 'And here’s more on the approach Sweden has taken, which differs from some of its neighbors: \n\nhttps://t.co/Qw5R0O5RhM',
 'South Korea has focused on testing to guard against outbreaks:\n\nhttps://t.co/51h11Pb3HZ',
 'As all 50 states begin the process of reopening, here are three articles that offer some lessons from other countri… https://t.co/k6xeoaH9zm']

In [5]:
tweets = api.user_timeline(screen_name="BarackObama", count=10, include_rts=False)
users_text = [[tweet.user.screen_name, tweet.text] for tweet in tweets]
tweet_text = pd.DataFrame(data=users_text, 
                    columns=['user', "text"])
tweet_text

Unnamed: 0,user,text
0,BarackObama,"If you believe in a more just, more generous, ..."
1,BarackObama,"On Memorial Day, we honor those who gave all f..."
2,BarackObama,And here’s more on the approach Sweden has tak...
3,BarackObama,South Korea has focused on testing to guard ag...
4,BarackObama,As all 50 states begin the process of reopenin...
5,BarackObama,The Class of 2020 is full of the leaders we ne...
6,BarackObama,"As Chicago navigates the health crisis, its re..."
7,BarackObama,"Congrats to the high school Class of 2020, as ..."
8,BarackObama,Congratulations to the HBCU Class of 2020! Mic...


In [8]:
# Screen names of most popular twitter accounts with gender
# 0 = male, 1 = female
users = [['jimmyfallon',0],
         ['shakira',1],
         ['ddlovato',1],
         ['britneyspears',1],
         ['narendramodi',0],
         ['selenagomez',1],
         ['jtimberlake',0],
         ['kimkardashian',1],
         ['arianagrande',1],
         ['realdonaldtrump',0],
         ['theellenshow',1],
         ['ladygaga',1],
         ['cristiano',0],
         ['taylorswift13',1],
         ['rihanna',1],
         ['katyperry',1],
         ['justinbieber',0],
         ['barackobama',0]]

In [61]:
# Convert user list into dataframe
users_df = pd.DataFrame(users, columns=["user", "gender"])
user_only = users_df["user"]
user_only

0         jimmyfallon
1             shakira
2            ddlovato
3       britneyspears
4        narendramodi
5         selenagomez
6         jtimberlake
7       kimkardashian
8        arianagrande
9     realdonaldtrump
10       theellenshow
11           ladygaga
12          cristiano
13      taylorswift13
14            rihanna
15          katyperry
16       justinbieber
17        barackobama
Name: user, dtype: object

In [62]:
for i in user_only: 
    print(i)

jimmyfallon
shakira
ddlovato
britneyspears
narendramodi
selenagomez
jtimberlake
kimkardashian
arianagrande
realdonaldtrump
theellenshow
ladygaga
cristiano
taylorswift13
rihanna
katyperry
justinbieber
barackobama


In [72]:
def get_tweets(users):
    list = []
    for i in users:
    get_tweets = api.user_timeline(screen_name=i, count=10, include_rts=False)
    
print(get_tweets(user_only))
    
    

IndentationError: expected an indented block (<ipython-input-72-d938c20f950b>, line 4)

In [70]:
def start(B):
    list = []
    for w in B:
        content = w
        words = content.lower().split()
        for each_word in words:
            
            wordlist.append(each_word)
            
            print(wordlist)
            
B=['Hello Bye Poop']
start(B)

NameError: name 'wordlist' is not defined

In [64]:
tweets = api.user_timeline(screen_name=i, count=10, include_rts=False)
tweets

[Status(_api=<tweepy.api.API object at 0x1141d79d0>, _json={'created_at': 'Wed May 27 15:00:15 +0000 2020', 'id': 1265659084524728321, 'id_str': '1265659084524728321', 'text': 'If you believe in a more just, more generous, more democratic America, now is the time to fight for it. Here’s a te… https://t.co/wjDlgys1Uc', 'truncated': True, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [{'url': 'https://t.co/wjDlgys1Uc', 'expanded_url': 'https://twitter.com/i/web/status/1265659084524728321', 'display_url': 'twitter.com/i/web/status/1…', 'indices': [117, 140]}]}, 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 813286, 'id_str': '813286', 'name': 'Barack Obama', 'screen_name': 'BarackObama', 'location': 'Washington, DC', 'description': 'Dad, husband, Presi

In [45]:
# loop over user dataframes 
for i in users_df["user"]:
    print(i)

tweets = api.user_timeline(screen_name=i, count=10, include_rts=False)
users_text = [[tweet.user.screen_name, tweet.text] for tweet in tweets]
tweet_text = pd.DataFrame(data=users_text, 
                    columns=['user', "text"])
tweet_text

jimmyfallon
shakira
ddlovato
britneyspears
narendramodi
selenagomez
jtimberlake
kimkardashian
arianagrande
realdonaldtrump
theellenshow
ladygaga
cristiano
taylorswift13
rihanna
katyperry
justinbieber
barackobama


Unnamed: 0,user,text
0,BarackObama,"If you believe in a more just, more generous, ..."
1,BarackObama,"On Memorial Day, we honor those who gave all f..."
2,BarackObama,And here’s more on the approach Sweden has tak...
3,BarackObama,South Korea has focused on testing to guard ag...
4,BarackObama,As all 50 states begin the process of reopenin...
5,BarackObama,The Class of 2020 is full of the leaders we ne...
6,BarackObama,"As Chicago navigates the health crisis, its re..."
7,BarackObama,"Congrats to the high school Class of 2020, as ..."
8,BarackObama,Congratulations to the HBCU Class of 2020! Mic...


In [None]:
# Create a function that collects the most recent 100 tweets from user list and save to dataframe
# Add additional column in dataframe with gender classification, either 0 or 1

def get_tweets(users, ):
                




In [124]:
users = [['jimmyfallon',0],
 ['shakira',1],
 ['ddlovato',1],
 ['britneyspears',1]]

In [130]:
users = pd.DataFrame

In [133]:
users[0]

TypeError: 'type' object is not subscriptable

In [None]:

 'jimmyfallon',0
 'shakira',1
 'ddlovato',1
 'britneyspears',1
 'narendramodi',0
 'selenagomez',1
 'jtimberlake',0
 'kimkardashian',1
 'arianagrande',1
 'realdonaldtrump',0
 'theellenshow',1
 'ladygaga',1
 'cristiano',0
 'taylorswift13',1
 'rihanna',1
 'katyperry',1
 'justinbieber',0
 'barackobama',0