# Analysis of users' tweets to predict their gender

## 1. tweet-level pipeline

In [53]:
from toolz import pipe
import twitter
from multiprocessing import Pool
from toolz import compose

In [2]:
# Authenticates the Twitter app and builds the API client used by the
# lookup helpers below. The four credential strings are blank here and
# must be filled in with the app's keys and tokens before running.
Twitter = twitter.Api(consumer_key="",
                     consumer_secret="",
                     access_token_key="",
                     access_token_secret="")

In [3]:
def get_tweet_from_id(tweet_id, api=Twitter):
    """
    Look up a single tweet by its ID.

    tweet_id: the ID of the tweet to fetch.
    api: client object exposing ``GetStatus``; defaults to the
         module-level ``Twitter`` client.

    Returns whatever ``api.GetStatus`` returns for that ID; the request
    is made with ``trim_user=True``.
    """
    status = api.GetStatus(tweet_id, trim_user=True)
    return status
    

In [37]:
def tweet_to_text(tweet):
    """
    Return the lower-cased text of a tweet object.

    tweet: any object with a string ``text`` attribute.
    """
    raw_text = tweet.text
    return raw_text.lower()

In [5]:
def tokenize_text(text):
    """
    Break text into a list of words.

    Splitting is on runs of whitespace, so consecutive spaces, tabs and
    newlines never produce empty tokens.
    """
    words = text.split()
    return words
    

In [6]:
# Mini sample lexicon for scoring words: each listed word contributes +1 or
# -1 to a text's score. Built once at import time rather than on every call.
_LEXICON = {"the": 1, "to": 1, "and": 1,
            "in": 1, "have": 1, "it": 1,
            "be": -1, "of": -1, "a": -1,
            "that": -1, "i": -1, "for": -1}


def score_text(tokens):
    """
    Score a sequence of tokens against the sample lexicon.

    tokens: iterable of words; lookups are exact, so callers should
            lower-case the words first.

    Returns the sum of the lexicon values of the tokens. Words missing
    from the lexicon count as 0, so an empty input scores 0.
    """
    return sum(_LEXICON.get(token, 0) for token in tokens)

In [7]:
def score_tweet(tweet_id):
    """
    Run one tweet ID through the whole tweet-level pipeline.

    The stages run in order: fetch the tweet, extract its lower-cased
    text, split it into tokens, then score the tokens.
    """
    stages = (get_tweet_from_id, tweet_to_text, tokenize_text, score_text)
    return pipe(tweet_id, *stages)

## 2. user-level pipeline

In [66]:
def score_user(tweets):
    """
    Average the scores of all of a user's tweets.

    tweets: sized collection of tweet IDs. It is echoed to stdout,
            followed by a blank separator, before scoring starts.

    Returns the mean of ``score_tweet`` over the IDs. An empty
    collection raises ZeroDivisionError.
    """
    print(tweets)
    print("\n")
    # Take the count first so an unsized input fails before any lookup.
    tweet_count = len(tweets)
    tweet_scores = [score_tweet(tweet_id) for tweet_id in tweets]
    return sum(tweet_scores) / tweet_count
    

In [64]:
def categorize_user(user_score):
    """
    Turn a user's average score into a gender label.

    A score strictly greater than 0 is labelled "Male"; a score of 0 or
    below is labelled "Female".

    Returns a dict with the original score under "score" and the label
    under "gender".
    """
    label = "Male" if user_score > 0 else "Female"
    return {"score": user_score, "gender": label}

## main

In [67]:
# Some users' tweet ids as one flat list (not used by the Pool call below).
users_tweets_1 = [1056365937547534341, 1056310126255034368, 1055985345341251584,1056585873989394432, 1056585871623966720, 1055986452612419584, 1056318330037002240, 1055957256162942977, 1056585921154420736, 1056585896898805766,
                1056367465477951490]

# The same tweet ids grouped into sub-lists -- presumably one sub-list per
# user, since each sub-list is scored and categorized as a single user.
users_tweets_2 = [[1056365937547534341, 1056310126255034368], [1055985345341251584,1056585873989394432], [1056585871623966720, 1055986452612419584], [1056318330037002240, 1055957256162942977], [1056585921154420736, 1056585896898805766,
                1056367465477951490]]

# compose applies its functions right to left: score_user runs first on a
# list of tweet ids, and its average score is passed to categorize_user.
gender_prediction_pipeline = compose(categorize_user, score_user)

# Map the pipeline over the per-user lists with a multiprocessing Pool and
# print the resulting list of {"score", "gender"} dicts.
with Pool() as P:
    print(P.map(gender_prediction_pipeline, users_tweets_2))

[1056365937547534341, 1056310126255034368]


[1055985345341251584, 1056585873989394432]


[1056585871623966720, 1055986452612419584]


[1056318330037002240, 1055957256162942977]


[1056585921154420736, 1056585896898805766, 1056367465477951490]


[{'score': -0.5, 'gender': 'Female'}, {'score': 0.0, 'gender': 'Female'}, {'score': 2.0, 'gender': 'Male'}, {'score': 1.0, 'gender': 'Male'}, {'score': 0.3333333333333333, 'gender': 'Male'}]


In [47]:
# Some users' tweet ids; only the first one is walked through below.
users_tweets = [1056365937547534341, 1056310126255034368, 1055985345341251584,1056585873989394432, 1056585871623966720, 1055986452612419584, 1056318330037002240, 1055957256162942977, 1056585921154420736, 1056585896898805766,
                1056367465477951490]

# Step 1: fetch the first tweet and show the raw tweet object.
tweet = get_tweet_from_id(users_tweets[0])
print(tweet)
print("**********\n")

# Step 2: extract the lower-cased text.
tweet_text = tweet_to_text(tweet)
print(tweet_text)
print("**********\n")

# Step 3: split the text into whitespace-separated tokens.
tweet_tokenized = tokenize_text(tweet_text)
print(tweet_tokenized)
print("**********\n")


# Step 4: score the tokens against the sample lexicon.
tweet_score = score_text(tweet_tokenized)
print(tweet_score)

# NOTE: unfinished experiment, left commented out -- `Pool` is not
# instantiated and `score` is not defined anywhere in the visible code.
# with Pool as P:
#     print(P.map(score))

{"created_at": "Sun Oct 28 02:04:00 +0000 2018", "favorite_count": 25729, "hashtags": [], "id": 1056365937547534341, "id_str": "1056365937547534341", "lang": "en", "media": [{"display_url": "pic.twitter.com/7CIhHTAUEA", "expanded_url": "https://twitter.com/dodo/status/1056365937547534341/video/1", "id": 989875325545857024, "media_url": "http://pbs.twimg.com/media/Dby-qeEVMAAc0AU.jpg", "media_url_https": "https://pbs.twimg.com/media/Dby-qeEVMAAc0AU.jpg", "sizes": {"large": {"h": 1080, "resize": "fit", "w": 1080}, "medium": {"h": 1080, "resize": "fit", "w": 1080}, "small": {"h": 680, "resize": "fit", "w": 680}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "type": "video", "url": "https://t.co/7CIhHTAUEA", "video_info": {"aspect_ratio": [1, 1], "duration_millis": 19820, "variants": [{"bitrate": 1280000, "content_type": "video/mp4", "url": "https://video.twimg.com/amplify_video/989875325545857024/vid/720x720/4Vf67lGbz63mvoR8.mp4?tag=2"}, {"bitrate": 320000, "content_type": "video/mp4"

In [26]:
# Fetch and print every tweet in the list, each one preceded by a banner
# carrying its position and followed by a closing banner.

for i, tweet_id in enumerate(users_tweets):
    tweet = get_tweet_from_id(tweet_id)
    print("********** {}\n".format(i))
    print(tweet)
    print("**********\n")
                                  

********** 0

{"created_at": "Sun Oct 28 02:04:00 +0000 2018", "favorite_count": 25729, "hashtags": [], "id": 1056365937547534341, "id_str": "1056365937547534341", "lang": "en", "media": [{"display_url": "pic.twitter.com/7CIhHTAUEA", "expanded_url": "https://twitter.com/dodo/status/1056365937547534341/video/1", "id": 989875325545857024, "media_url": "http://pbs.twimg.com/media/Dby-qeEVMAAc0AU.jpg", "media_url_https": "https://pbs.twimg.com/media/Dby-qeEVMAAc0AU.jpg", "sizes": {"large": {"h": 1080, "resize": "fit", "w": 1080}, "medium": {"h": 1080, "resize": "fit", "w": 1080}, "small": {"h": 680, "resize": "fit", "w": 680}, "thumb": {"h": 150, "resize": "crop", "w": 150}}, "type": "video", "url": "https://t.co/7CIhHTAUEA", "video_info": {"aspect_ratio": [1, 1], "duration_millis": 19820, "variants": [{"bitrate": 1280000, "content_type": "video/mp4", "url": "https://video.twimg.com/amplify_video/989875325545857024/vid/720x720/4Vf67lGbz63mvoR8.mp4?tag=2"}, {"bitrate": 320000, "content_type