In [278]:
import numpy as np
import pandas as pd
import re
import uuid
import requests

import matplotlib.pyplot as plt
import matplotlib as mpl

import seaborn as sns
sns.set()

%matplotlib inline

create_new_df = False

***
# Initialize and perform base level cleaning #

In [239]:
cleaned_csv_location = '../cleaned_tweets.csv'
def save_cleaned_df(df):
    df.to_csv(cleaned_csv_location)
    
def read_cleaned_df():    
    return pd.read_csv(cleaned_csv_location, index_col='id')

In [240]:
if create_new_df:
    #tweet_id needs to be an object because pandas tries to convert it to scientific notation
    russia_df = pd.read_csv('../tweets.csv', dtype={'tweet_id': np.object})

    russia_df['created_datetime'] = pd.to_datetime(russia_df['created_str'])
    # converts from UTC to EST
    russia_df['created_datetime'] = russia_df['created_datetime'] - pd.Timedelta(hours=5)
    # Make the created_datetime the index
    #russia_df.set_index('created_datetime', inplace=True)

    # Delete columns we wont use
    del russia_df['created_str']
    del russia_df['created_at']
    del russia_df['posted']
    # I parse these out myself later in this notebook
    del russia_df['hashtags']
    del russia_df['mentions']

    # There are 21 null text tweets. Delete for now, but amy come back and look later
    russia_df = russia_df[~russia_df['text'].isnull()]

    russia_df['id'] = [uuid.uuid4() for _ in range(0,russia_df.shape[0])]
    russia_df.set_index('id',inplace=True)
    save_cleaned_df(russia_df)

    russia_df.sample(5)
else:
    russia_df = read_cleaned_df()

***
# Extract hashtags #

In [241]:
tweets_with_hashtags = russia_df[russia_df['text'].str.contains('#\w+')]

print(tweets_with_hashtags.shape[0])
tweets_with_hashtags.sample(2)

100651


Unnamed: 0_level_0,user_id,user_key,retweet_count,retweeted,favorite_count,text,tweet_id,source,expanded_urls,retweeted_status_id,in_reply_to_status_id,created_datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
552b9190-e867-47a2-b1cc-2ccea5971d26,2591848000.0,todaybostonma,0.0,False,0.0,The politics behind Obama's Christian Siriano ...,7.58037e+17,"<a href=""http://twitterfeed.com"" rel=""nofollow...",[],,,2016-07-26 15:31:27
95967fff-2596-4a19-9ca6-84b44b311cc8,2531160000.0,traceyhappymom,,,,RT @Vapo_Rob: Democracy tainted forever. #201...,7.956458e+17,,[],,,2016-11-07 10:15:24


In [207]:
hash_df = russia_df['text'].str.extractall('(#\w+)')

In [208]:
hash_df = hash_df.reset_index().set_index('id')
del hash_df['match']

hash_df.to_csv('../csvs/hashtags.csv')

In [209]:
hash_df[0].value_counts().head(10)

#politics             3638
#tcot                 2839
#MAGA                 2538
#PJNET                2147
#news                 2046
#Trump                1851
#Merkelmussbleiben    1108
#TrumpForPresident    1088
#WakeUpAmerica        1061
#NeverHillary          976
Name: 0, dtype: int64

***
# Extract mentions #

In [210]:
mentions_df = russia_df['text'].str.extractall('(@\w+)')
mentions_df = mentions_df.reset_index().set_index('id')
del mentions_df['match']

mentions_df.to_csv('../csvs/mentions.csv')

In [211]:
mentions_df[0].value_counts().head(10)

@realDonaldTrump    4567
@midnight           2584
@HillaryClinton     2343
@blicqer            2238
@Conservatexian     1105
@POTUS               909
@FoxNews             744
@YouTube             557
@PrisonPlanet        554
@nine_oh             538
Name: 0, dtype: int64

***
# source #

** 'source' is the client that performed the tweet (not consistently stamped) **

In [233]:
russia_df.sort_values('source', ascending=False)['source'].value_counts()[:5]

<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                     42685
<a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a>                         6926
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>     6409
<a href="http://twibble.io" rel="nofollow">Twibble.io</a>                               1491
<a href="http://dlvr.it" rel="nofollow">dlvr.it</a>                                      243
Name: source, dtype: int64

***
# Extract URLs #
This doesn't get all of the urls, but it gets most of them. Some are truncated (seems to be the case for RTs)

In [269]:
url_regex = '(https?:\/\/t.co\/[a-zA-Z0-9\-\.]{8})'

url_df = russia_df['text'].str.extractall(url_regex)
url_df = url_df.reset_index().set_index('id')
del url_df['match']

url_df.to_csv('../csvs/urls.csv')

In [270]:
url_df[0].value_counts().head(10)

https://t.co/eMX9JgTv    269
https://t.co/aXQaNDGl    163
https://t.co/iChLzWqg    146
https://t.co/rRZgBcCx    145
https://t.co/XDBaPnHm    127
https://t.co/telXZBWP    123
https://t.co/rRZgBcU8    114
https://t.co/yX3iMOFI    104
https://t.co/1KPXto2H     99
https://t.co/cdnQqWB3     98
Name: 0, dtype: int64

In [None]:
unwrapped_urls = url_df[0].apply(lambda x: requests.get(x).url)

In [277]:
urllib.urlopen('https://t.co/aXQaNDGl').url

AttributeError: module 'urllib' has no attribute 'urlopen'