In [227]:
import numpy as np
import pandas as pd
import re
import uuid

import matplotlib.pyplot as plt
import matplotlib as mpl

import seaborn as sns
sns.set()

%matplotlib inline

***
# Initialize and perform base level cleaning #

In [99]:
# Default location of the cleaned tweets CSV shared by the helpers below.
cleaned_csv_location = '../cleaned_tweets.csv'


def save_cleaned_df(df, path=None):
    """Write the cleaned tweets frame to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist; its index is written as the first CSV column.
    path : str or path-like, optional
        Destination file. When omitted, ``cleaned_csv_location`` is used
        (looked up at call time, so reassigning the global still works).
    """
    df.to_csv(cleaned_csv_location if path is None else path)


def read_cleaned_df(path=None):
    """Load a CSV written by ``save_cleaned_df``, indexed by the ``id`` column.

    ``tweet_id`` is read back as object (string) so that it matches the dtype
    used when the raw data is loaded; letting pandas infer a numeric dtype
    turns the column into float64 as soon as one id is missing, which
    silently corrupts the long ids.

    Parameters
    ----------
    path : str or path-like, optional
        Source file. When omitted, ``cleaned_csv_location`` is used.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(
        cleaned_csv_location if path is None else path,
        index_col='id',
        dtype={'tweet_id': object},
    )

In [205]:
# tweet_id needs to be read as an object (string) column because pandas
# otherwise parses it as a number and shows it in scientific notation.
# The builtin `object` is used because the `np.object` alias no longer exists
# in current NumPy releases.
russia_df = pd.read_csv('../tweets.csv', dtype={'tweet_id': object})

# Parse the timestamp and shift it from UTC to EST using a fixed -5h offset.
# NOTE(review): a fixed offset does not follow daylight saving time — confirm
# that plain EST (not US Eastern local time) is what is wanted.
russia_df['created_datetime'] = (
    pd.to_datetime(russia_df['created_str']) - pd.Timedelta(hours=5)
)

# Drop columns we won't use. 'hashtags' and 'mentions' are parsed out of the
# tweet text later in this notebook.
russia_df = russia_df.drop(
    columns=['created_str', 'created_at', 'posted', 'hashtags', 'mentions']
)

# There are 21 null text tweets. Delete for now, but may come back and look later
russia_df = russia_df.dropna(subset=['text'])

# Give every remaining tweet a random UUID and use it as the index.
# These ids are regenerated on each run of this cell.
russia_df['id'] = [uuid.uuid4() for _ in range(len(russia_df))]
russia_df = russia_df.set_index('id')
save_cleaned_df(russia_df)

russia_df.sample(5)

Unnamed: 0_level_0,user_id,user_key,retweet_count,retweeted,favorite_count,text,tweet_id,source,expanded_urls,retweeted_status_id,in_reply_to_status_id,created_datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
b61b5d8f-b063-41d6-bb75-bb23337deac6,1671235000.0,hyddrox,0.0,False,0.0,RT @luvGodncountry: WHERE WERE THEY AGAINST OB...,784115860910776321,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",[],7.841117e+17,,2016-10-06 14:39:24
9c21de1f-2cea-4130-9f0c-254f029b00f7,1659754000.0,karenparker93,0.0,False,0.0,RT @Just_A_Bill_: 'Beyond Tolerance': The Delu...,777965721171156992,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","[""http://www.frontpagemag.com/fpm/264223/beyon...",7.779638e+17,,2016-09-19 15:20:56
d4e1f5c4-ae12-4068-932d-0f351a574c02,1657754000.0,johnbranchh,,,,"These words arrive here in an instant, from so...",556044484098539521,,[],,,2015-01-16 06:05:10
99f4b3c9-a101-489f-9555-5d10b157a106,2882014000.0,giselleevns,0.0,False,0.0,RT @HassanMoKhan: Trump gets a good deal on a ...,765917879971840000,"<a href=""https://about.twitter.com/products/tw...",[],7.65917e+17,,2016-08-17 09:27:07
113ddc8e-cb66-4ddb-a95c-e6398ce2dfaf,1680366000.0,willisbonnerr,,,,RT @FreeLabelNet: #NP: RuBezzal108 - Ru Bezzal...,818782589729337345,,[],,,2017-01-10 06:32:36


***
# Extract hashtags #

In [206]:
# Keep only tweets whose text contains at least one hashtag.
# The pattern is a raw string so that \w is passed to the regex engine
# untouched instead of being flagged as an invalid Python escape sequence.
tweets_with_hashtags = russia_df[russia_df['text'].str.contains(r'#\w+')]

print(tweets_with_hashtags.shape[0])
tweets_with_hashtags.sample(2)

100651


Unnamed: 0_level_0,user_id,user_key,retweet_count,retweeted,favorite_count,text,tweet_id,source,expanded_urls,retweeted_status_id,in_reply_to_status_id,created_datetime
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
e3bf78c4-9586-4c31-9b34-89aeb904bb68,1629828000.0,cassishere,0.0,False,0.0,RT @VoteTrumpMAGA: .\nA New Government...\n\n#...,777501002077392896,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",[],7.774676e+17,,2016-09-18 08:34:18
49ae5739-5257-4bab-b67e-3c5ee4ce5cb8,2882014000.0,giselleevns,,,,"#MakeMeHateYouInOnePhrase no, you can't leave ...",788006852474892288,,[],,,2016-10-17 08:20:48


In [207]:
# One row per hashtag occurrence in the tweet text. The pattern is a raw
# string so that \w is not treated as an (invalid) Python escape sequence.
hash_df = russia_df['text'].str.extractall(r'(#\w+)')

In [208]:
# Flatten the index: keep the tweet 'id' as the index and discard the
# 'match' counter column, leaving one hashtag per row.
hash_df = hash_df.reset_index().set_index('id').drop(columns=['match'])

# Persist the id -> hashtag table.
hash_df.to_csv('../csvs/hashtags.csv')

In [209]:
# Ten most frequently used hashtags (column 0 holds the extracted hashtag).
hash_df[0].value_counts().iloc[:10]

#politics             3638
#tcot                 2839
#MAGA                 2538
#PJNET                2147
#news                 2046
#Trump                1851
#Merkelmussbleiben    1108
#TrumpForPresident    1088
#WakeUpAmerica        1061
#NeverHillary          976
Name: 0, dtype: int64

***
# Extract mentions #

In [210]:
# One row per @mention found in the tweet text, indexed by tweet 'id'.
# The pattern is a raw string so that \w is not treated as an (invalid)
# Python escape sequence. The frame is built in a single chain from
# russia_df, so re-running this cell gives the same result.
mentions_df = (
    russia_df['text']
    .str.extractall(r'(@\w+)')
    .reset_index()
    .set_index('id')
    .drop(columns=['match'])
)

# Persist the id -> mention table.
mentions_df.to_csv('../csvs/mentions.csv')

In [211]:
# Ten most frequently mentioned accounts (column 0 holds the extracted mention).
mentions_df[0].value_counts().iloc[:10]

@realDonaldTrump    4567
@midnight           2584
@HillaryClinton     2343
@blicqer            2238
@Conservatexian     1105
@POTUS               909
@FoxNews             744
@YouTube             557
@PrisonPlanet        554
@nine_oh             538
Name: 0, dtype: int64

***
# source #

**'source' is the client that posted the tweet (not consistently populated)**

In [233]:
# Five most common tweet clients. value_counts() already orders by frequency,
# so the frame does not need to be sorted by 'source' first; only the relative
# order of clients with exactly equal counts could differ from a pre-sorted run.
russia_df['source'].value_counts().head(5)

<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                     42685
<a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a>                         6926
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>     6409
<a href="http://twibble.io" rel="nofollow">Twibble.io</a>                               1491
<a href="http://dlvr.it" rel="nofollow">dlvr.it</a>                                      243
Name: source, dtype: int64

***
# Extract URLs #

In [121]:
# Tweets whose text contains the substring 'https'.
# NOTE(review): links written with plain 'http://' are not matched by this
# pattern — confirm that is intended.
tweets_with_links = russia_df.loc[
    russia_df['text'].str.contains('https')
]

tweets_with_links.shape

(107267, 14)

In [122]:
# Eyeball the raw text of five randomly chosen tweets that contain a link.
tweets_with_links.loc[:, 'text'].sample(n=5).values

array([ 'RT @ComplexMusic: Exclusive: @ImmortalTech opens up about his relationship with @Lin_Manuel and high school bullying https://t.co/9ZDmia9ll…',
       "RT @DRUDGE_REPORT: SUPPORTERS GATHER FOR 'MARCH 4 TRUMP' RALLIES AROUND USA... https://t.co/XTWJp4wWTP",
       'RT @blackcat9508: She can always play a live action Miss Piggy. Barbie should have been Taylor Swift🐷 https://t.co/Hd0ishNgCt',
       'RT @DarkVoid255: MT @iamgavinjames: This man was destroyed and disowned for far less - yet @HillaryClinton walks? https://t.co/owbyE5lpwD #…',
       "&amp; don't expect me to follow what your religion requires from you! Not gonna happen. \n#Trump2016\n#IslamIsTheProblem https://t.co/wUGBEeIDkt"], dtype=object)

In [128]:
#url_regex = '(https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*))'
#tweets_with_links['text'].str.extractall(url_regex, expand=True);