In [None]:
import pandas as pd
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

# Show full cell contents (tweet text) without truncation.
# None is the supported "no limit" value; the legacy -1 sentinel is deprecated
# and rejected by recent pandas releases.
pd.set_option('display.max_colwidth', None)

**Read in file**

In [None]:
# Column labels for the header-less, tab-separated tweet file.
column_names = [
    'Name',
    'ScreenName',
    'UserID',
    'FollowersCount',
    'FriendsCount',
    'Location',
    'Description',
    'CreatedAt',
    'StatusID',
    'Language',
    'Place',
    'RetweetCount',
    'FavoriteCount',
    'Text',
]
tweets = pd.read_csv(
    'clinton_trump_tweets.txt',
    sep="\t",
    encoding="ISO-8859-1",
    header=None,
    names=column_names,
)
# Preview the first rows of the raw data.
tweets.head()

**Filter out retweets**
Tweets that start with 'RT'

In [None]:
#1.1 filter retweets
# na=False treats a missing Text as "not a retweet". Without it the mask
# contains NaN for such rows and cannot be negated with ~.
tweets = tweets[~tweets.Text.str.startswith("RT", na=False)]

In [None]:
def keepHandlesAndMention(text):
    """Return only the @handles and #hashtags of a tweet, space-separated.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing field) is treated as a tweet with no handles or hashtags.

    Returns
    -------
    str
        The whitespace-delimited words that start with '@' or '#' and are
        longer than one character, in their original order, joined by single
        spaces. Empty string when there are none.
    """
    # Missing values arrive as float NaN / None and have no .split().
    if not isinstance(text, str):
        return ''
    return ' '.join(
        word
        for word in text.split()
        # len > 1 drops a bare '@' or '#' that carries no name.
        if len(word) > 1 and word.startswith(('@', '#'))
    )
# Add the column via assign() so a new frame is produced. `tweets` is the
# result of a boolean filter, and setting a column on it in place can raise
# SettingWithCopyWarning.
tweets = tweets.assign(TextMentions=tweets.Text.apply(keepHandlesAndMention))

**Filter out tweets that have no mention/hashtag appearing at least 20 times**

In [None]:
# Flatten every tweet's space-separated handles/hashtags into one list.
allHandles = [
    handle
    for mentions in tweets.TextMentions
    for handle in mentions.split()
]
# Occurrence count of each handle/hashtag.
top_hash = pd.Series(allHandles).value_counts()
# Despite the name, this keeps every tag seen at least 20 times (not a top-20 cut).
top20 = top_hash[top_hash >= 20]
top20List = top20.index.tolist()


In [None]:
# tweets[tweets.TextMentions.str.map(lambda item: item.split(',').foreach(x: x in ))]
def isInList(list1, list2):
    """Return True when at least one item of list1 is contained in list2.

    Returns False when list1 is empty or the two share no items.
    """
    return any(item1 in list2 for item1 in list1)
# Keep tweets that contain at least one frequently used handle/hashtag.
# TextMentions is a space-joined string, so it must be split back into tags;
# iterating over the string itself compares single characters against
# top20List and never matches.
frequentTags = set(top20List)  # set gives O(1) membership checks
tweets = tweets[
    tweets.TextMentions.apply(
        lambda x: any(tag in frequentTags for tag in x.split())
    )
]


In [None]:
#1.1 remove tweets that don't have any @ or #
#tweets["CleanText"] = tweets.Text.str.extract('((@|#)\w+)')
#use this to get all @ # from tweet, extract just gets first occurrence
#.groupby(level=0)[0].apply(' '.join)
# tweets = tweets[tweets.CleanText.notnull()]
# tweets.head()

**Filter out users that have fewer than 20 tweets**

In [None]:
#1.2 keep tweets where UID appears 20+ times
tweets = (
    tweets
    .groupby("UserID")
    .filter(lambda user_tweets: user_tweets.shape[0] >= 20)
)

**Plot top 30 locations**

In [None]:
#2.1
# Restrict to tweets whose Location is among the 30 most frequent values.
tweetsTop30Locations = tweets[
    tweets.Location.isin(
        tweets.Location.value_counts().nlargest(30).index.tolist()
    )
]

In [None]:
# Bar chart of the number of tweets per Location among the top-30 locations.
tweetsTop30Locations.groupby("Location").Location.count().plot.bar()

**Make word cloud of the top 3 locations**

In [None]:
# Tweets whose Location is one of the 3 most frequent values.
tweetsTop3Locations = tweets[
    tweets.Location.isin(
        tweets.Location.value_counts().nlargest(3).index.tolist()
    )
]
# All of their tweet texts as one space-separated string for the word cloud.
text = " ".join(tweetsTop3Locations.Text)


In [None]:
# Stopwords: the wordcloud defaults plus extra filler words for this corpus.
stopwords = set(STOPWORDS) | {
    "go", "ye", "thing", "well", "big", "us", "great", "https",
    "still", "need", "co", "one", "will", "Thank", "know", "going",
    "lol", "good", "take", "even", "really", "now",
}

# Build the word cloud from the combined tweet text.
wordcloud = WordCloud(stopwords=stopwords).generate(text)

# Render the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()