## CSC 177-02 Data Warehousing and Data Mining
### Mini-Project 1: Clustering
### 2016 US presedential election Twitter analysis

#### Group members: Aaron Enberg,

In [1]:
import pandas as pd


In [2]:
column_names = ['Name', 'screen_Name', 'User_ID', 
                'Followers_Count', 'Friends_Count', 
                'Location', 'Description', 'Created_At', 
                'Status_ID', 'Language', 'Place', 
                'Retweet_Count', 'Favorite_Count', 'Text']
tweets = pd.read_table('data/clinton_trump_tweets.txt', names=column_names, encoding='ISO-8859-1')
tweets.columns = tweets.columns.str.lower()

In [3]:
tweets.head()

Unnamed: 0,name,screen_name,user_id,followers_count,friends_count,location,description,created_at,status_id,language,place,retweet_count,favorite_count,text
0,Cebel,Cebel6,1519696717,132,263,"Little Rock, Arkansas",Arkansas Razorback Fan Just trying to be #Unco...,Sat Oct 29 08:10:06 EEST 2016,792232017094119425,en,,0,1,@NWAJimmy I've read it now though brother. Was...
1,Cookie,Cookiemuffen,109945090,2154,2034,The American South,Got married after college. I don't regret star...,Wed Oct 26 18:44:08 EEST 2016,791304413923213312,en,,1937,0,RT @wikileaks: New poll puts Pirate Party on c...
2,nolaguy,nolaguy_phd,1450086582,797,1188,,"An LSU Ph.D student living in New Orleans, try...",Sat Oct 29 21:53:29 EEST 2016,792439227090767872,en,,0,0,@gaystoner821 I think New Orleans spoiled me w...
3,Mark Hager,marksnark,167177185,204,448,Pittsburgh,"Hip, trendy, smart, funny, fit, lobbyist. U? B...",Wed Oct 26 00:33:20 EEST 2016,791029904733331457,en,,891,0,RT @LOLGOP: ACA needs fixes but know da facts:...
4,Capitalist Creations,aaronjhoddinott,1191022351,775,154,Canada,"Entrepreneur, startup investor, political junk...",Fri Oct 28 05:05:10 EEST 2016,791823089700962304,en,,7,0,RT @FastCompany: Alphabet shares soar on bette...


In [4]:
tweets.shape

(5250980, 14)

In [5]:
tweets.dtypes

name               object
screen_name        object
user_id             int64
followers_count     int64
friends_count       int64
location           object
description        object
created_at         object
status_id           int64
language           object
place              object
retweet_count       int64
favorite_count      int64
text               object
dtype: object

## Preprocessing

In [6]:
pattern = r'^RT'

# matches retweets and removes them
tweets = tweets[tweets.text.str.match(pattern) == False]


In [7]:
tweets.head()

Unnamed: 0,name,screen_name,user_id,followers_count,friends_count,location,description,created_at,status_id,language,place,retweet_count,favorite_count,text
0,Cebel,Cebel6,1519696717,132,263,"Little Rock, Arkansas",Arkansas Razorback Fan Just trying to be #Unco...,Sat Oct 29 08:10:06 EEST 2016,792232017094119425,en,,0,1,@NWAJimmy I've read it now though brother. Was...
2,nolaguy,nolaguy_phd,1450086582,797,1188,,"An LSU Ph.D student living in New Orleans, try...",Sat Oct 29 21:53:29 EEST 2016,792439227090767872,en,,0,0,@gaystoner821 I think New Orleans spoiled me w...
6,David Walling,davidjwalling,106568768,975,2781,"Dallas, TX",Bloodletting secure algorithms close to the bo...,Sat Oct 29 00:16:48 EEST 2016,792112907488079872,en,,0,0,#infosec #Intel #ACM #IEEE Impacts Haswell mi...
7,robert2266,robert2266,17101060,845,938,The Universe,The Dark Lord,Fri Oct 28 14:41:06 EEST 2016,791968028191711237,en,,0,0,Hacked e-mails show Clinton campaigns fears ab...
10,neddyo,neddyo,16818809,1400,379,Long Island and beyond...,You should be digging it while it's happening ...,Mon Oct 31 08:06:52 EET 2016,792971077836124160,en,,0,1,Hulk smash!


In [8]:
# match all hashtags and mentions in a tweet, ignoring possible email addresses
pattern = r'(?<=^|(?<=[^a-zA-Z0-9-\.]))@([A-Za-z_]+[A-Za-z0-9_]+)|(?<=^|(?<=[^a-zA-Z0-9-\.]))#([A-Za-z_]+[A-Za-z0-9_]+)'

""" extractall() returns a DataFrame with a MultiIndex:
    First index is our original index. Second index is "match" which is a running
    total of the number of occurences of hashtags and mentions for a particular 
    tweet. So, a match = 0 does NOT mean there are no matches, actually it's the 
    first occurence of a hashtag or mention found in the tweet (index starts from 0)  """
mention_hashtag = tweets.text.str.extractall(pattern)

In [9]:
mention_hashtag.columns = ['mentions', 'hashtags']
mention_hashtag.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,mentions,hashtags
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,NWAJimmy,
2,0,gaystoner821,
6,0,,infosec
6,1,,Intel
6,2,,ACM


In [10]:
''' Turn the MultiIndex Dataframe into a regular single index with 
    same index from training data. This is now our basket of tweets
    containing atleast one mention or hastag '''
mention_hashtag = mention_hashtag.reset_index().set_index('level_0')
del mention_hashtag.index.name

In [11]:
mention_hashtag

Unnamed: 0,match,mentions,hashtags
0,0,NWAJimmy,
2,0,gaystoner821,
6,0,,infosec
6,1,,Intel
6,2,,ACM
6,3,,IEEE
12,0,Haylie_Bre,
13,0,WayneDupreeShow,
13,1,,climatechange
22,0,tansleyemiley69,


In [12]:
''' We can find all users who have used mentions or hastags atleast 20 times
    by filtering on match column '''
mention_hashtag_20 = mention_hashtag[mention_hashtag.match >= 19]
mention_hashtag_20

Unnamed: 0,match,mentions,hashtags
2232,19,col_nj,
2232,20,,wednesday
2232,21,paulkrugman,
2232,22,BringerOfRain88,
2232,23,,DNCleak
2232,24,,BernieMustDisavow
2232,25,p_cattt,
2232,26,AndyWHumphreys,
2232,27,realDonaldTrump,
2232,28,,VoteBLUE


In [13]:
# count up all the occurences for each distinct mention
count_mentions = mention_hashtag.mentions.value_counts()
# turn the series back into a dataframe 
count_mentions = count_mentions.reset_index(name="count")
# retrieve only those mentions having atleast 20 occurences 
count_mentions.query("count >= 19")

Unnamed: 0,index,count
0,YouTube,33840
1,realDonaldTrump,31471
2,HillaryClinton,24543
3,FoxNews,11448
4,megynkelly,8515
5,CNN,8363
6,newtgingrich,4872
7,seanhannity,4510
8,c0nvey,4247
9,wikileaks,4080


In [14]:
# do the same thing for the hashtags
count_hashtags = mention_hashtag.hashtags.value_counts()
count_hashtags = count_hashtags.reset_index(name="count") 
count_hashtags.query("count >= 19")

Unnamed: 0,index,count
0,WorldSeries,9932
1,Trump,6688
2,quote,6274
3,MAGA,6074
4,news,6070
5,NowPlaying,4895
6,PJNET,4175
7,Hillary,3945
8,ImWithHer,3929
9,RallyTogether,3516


In [25]:
''' we only need to keep the users whose index is in mention_hashtag '''
result = pd.concat([tweets, mention_hashtag_20], axis=1, join_axes=[mention_hashtag_20.index])

In [26]:
result.head()

Unnamed: 0,name,screen_name,user_id,followers_count,friends_count,location,description,created_at,status_id,language,place,retweet_count,favorite_count,text,match,mentions,hashtags
2232,Colleen Allen,Colleen_Allen14,1561696464,292,1180,"Seattle, Washington",School Psychologist Social Justice Feminism ...,Sun Oct 30 23:53:14 EET 2016,792846850185236480,en,,0,1,I didnt quit Second City because of Donald Tru...,19,col_nj,
2232,Colleen Allen,Colleen_Allen14,1561696464,292,1180,"Seattle, Washington",School Psychologist Social Justice Feminism ...,Sun Oct 30 23:53:14 EET 2016,792846850185236480,en,,0,1,I didnt quit Second City because of Donald Tru...,20,,wednesday
2232,Colleen Allen,Colleen_Allen14,1561696464,292,1180,"Seattle, Washington",School Psychologist Social Justice Feminism ...,Sun Oct 30 23:53:14 EET 2016,792846850185236480,en,,0,1,I didnt quit Second City because of Donald Tru...,21,paulkrugman,
2232,Colleen Allen,Colleen_Allen14,1561696464,292,1180,"Seattle, Washington",School Psychologist Social Justice Feminism ...,Sun Oct 30 23:53:14 EET 2016,792846850185236480,en,,0,1,I didnt quit Second City because of Donald Tru...,22,BringerOfRain88,
2232,Colleen Allen,Colleen_Allen14,1561696464,292,1180,"Seattle, Washington",School Psychologist Social Justice Feminism ...,Sun Oct 30 23:53:14 EET 2016,792846850185236480,en,,0,1,I didnt quit Second City because of Donald Tru...,23,,DNCleak
