### Imports

In [118]:
import tweepy
from pymongo import MongoClient
import json
import datetime
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from collections import Counter
import networkx as nx
import itertools

### connect to mongodb

In [2]:
connection = MongoClient('localhost', 27017)
db = connection.TwitterStream
db.tweets.ensure_index("id", unique=True, dropDups=True)
collection = db.nba

  This is separate from the ipykernel package so we can avoid doing imports until


### Overriding stream Listener class

In [3]:
#override tweepy.StreamListener to add logic to on_status
class MyStreamListener(tweepy.StreamListener):

    def on_data(self, data):
        
        # Load the Tweet into the variable t
        tweet = json.loads(data)

        # Save the data to mongodb
        collection.save(tweet)

        return True

    def on_error(self, status_code):
        if status_code == 420:
        #returning False in on_error disconnects the stream
            return False

### Authentication

In [4]:
app_key = "Ul47R28vil4FT307t0HcmYsPh"
app_secret = "ItL7fT9UyK94LLRTjgq8PZcsy7CKcbcf28j7qsm8IBwRKKHJz2"

access_token = "4860290417-8tvGbMQduk2x0rLZoqdwcM1pWK0tiI2QO1oqx1d"
access_token_secret = "d3AsnWVi70I2NNS5hK9FXhZPIyza8cUxOXi8vSvUaxvtn"

auth = tweepy.OAuthHandler(app_key, app_secret)
auth.set_access_token(access_token,access_token_secret)

api = tweepy.API(auth)

try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


### call stream listener to collect data

In [None]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener,tweet_mode='extended')

myStream.filter(track=['NBA','Atlanta Hawks','Atlanta Hawks','Brooklyn Nets',
                      'Charlotte Hornets','Chicago Bulls','Cleveland Cavaliers',
                      'Dallas Mavericks','Denver Nuggets','Detroit Pistons',
                      'Golden State Warriors','GSW','Houston Rockets',
                      'Indiana Pacers','LA Clippers','Los Angeles Clippers',
                      'Los Angeles Lakers','LA Lakers','Memphis Grizzlies',
                      'Miami Heat','Milwaukee Bucks','Minnesota Timberwolves',
                      'New Orleans Pelicans','New York Knicks','NY Knicks',
                      'Oklahoma City Thunder','OKC','Orlando Magic',
                      'Philadelphia 76ers','76ers','Phoenix Suns',
                      'Portland Trail Blazers','Sacramento Kings',
                      'San Antonio Spurs','Toronto Raptors','Utah Jazz',
                      'Washington Wizards'])


### enrich stream data with REST API data, probing tweets from NBA teams, most popular NBA players and NBA itself

In [184]:
accounts = ['NBA','kawhileonard','KingJames','JHarden13','StephenCurry30','AntDavis23','Yg_Trece',
           'JoelEmbiid','russwest44','KDTrey5','Dame_Lillard']
tweets = list()
for acc in accounts:
    for status in tweepy.Cursor(api.user_timeline, id=acc).items(500):
        tweets.append(status)

In [187]:
for i in range(len(tweets)):
    collection.save(tweets[i]._json)
print(len(tweets))

  


5004


### inspecting the most mentioned/retweeted/replied to usernames 

In [5]:
usernames = list()

for tweet in collection.find():
    if tweet['in_reply_to_screen_name'] is not None:
        usernames.append(tweet['in_reply_to_screen_name'])
    try: 
        usernames.append(tweet['retweeted_status']['user']['screen_name'])
    except:
        pass
    try: 
        usernames.append(tweet['quoted_status']['user']['screen_name'])
    except:
        pass
    try:
        for user in tweet['entities']['user_mentions']:
            usernames.append(user['screen_name'])
    except:
        pass

In [6]:
frame = pd.DataFrame(usernames,columns=['username'])

In [7]:
frame['username'].value_counts()

J_UNDFTD           10725
NBA                 2028
Ballislife          1654
DimesFromRose       1105
BleacherReport       452
                   ...  
ALeague                1
EcopeSports            1
HoopBallFantasy        1
Theis_Fanclub          1
Penya1930              1
Name: username, Length: 4557, dtype: int64

In [8]:
hashtags = list()

for tweet in collection.find():
    try:
        for hashtag in tweet['entities']['hashtags']:
            hashtags.append(hashtag['text'])
    except:
        pass
hashes = pd.DataFrame(hashtags,columns=['hashtags'])
hashes['hashtags'].value_counts()

NBA                       1339
nba                        146
PS4live                    125
NBATwitter                 121
DFS                         88
                          ... 
TeamZMP                      1
LosAngeles                   1
getpolished                  1
forbes                       1
Top25CollegeBasketball       1
Name: hashtags, Length: 1419, dtype: int64

1. cluster tweets based on text and time posted to separate tweets into 
2. find important users
3. create network of user interactions

### get tweets from database that are not retweets for clustering, also use english tweets only

In [9]:
clustering_tweets = list()

for tweet in collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' }):
    clustering_tweets.append(tweet)

In [10]:
len(clustering_tweets)

5273

### extract text, timestamp and userid from tweets to use for clustering

In [11]:
texts = list()
timestamps = list()
userids = list()

for tweet in clustering_tweets:
    texts.append(tweet['text'])
    timestamps.append(tweet['timestamp_ms'])
    userids.append(tweet['user']['id'])

In [12]:
clustering_data = pd.DataFrame({'text':texts,'timestamp':timestamps,'user_id':userids})

In [13]:
clustering_data.head()

Unnamed: 0,text,timestamp,user_id
0,One of my favorite @NBA players growing up @re...,1583358796097,2790755710
1,Move so garb it’s finally started going in 😂😂😂,1583358800328,1085858856821960704
2,"Not gonna lie, I want LaMelo in the NBA",1583358801011,1034019455721000961
3,@BlackPressureC1 @MjsGoat Bench pressed over 5...,1583358802651,1056680460103925760
4,@NBABet5 @PuntsHeister @banks_220 @Geebee1st @...,1583358803553,337388238


### use only text for clustering

In [14]:
clustering_text = clustering_data['text']

In [15]:
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1,2))

In [16]:
vectorizer.fit(clustering_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [17]:
text_matrix = vectorizer.transform(clustering_text)

In [18]:
clusterizer = KMeans(n_clusters=300)
clusterizer.fit(text_matrix)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=300, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [19]:
clustering_data['clusters'] = clusterizer.labels_

In [29]:
clustering_data['clusters'].value_counts().head(10)

153    4386
195      91
5        76
37       51
27       18
11       15
6        15
173      15
52       14
227      11
Name: clusters, dtype: int64

### inspecting individual clusters

In [22]:
clustering_data[clustering_data['clusters']==153].head()

Unnamed: 0,text,timestamp,user_id,clusters
0,One of my favorite @NBA players growing up @re...,1583358796097,2790755710,153
1,Move so garb it’s finally started going in 😂😂😂,1583358800328,1085858856821960704,153
2,"Not gonna lie, I want LaMelo in the NBA",1583358801011,1034019455721000961,153
3,@BlackPressureC1 @MjsGoat Bench pressed over 5...,1583358802651,1056680460103925760,153
4,@NBABet5 @PuntsHeister @banks_220 @Geebee1st @...,1583358803553,337388238,153


In [23]:
clustering_data[clustering_data['clusters']==195].head()

Unnamed: 0,text,timestamp,user_id,clusters
20,Check out my broadcast from my PlayStation 4! ...,1583358830114,1117204447325765632,195
43,Check out my broadcast from my PlayStation 4! ...,1583359296357,796215554885357569,195
82,Check out my broadcast from my PlayStation 4! ...,1583359342559,1085628089915330560,195
97,Check out my broadcast from my PlayStation 4! ...,1583359369714,796215554885357569,195
114,Check out my broadcast from my PlayStation 4! ...,1583359399030,38917224,195


In [24]:
clustering_data[clustering_data['clusters']==5].head()

Unnamed: 0,text,timestamp,user_id,clusters
73,"Tony has 3, SSX tricky or NBA street 2",1583359333951,619335443,5
79,NBA Street Vol 2 and GTA Vice City,1583359340511,127573027,5
88,Top 5:\nBully\nSan Andreas \nNBA Street vol 2\...,1583359351731,1668072798,5
338,"Midnight Club, NFL &amp; NBA Street, TH Pro Sk...",1583359650977,576821141,5
384,nba street @Tye_birdie,1583359701018,751957017716203520,5


In [25]:
clustering_data[clustering_data['clusters']==37].head()

Unnamed: 0,text,timestamp,user_id,clusters
145,NBA YoungBoy Drum Kit - Young GOAT - YoungBoy ...,1583359423975,131835364,37
325,NBA YoungBoy shit b hard but it just be like h...,1583359625396,1666447922,37
332,nba youngboy fans getting viruses after trying...,1583359634993,1118988741190782976,37
348,After she let NBA YoungBoy fuck.,1583359663356,152257299,37
530,"I can't decide who's more garbage between him,...",1583359879031,1147085976,37


In [26]:
clustering_data[clustering_data['clusters']==27].head()

Unnamed: 0,text,timestamp,user_id,clusters
118,Weak ass 😂,1583359402498,572187930,27
275,so weak 🤦🏻‍♂️😂,1583359554961,295243397,27
308,Move is not weak 😂😂😂,1583359604440,333173663,27
688,"If he killin dem with it, then how is it a wea...",1583360082642,993842079938818049,27
1247,😂😂 not weak ass,1583360750817,246461192,27


In [30]:
clustering_data[clustering_data['clusters']==11].head()

Unnamed: 0,text,timestamp,user_id,clusters
721,"Start Kawhi, bench Bron",1583360117434,266699883,11
933,Cut Giannis too easy,1583360378672,388337198,11
1182,"Easy. Start Bron, bench Kawhi cut Giannis",1583360651518,3248075424,11
1423,"Start Bron, bench Kahwi, cut Giannis. I'm sorry.",1583360994255,496796977,11
1481,"This easy, start Bron, Bench Bron, Cut everyon...",1583361042184,1180601594,11


In [31]:
clustering_data[clustering_data['clusters']==6].head()

Unnamed: 0,text,timestamp,user_id,clusters
395,On god,1583359714525,2500554718,6
538,On god😂😂😂😂,1583359889779,392490047,6
644,😂😂😂on god,1583360027658,1146101247163998208,6
907,Wash God,1583360356556,24348934,6
1405,Bron=God,1583360969017,965747156,6


In [28]:
clustering_data['clusters'].value_counts().describe()

count     300.000000
mean       17.576667
std       253.168016
min         1.000000
25%         1.000000
50%         2.000000
75%         2.000000
max      4386.000000
Name: clusters, dtype: float64

### developing user interaction network

In [82]:
#find users with highest number of mentions
tweets = collection.find()
mentions = list()

for tweet in tweets:
    try:
        for user in tweet['entities']['user_mentions']:
            mentions.append(user['screen_name'])
    except:
        pass

mentions = pd.DataFrame(mentions,columns=['mentioned'])

mentions['mentioned'].value_counts().head(10)

J_UNDFTD          4721
NBA               1247
Ballislife         721
DimesFromRose      537
espn               208
BleacherReport     206
KingJames          185
PelicansNBA        144
warriors           143
JSJZBEY            143
Name: mentioned, dtype: int64

In [83]:
#find most retweeted users
tweets = collection.find()
retweets = list()

for tweet in tweets:
    try: 
        retweets.append(tweet['retweeted_status']['user']['screen_name'])
    except:
        pass

retweets = pd.DataFrame(retweets,columns=['retweeted'])

retweets['retweeted'].value_counts().head(10)

J_UNDFTD          4712
Ballislife         718
NBA                555
DimesFromRose      534
BleacherReport     195
espn               176
JSJZBEY            143
warriors           119
alper_forza         88
A_S12               86
Name: retweeted, dtype: int64

In [84]:
#find most replied to users
tweets = collection.find()
replies = list()

for tweet in tweets:
    if tweet['in_reply_to_screen_name'] is not None:
        replies.append(tweet['in_reply_to_screen_name'])

replies = pd.DataFrame(replies,columns=['replied'])

replies['replied'].value_counts().head(10)

NBA                93
barstoolsports     12
OperationSports    11
Bucks              11
Lakers              8
espn                8
Lillard_SPFC        7
undisputed          7
nbastats            7
EASPORTSNBALM       7
Name: replied, dtype: int64

In [85]:
#find most quoted users
tweets = collection.find()
quoted = list()

for tweet in tweets:
    try: 
        quoted.append(tweet['quoted_status']['user']['screen_name'])
    except:
        pass

quoted = pd.DataFrame(quoted,columns=['quoted'])

quoted['quoted'].value_counts().head(10)

J_UNDFTD          1292
Ballislife         213
NBA                133
HLNinEngeland      128
ESPNNBA            127
wojespn             78
BenStinar           51
AndrewDBailey       50
BleacherReport      46
sny_knicks          46
Name: quoted, dtype: int64

### generate edges for graph with python counter

In [86]:
edges = Counter()

In [87]:
for tweet in collection.find():
    user = tweet['user']['screen_name']
    
    if tweet['in_reply_to_screen_name'] is not None:
        edges[tuple([user,tweet['in_reply_to_screen_name']])]+=1
    try: 
        edges[tuple([user,tweet['retweeted_status']['user']['screen_name']])]+=1
    except:
        pass
    try: 
        edges[tuple([user,tweet['quoted_status']['user']['screen_name']])]+=1
    except:
        pass
    try:
        for mention in tweet['entities']['user_mentions']:
            edges[tuple([user,mention['screen_name']])]+=1
    except:
        pass

In [88]:
edges.most_common()

[(('bgeLYDsmCr4acLf', 'NBA'), 52),
 (('Mell1453', 'NBA'), 30),
 (('SethMadeNBA', 'NBA'), 20),
 (('canteiro_hugo', 'NBA'), 20),
 (('sultanabdumalik', 'iyaboawokoya'), 20),
 (('James46493', 'NBALatam'), 18),
 (('Flowgawd', 'NBA'), 18),
 (('Ste_Gualdoni', 'NBA'), 18),
 (('emre30680967', 'magnesei'), 16),
 (('SGDNation', 'SGDNation'), 14),
 (('mister_agyenkwa', 'NBA'), 14),
 (('falatah_waleed', 'NBA'), 14),
 (('Lillard_SPFC', 'Lillard_SPFC'), 13),
 (('sportsAllnews_', 'NBA'), 13),
 (('iAmTerrace', 'iAmTerrace'), 12),
 (('AwesemoNBA', 'AwesemoNBA'), 12),
 (('tvselm', 'Dee_Black_MMA_'), 12),
 (('WJB_27', 'OnTheRotoRadar'), 11),
 (('Homacityjosh', 'Homacityjosh'), 10),
 (('freebagfamily', 'JimmyTheBag'), 10),
 (('BrandonDoooooo', 'BrandonDoooooo'), 10),
 (('Tom_RR_', 'OnTheRotoRadar'), 10),
 (('dfsgustii', 'OnTheRotoRadar'), 10),
 (('FlashPugUK', 'SportsWatch1'), 10),
 (('KidBloggerJosh', 'courtsideheat'), 10),
 (('hillshit_x2', 'hillshit2'), 9),
 (('Underarmour3030', 'cavs'), 8),
 (('LouissB

### generate graph to analyze network measures

In [89]:
general_network = nx.DiGraph()

In [90]:
for edge in edges:
    general_network.add_edge(edge[0],edge[1],weight=edges[edge])

{'003': 1315775380986,
 '012': 409911012,
 '021C': 7201,
 '021D': 11512,
 '021U': 18765830,
 '030C': 0,
 '030T': 1136,
 '102': 3105013,
 '111D': 876,
 '111U': 149,
 '120C': 1,
 '120D': 79,
 '120U': 118,
 '201': 75,
 '210': 9,
 '300': 8}

### generate hashtag coocurence graph for general data

##### find most used hashtags 

In [147]:
hashtags = list()

for tweet in collection.find():
    try:
        for hashtag in tweet['entities']['hashtags']:
            hashtags.append(hashtag['text'])
    except:
        pass
hashes = pd.DataFrame(hashtags,columns=['hashtags'])
hashes['hashtags'].value_counts().head(10)

NBA            1339
nba             146
PS4live         125
NBATwitter      121
DFS              88
NBAjp            83
꽁머니              66
TeamParieur      63
PhantomCam       59
Celtics          54
Name: hashtags, dtype: int64

##### generate tuples of coocuring hashtags

In [166]:
general_hashtags = Counter()

In [167]:
for tweet in collection.find():
    try:
        for pair in itertools.combinations(tweet['entities']['hashtags'],2):
            general_hashtags[frozenset([pair[0]['text'],pair[1]['text']])]+=1
    except:
        pass

In [168]:
general_hashtags.most_common()

[(frozenset({'NBA', 'NBATwitter'}), 109),
 (frozenset({'DFS', 'NBA'}), 85),
 (frozenset({'NBA', 'NBAjp'}), 83),
 (frozenset({'꽁머니', '프로토'}), 60),
 (frozenset({'가입꽁머니', '꽁머니'}), 60),
 (frozenset({'꽁머니', '꽁머니홍보'}), 60),
 (frozenset({'꽁머니', '꽁머니공유사이'}), 60),
 (frozenset({'가입꽁1만', '꽁머니'}), 60),
 (frozenset({'꽁머니', '메이저'}), 60),
 (frozenset({'꽁머니', '스포츠픽'}), 60),
 (frozenset({'꽁머니', '매일지급되는가족방2만'}), 60),
 (frozenset({'NBA', 'TeamParieur'}), 41),
 (frozenset({'GrindCity', 'NBA'}), 40),
 (frozenset({'Basketball', 'NBA'}), 34),
 (frozenset({'가입꽁머니', '프로토'}), 30),
 (frozenset({'꽁머니홍보', '프로토'}), 30),
 (frozenset({'꽁머니공유사이', '프로토'}), 30),
 (frozenset({'가입꽁1만', '프로토'}), 30),
 (frozenset({'메이저', '프로토'}), 30),
 (frozenset({'스포츠픽', '프로토'}), 30),
 (frozenset({'매일지급되는가족방2만', '프로토'}), 30),
 (frozenset({'가입꽁머니', '꽁머니홍보'}), 30),
 (frozenset({'가입꽁머니', '꽁머니공유사이'}), 30),
 (frozenset({'가입꽁1만', '가입꽁머니'}), 30),
 (frozenset({'가입꽁머니', '메이저'}), 30),
 (frozenset({'가입꽁머니', '스포츠픽'}), 30),
 (frozenset({'가입꽁머니', '매일지급되

### generate user interaction network for main cluster

In [92]:
#find users with highest number of mentions
tweets = collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' })
mentions = list()

for tweet,cluster in zip(tweets,clusterizer.labels_):
    #only consider tweets in main cluster
    if(cluster==153):
        try:
            for user in tweet['entities']['user_mentions']:
                mentions.append(user['screen_name'])
        except:
            pass

mentions = pd.DataFrame(mentions,columns=['mentioned'])

mentions['mentioned'].value_counts().head(10)

NBA              161
Bucks             29
YouTube           29
KingJames         20
nyknicks          17
espn              16
dallasmavs        16
PelicansNBA       14
unitedmasters     13
Lakers            13
Name: mentioned, dtype: int64

In [93]:
#find most retweeted users
#since retweets were removed in clustering there are no retweets in the cluster and therefore we cant find users
#with most retweets since there arent any

In [94]:
#find most replied to users
tweets = collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' })
replies = list()

for tweet,cluster in zip(tweets,clusterizer.labels_):
    if(cluster==153):
        if tweet['in_reply_to_screen_name'] is not None:
            replies.append(tweet['in_reply_to_screen_name'])

replies = pd.DataFrame(replies,columns=['replied'])

replies['replied'].value_counts().head(10)

NBA                48
EASPORTSNBALM       7
barstoolsports      6
undisputed          6
BleacherReport      5
espn                5
Bucks               4
tonyfinaugolf       4
Chris_Broussard     4
JCMacriNBA          3
Name: replied, dtype: int64

In [95]:
#find most quoted users
tweets = collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' })
quoted = list()

for tweet,cluster in zip(tweets,clusterizer.labels_):
    if(cluster == 153):
        try: 
            quoted.append(tweet['quoted_status']['user']['screen_name'])
        except:
            pass

quoted = pd.DataFrame(quoted,columns=['quoted'])

quoted['quoted'].value_counts().head(10)

J_UNDFTD          696
Ballislife         81
ESPNNBA            55
NBA                36
wojespn            28
BleacherReport     23
JosephhT12         23
undisputed         21
DimesFromRose      20
NYG_4_LIFE         19
Name: quoted, dtype: int64

### generate edges for graph with python counter

In [96]:
edges = Counter()

In [97]:
for tweet,cluster in zip(collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' }),clusterizer.labels_):
    if (cluster == 153):
        user = tweet['user']['screen_name']

        if tweet['in_reply_to_screen_name'] is not None:
            edges[tuple([user,tweet['in_reply_to_screen_name']])]+=1
        try: 
            edges[tuple([user,tweet['retweeted_status']['user']['screen_name']])]+=1
        except:
            pass
        try: 
            edges[tuple([user,tweet['quoted_status']['user']['screen_name']])]+=1
        except:
            pass
        try:
            for mention in tweet['entities']['user_mentions']:
                edges[tuple([user,mention['screen_name']])]+=1
        except:
            pass

In [98]:
edges.most_common()

[(('redsmith1', 'TheHerd'), 6),
 (('bettingresource', 'dquanpicks'), 6),
 (('tmac36daexpert', 'cboyd304'), 6),
 (('Almightyspace', '_lamar3'), 6),
 (('Lonzoyeng3', 'NBA'), 5),
 (('MattWithAMouth', 'FinkMark1'), 5),
 (('NotAirid', '_rockrob'), 5),
 (('MrNBA16', 'NBA'), 4),
 (('Bevlin19', 'madewitlean'), 4),
 (('Bevlin19', 'RandomHopJunkie'), 4),
 (('edb288', 'cparrottSQUAWK'), 4),
 (('PhillyAnimal6', 'kornhauser22'), 4),
 (('ClataroJames', 'kvng_matt29'), 4),
 (('KoalaSwagBamboo', 'NBA'), 4),
 (('MtbLawson', 'vinnythibeault'), 4),
 (('MtbLawson', 'antonin_org'), 4),
 (('BigX84', 'NBA'), 4),
 (('LuanwonderII', 'betchanger'), 4),
 (('cboyd304', 'tmac36daexpert'), 4),
 (('TysonMurdockt', 'locher27'), 4),
 (('AndrewMartinBet', 'BackAftaThis'), 4),
 (('DarrenAbenstein', 'ZackTeitel'), 4),
 (('Suebird00565589', 'KUHoops'), 4),
 (('NLLTweets', 'DUNOTS'), 4),
 (('MattWithAMouth', 'seanqatnight'), 4),
 (('ultimatum_shop', 'NBA'), 4),
 (('tjbey2000', 'Chris_Broussard'), 4),
 (('ruel_4745', 'undis

### generate graph to analyze network measures

In [99]:
cluster_network = nx.DiGraph()

In [100]:
for edge in edges:
    cluster_network.add_edge(edge[0],edge[1],weight=edges[edge])

In [102]:
nx.triadic_census(cluster_network)

{'003': 13664612225,
 '012': 15985466,
 '021C': 876,
 '021D': 2407,
 '021U': 270157,
 '030C': 0,
 '030T': 88,
 '102': 130251,
 '111D': 24,
 '111U': 43,
 '120C': 0,
 '120D': 2,
 '120U': 22,
 '201': 4,
 '210': 0,
 '300': 0}

### generate hashtag coocurence graph for main cluster data

##### find most used hashtags 

In [104]:
hashtags = list()

for tweet,cluster in zip(collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' }),clusterizer.labels_):
    if(cluster==153):
        try:
            for hashtag in tweet['entities']['hashtags']:
                hashtags.append(hashtag['text'])
        except:
            pass
hashes = pd.DataFrame(hashtags,columns=['hashtags'])
hashes['hashtags'].value_counts()

NBA             247
nba              56
DFS              30
NBATwitter       29
basketball       20
               ... 
SenseiSam         1
sneakerheads      1
SelltheTeam       1
inplay            1
LiangeloBall      1
Name: hashtags, Length: 635, dtype: int64

##### generate tuples of coocuring hashtags

In [163]:
cluster_hashtags = Counter()

In [164]:
for tweet,cluster in zip(collection.find({ "retweeted_status" : { "$exists" : False },'lang':'en' }),clusterizer.labels_):
    if(cluster==153):
        try:
            for pair in itertools.combinations(tweet['entities']['hashtags'],2):
                cluster_hashtags[frozenset([pair[0]['text'],pair[1]['text']])]+=1
        except:
            pass

In [165]:
cluster_hashtags.most_common()

[(frozenset({'DFS', 'NBA'}), 28),
 (frozenset({'NBA', 'NBATwitter'}), 24),
 (frozenset({'Celtics', 'NBA'}), 11),
 (frozenset({'expertpicks', 'guaranteedpick'}), 8),
 (frozenset({'NBA', 'basketball'}), 8),
 (frozenset({'basketball', 'nba'}), 7),
 (frozenset({'GrindCity', 'NBA'}), 6),
 (frozenset({'Basketball', 'NBA'}), 6),
 (frozenset({'NBA', 'NHL'}), 6),
 (frozenset({'nba', 'nhl'}), 6),
 (frozenset({'Fanduel', 'NBA'}), 6),
 (frozenset({'feedly', 'sports'}), 6),
 (frozenset({'BasketballCards', 'NBA'}), 5),
 (frozenset({'NBA', 'WeGoHard'}), 5),
 (frozenset({'DraftKings', 'NBA'}), 5),
 (frozenset({'BeTheFight', 'NBA'}), 5),
 (frozenset({'DFS', 'Fanduel'}), 5),
 (frozenset({'MFFL', 'NBA'}), 5),
 (frozenset({'nba', 'nfl'}), 5),
 (frozenset({'Bucks', 'Pacers'}), 5),
 (frozenset({'CBB', 'NBA'}), 5),
 (frozenset({'NBA', 'TeamParieur'}), 4),
 (frozenset({'Free', 'NBA'}), 4),
 (frozenset({'Expert', 'NBA'}), 4),
 (frozenset({'Betting', 'NBA'}), 4),
 (frozenset({'NBA', 'Odds'}), 4),
 (frozenset({'

### code for generating csv sample data

In [None]:
data_sample = collection.aggregate([{'$sample':{'size':100}}])
data_sample = list(data_sample)
data_sample = pd.DataFrame(data_sample)
data_sample.to_csv('sample.csv')