In [1]:
import sklearn
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import random

In [2]:
%%time
suspect_timelines = pickle.load(open("sus_timelines_pt1.p", "rb"))

Wall time: 10.2 s


In [3]:
len(suspect_timelines)

1420

In [4]:
users_to_remove = []
for u in suspect_timelines:
    if len(suspect_timelines[u]) <3:
        users_to_remove.append(u)
        
for u in users_to_remove:
    del suspect_timelines[u]
        
print(len(suspect_timelines))

1394


In [5]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
def get_domain(u):
    endmarker = u[8:].find("/")
    if endmarker == -1:
        return u
    else:
        return u[0:8+endmarker]

def get_links_domains_count_for_user(tweets):
    links = defaultdict(lambda : 0)
    domains = defaultdict(lambda : 0)
    for t in tweets:
        if len(t[5]) == 0:
            continue
        else:
            for urlobj in t[5]:
                #url = urlobj['expanded_url']
                url = urlobj
                domain = get_domain(url)
                links[url] += 1
                domains[domain] += 1
    return(links, domains)


def get_domain_counts_for_users(timelines):
    domains = dict()
    for user, tweets in timelines.items():
        userlinks, userdomains = get_links_domains_count_for_user(tweets)
        domain_counts = [(d, c) for d,c in userdomains.items()]
        domains[user] = domain_counts
    return domains
        


def get_links_domains_for_users(sets_of_tweets):
    domains = defaultdict(lambda: [])
    links = defaultdict(lambda: [])
    for tweets in sets_of_tweets:
        userlinks, userdomains = get_links_domains_count_for_user(tweets)
        for l in userlinks:
            links[l].append(userlinks[l])
        for d in userdomains:
            domains[d].append(userdomains[d])
    return (links, domains)    
    

def get_links_domains_associated_words(sets_of_tweets):
    links = defaultdict(lambda : defaultdict(lambda: 0))
    domains = defaultdict(lambda: defaultdict(lambda: 0))
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                continue
            text = t[4].split()
            text_filtered = []
            for word in text:
                if not word.startswith('http') and not word.lower() in stop and len(word)>1:
                    text_filtered.append(word)
            for urlobj in t[5]:
                #url = urlobj['expanded_url']
                url = urlobj
                for w in text_filtered:
                    links[url][w] +=1
                domain = get_domain(url)
                for w in text_filtered:
                    domains[domain][w] +=1
                
    links_to_word_count = dict()
    for k, d in links.items():
        word_counts = []
        for w,c in d.items():
            word_counts.append((w, c))
        links_to_word_count[k] = word_counts
      
    domains_to_word_count = dict()
    for k, d in domains.items():
        word_counts = []
        for w, c in d.items():
            word_counts.append((w,c))
        domains_to_word_count[k] = word_counts
    
    return links_to_word_count, domains_to_word_count

                

def characterize_links_by_comment(sets_of_tweets):
    url_count = 0
    nourl_count = 0
    uncommented = 0
    commented_light = []
    commented_heavy = []
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                nourl_count += 1
            else:
                url_count +=  1
                text = t[4].split()
                words = [w.strip() for w in text if not w.startswith("http")]
                where_to_append = ""
                if len(words) == 0:
                    uncommented += 1
                    continue
                elif len(words) <= 4:
                    where_to_append = commented_light
                else: #len(words)>4
                    where_to_append =commented_heavy
                reconstrcuted_tweet = " ".join(words)
                for url in t[5]:
                    reconstrcuted_tweet += " " + url
                where_to_append.append(reconstrcuted_tweet)
    totalnumber = nourl_count + url_count
    print("Total number of tweets: ", totalnumber)
    print("Tweets without urls: ", nourl_count, " = ", (nourl_count/totalnumber)*100, "%")
    print("Tweets with urls: ", url_count, " = ", (url_count/totalnumber)*100, "%")
    print("URL tweets containing no comment: ", uncommented, " = ", str((uncommented/url_count)*100), "%")
    print("URL tweets containing 4 words or fewer: ", len(commented_light), " = ", (len(commented_light)/url_count)*100, "%")
    print("URL tweets containing more than 4 words: ", len(commented_heavy), " = ", (len(commented_heavy)/url_count)*100, "%")
    return commented_light, commented_heavy


In [7]:
%%time
links, domains = get_links_domains_for_users(suspect_timelines.values())
print("count of links found: ", len(links))
print("count of domains found: ", len(domains))

count of links found:  1812550
count of domains found:  52566
Wall time: 6.32 s


In [8]:
sorted_links = sorted([x for x in links.items()], key= lambda x: len(x[1]), reverse=True)
for l in sorted_links[0:100]:
    print("\n", l[0], "\nCount of users: ", str(len(l[1])), " -- Total occurances: ", str(sum(l[1])))


 http://USFREEDOMARMY.COM 
Count of users:  232  -- Total occurances:  1894

 https://truepundit.com/exclusive-six-u-s-agencies-conspired-to-illegally-wiretap-trump-british-intel-used-as-front-to-spy-on-campaign-for-nsa/ 
Count of users:  218  -- Total occurances:  369

 http://www.foxnews.com/politics/2018/02/07/more-texts-between-strzok-and-page-uncovered-lead-to-more-questions.html 
Count of users:  212  -- Total occurances:  365

 http://bigliebook.com 
Count of users:  211  -- Total occurances:  335

 http://jwatch.us/3Savgi 
Count of users:  209  -- Total occurances:  346

 http://jwatch.us/8zpFUN 
Count of users:  198  -- Total occurances:  303

 https://twitter.com/comey/status/959498570532577285 
Count of users:  183  -- Total occurances:  297

 http://jwatch.us/BMQ6hK 
Count of users:  183  -- Total occurances:  303

 http://thehill.com/homenews/administration/372861-uranium-one-informant-makes-clinton-allegations-in-testimony 
Count of users:  176  -- Total occurances:  262

In [9]:
sorted_domains = sorted([x for x in domains.items()], key= lambda x: len(x[1]), reverse=True)
for d in sorted_domains[0:100]:
    print(d[0], "\tCount of users: ", str(len(d[1])), " -- Total occurances: ", str(sum(d[1])))

https://twitter.com 	Count of users:  1238  -- Total occurances:  326762
https://youtu.be 	Count of users:  1182  -- Total occurances:  80719
http://bit.ly 	Count of users:  1164  -- Total occurances:  60403
https://www.youtube.com 	Count of users:  1138  -- Total occurances:  32160
https://fb.me 	Count of users:  922  -- Total occurances:  225228
http://www.foxnews.com 	Count of users:  919  -- Total occurances:  15502
http://www.breitbart.com 	Count of users:  911  -- Total occurances:  21731
http://dailycaller.com 	Count of users:  904  -- Total occurances:  17235
http://www.thegatewaypundit.com 	Count of users:  897  -- Total occurances:  35587
https://goo.gl 	Count of users:  891  -- Total occurances:  34498
http://thehill.com 	Count of users:  887  -- Total occurances:  7976
http://fxn.ws 	Count of users:  877  -- Total occurances:  12239
http://ow.ly 	Count of users:  869  -- Total occurances:  25602
https://www.facebook.com 	Count of users:  846  -- Total occurances:  6483
http

In [10]:
sorted_domains = sorted([x for x in domains.items()], key= lambda x: len(x[1]), reverse=True)
for d in sorted_domains[500:600]:
    print(d[0], "\tCount of users: ", str(len(d[1])), " -- Total occurances: ", str(sum(d[1])))

http://www.americanpatriotdaily.com 	Count of users:  79  -- Total occurances:  637
https://www.prageru.com 	Count of users:  79  -- Total occurances:  173
https://www.libertyheadlines.com 	Count of users:  79  -- Total occurances:  464
http://www.pacificpundit.com 	Count of users:  79  -- Total occurances:  382
https://aim4truth.org 	Count of users:  79  -- Total occurances:  210
http://www.wsj.com 	Count of users:  79  -- Total occurances:  102
https://talkingpointsmemo.com 	Count of users:  78  -- Total occurances:  141
http://ref.gl 	Count of users:  78  -- Total occurances:  206
http://constitution.com 	Count of users:  78  -- Total occurances:  1196
https://assets.documentcloud.org 	Count of users:  78  -- Total occurances:  95
https://af-mg.com 	Count of users:  78  -- Total occurances:  106
http://www.truthrevolt.org 	Count of users:  77  -- Total occurances:  444
https://docs.google.com 	Count of users:  77  -- Total occurances:  112
http://rightwingnews.com 	Count of users:  

In [11]:
%%time
dc = get_domain_counts_for_users(suspect_timelines)

Wall time: 3.33 s


In [12]:
users_sorted_by_domain_count = sorted([x for x in dc.items()], key=lambda x: len(x[1]), reverse=True)

In [13]:
rand_user = users_sorted_by_domain_count[-10]
print("username: ", rand_user[0])
print(len(rand_user[1]), "domains")
print("\n".join([str(x) for x in sorted(rand_user[1], key= lambda x: x[1], reverse=True)]))

username:  JockyBoyes
3 domains
('http://www.palmerreport.com', 2)
('http://abcn.ws', 1)
('https://goo.gl', 1)


In [14]:
rand_user = random.choices(population=[x for x in dc.items()], k=1)[0]
print("username: ", rand_user[0])
print(len(rand_user[1]), "domains")
print("\n".join([str(x) for x in sorted(rand_user[1], key= lambda x: x[1], reverse=True)]))

username:  koran9999
298 domains
('https://twitter.com', 208)
('http://bit.ly', 141)
('http://www.palmerreport.com', 131)
('https://trofire.com', 102)
('http://fb.me', 97)
('https://fb.me', 52)
('https://www.youtube.com', 46)
('http://ow.ly', 38)
('https://shareblue.com', 38)
('https://youtu.be', 37)
('http://www.politicususa.com', 37)
('http://2013rainbowroundtable.ning.com', 36)
('https://www.rt.com', 33)
('https://www.dailykos.com', 31)
('https://www.democracynow.org', 30)
('http://www.disclose.tv', 28)
('http://nyccats.urgentpodr.org', 27)
('https://bbsradio.com', 24)
('https://buff.ly', 22)
('http://youtu.be', 16)
('http://www.spaceweather.com', 16)
('http://www.ashtarontheroad.com', 15)
('https://www.rawstory.com', 15)
('https://interc.pt', 14)
('https://sputniknews.com', 14)
('http://icont.ac', 13)
('http://www.ancient-origins.net', 11)
('http://nycdogs.urgentpodr.org', 11)
('https://www.disclose.tv', 10)
('https://www.facebook.com', 8)
('https://www.pscp.tv', 8)
('https://www.w

In [15]:
%%time
light, heavy = characterize_links_by_comment(suspect_timelines.values())

Total number of tweets:  3655718
Tweets without urls:  1290351  =  35.296787115417544 %
Tweets with urls:  2365367  =  64.70321288458246 %
URL tweets containing no comment:  324575  =  13.721972108345131 %
URL tweets containing 4 words or fewer:  174253  =  7.36684835799265 %
URL tweets containing more than 4 words:  1866539  =  78.91117953366222 %
Wall time: 14.2 s


In [16]:
print("\n\n".join(random.choices(population=light, k=15)))

Cool. Redos https://www.invaluable.com/blog/famous-artists-movie-posters/

#ReleaseTheMemo #NotABot http://www.dailymail.co.uk/news/article-5302947/FBI-officials-texted-secret-society-Trump-won.html

Shocker Another Democrat hypocrite https://twitter.com/seanhannity/status/962101525798641665

About time! @FoxNews https://www.google.com/amp/www.foxnews.com/opinion/2017/09/10/feinsteins-anti-catholic-bigotry-blasted-by-notre-dame-president.amp.html

@DannyJiminian @realPolitiDiva @realDonaldTrump https://www.google.ca/amp/s/www.yahoo.com/amphtml/sports/liangelo-ball-ucla-teammates-face-3-10-years-prison-convicted-shoplifting-013958780.html

#ReleaseTheMemo #NotABot #NeverVoteDemocratAgain https://twitter.com/Thomas1774Paine/status/956119483655049216

Rational Thinking next level http://fb.me/6KIrOo7uE

#Grenfell Tower lies unravelling" http://tapnewswire.com/2017/07/grenfell-tower-lies-unravelling/

「安倍が倒れた」という。具体的に６月９日深夜から１０日未明にかけて。慶應の主治医らがタクシー４台で私邸に乗り付けて、懸命の治療をしていた。https://t.co/D1RfBv4

In [17]:
print("\n\n".join(random.choices(population=heavy, k=15)))

"Police Chief Warns ‘London is Modern Slavery Hotspot’, 13K Victims Forced into Labor and Sex" http://tapnewswire.com/2017/07/police-chief-warns-london-is-modern-slavery-hotspot-13k-victims-forced-into-labor-and-sex/

"The tone from the top of the Trump administration has unerringly been that women are to be cherished and protected right up until the moment they stop being docile and decorative, and then they are to be dismissed and humiliated." https://twitter.com/rickhasen/status/961762070860623872

Jeb Bush says there will no border wall and Mexico will not pay for it http://bit.ly/2DFQACv

The great thing is that we waited until now to blow a gasket over this, Because Trump! https://twitter.com/Wonkette/status/962397707188015105

VIDEO EXPLOZIV/Soldații americani, ÎNGENUNCHEAȚI de iranieni! prin @comisarulro http://www.comisarul.ro/politic/video-exploziv/soldatii-americani-ingenuncheati-de_672955.html

What an Artificial Intelligence Researcher Fears about AI http://ed.gr/ci4p

Are

In [31]:
def select_dimensions(users, max_cutoff, min_cutoff):
    domains = defaultdict(lambda: 0)
    for ddict in users:
        current_user = ddict[0]
        print("current instance:", current_user)
        for (d, c) in ddict[1]:
            domains[d] +=1
    print("domains above cutoff:\n", [d for d in domains if domains[d]>max_cutoff])
    print("domains below cutoff:\n",  [d for d in domains if domains[d]<min_cutoff])
    selected_domains = sorted([x for x in domains if domains[x]>=min_cutoff and domains[x]<= max_cutoff])
    return selected_domains


def insts_to_vecs(insts, dimensions):
    vecs = []
    for u, udomains in insts:
        print("building vector for user:", u)
        vec = []
        for d in dimensions:
            ud = [x for x in udomains if x[0]==d]
            if ud == []:
                vec.append(0)
            else:
                assert len(ud) == 1
                vec.append(ud[0][1])
        
        vecs.append(vec)
    return vecs        
        

In [19]:
users_set = users_sorted_by_domain_count[200:300]
selected_d = select_dimensions(users_set, 80, 10)
vecs = insts_to_vecs(users_set, selected_d)

current user: chp3871
current user: pamelava10
current user: TruthnotFM
current user: Shasha44373856
current user: rafawkes
current user: kevensets
current user: NanalovesAub
current user: BrendaAnders2
current user: KuhnKat1
current user: 1humanagenda
current user: alvejz_59
current user: Traveldealguy
current user: EtsiSusi
current user: RajivMessage
current user: Patriottechsan
current user: uneedtoknow1
current user: soft63389
current user: AlanVernon3
current user: PALucier
current user: catoletters
current user: whatzmatteryou
current user: CFT1
current user: RAndrewOhge
current user: rugburndaddy
current user: HeidiKole
current user: cosemote333
current user: yorkieartist
current user: Kombinatke
current user: SIMikeSr
current user: BreacherAlec1
current user: Brialalexi
current user: slouise2004
current user: statepsyops
current user: vaxen_var
current user: Kimby2016
current user: kaarent
current user: grannyshrek
current user: Carlton96611744
current user: tomfyffe78
current 

In [22]:
#length of the feature vector
veclen = len(vecs[0])
print("length of features vector: ", veclen)
for v in vecs:
    assert len(v) == veclen

length of features vector:  505


In [23]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(vecs)

In [24]:
c0 = c1 = []
for u, l in zip([x[0] for x in users_set], kmeans.labels_):
    if l == 0:
        c0.append(u)
    elif l == 1:
        c1.append(u)
    else:
        print("Unknown value for cluster: ", l)

print("Size of cluster 0: ", len(c0))
print("Size of cluster 1: ", len(c1))
print("\n*********************\nCluster O:")
print("\n".join(["http://www.twitter.com/" + x for x in c0]))
print("\n*********************\nCluster 1:")
print("\n".join(["http://www.twitter.com/" + x for x in c1]))

Size of cluster 0:  100
Size of cluster 1:  100

*********************
Cluster O:
http://www.twitter.com/chp3871
http://www.twitter.com/pamelava10
http://www.twitter.com/TruthnotFM
http://www.twitter.com/Shasha44373856
http://www.twitter.com/rafawkes
http://www.twitter.com/kevensets
http://www.twitter.com/NanalovesAub
http://www.twitter.com/BrendaAnders2
http://www.twitter.com/KuhnKat1
http://www.twitter.com/1humanagenda
http://www.twitter.com/alvejz_59
http://www.twitter.com/Traveldealguy
http://www.twitter.com/EtsiSusi
http://www.twitter.com/RajivMessage
http://www.twitter.com/Patriottechsan
http://www.twitter.com/uneedtoknow1
http://www.twitter.com/soft63389
http://www.twitter.com/AlanVernon3
http://www.twitter.com/PALucier
http://www.twitter.com/catoletters
http://www.twitter.com/whatzmatteryou
http://www.twitter.com/CFT1
http://www.twitter.com/RAndrewOhge
http://www.twitter.com/rugburndaddy
http://www.twitter.com/HeidiKole
http://www.twitter.com/cosemote333
http://www.twitter.com/

In [25]:
print("\n".join([x for x in selected_d]))

http://100percentfedup.com
http://45.wh.gov
http://DCWhispers.com
http://NaturalNews.com
http://Newsmax.com
http://Support45.com
http://USFREEDOMARMY.COM
http://a.msn.com
http://abc13.com
http://abc7.com
http://abcn.ws
http://abcnews.go.com
http://aclj.us
http://agendaofevil.com
http://aje.io
http://allnewspipeline.com
http://americafans.com
http://americanlookout.com
http://amp.dailycaller.com
http://amp.washingtontimes.com
http://amzn.to
http://aol.it
http://apne.ws
http://archive.is
http://barenakedislam.com
http://bb4sp.com
http://bbc.in
http://beforeitsnews.com
http://bigliebook.com
http://breaking911.com
http://buff.ly
http://canadafreepress.com
http://cbsloc.al
http://cbsn.ws
http://cnb.cx
http://cnn.it
http://cnnmon.ie
http://conservative101.com
http://conservativefighters.com
http://conservativetribune.com
http://crwd.fr
http://dailysign.al
http://dailysignal.com
http://deadline.com
http://dennismichaellynch.com
http://diamondandsilk.breakinginfoalert.com
http://diamondandsilk

In [26]:
%%time
links_words, domains_words = get_links_domains_associated_words(suspect_timelines.values())

Wall time: 5min 5s


In [27]:
sorted(domains_words['http://www.breitbart.com'], key=lambda x: x[1], reverse=True)

[('@BreitbartNews', 2672),
 ('Breitbart', 2591),
 ('Trump', 2533),
 ('via', 2008),
 ('#MAGA', 1011),
 ('&amp;', 983),
 ('#AAG', 731),
 ('FBI', 689),
 ('Democrats', 634),
 ('#feedly', 632),
 ('Illegal', 586),
 ('@realDonaldTrump', 563),
 ('#Patriot', 563),
 ('Donald', 555),
 ('#news', 550),
 ('President', 538),
 ('Obama', 530),
 ('U.S.', 479),
 ('State', 465),
 ('DACA', 464),
 ('Americans', 431),
 ('Clinton', 416),
 ('GOP', 389),
 ('House', 387),
 ('New', 384),
 ('Memo', 376),
 ('Amnesty', 375),
 ('American', 359),
 ('Report:', 337),
 ('Hillary', 328),
 ('like', 324),
 ('White', 322),
 ('John', 321),
 ('News', 320),
 ('Dossier', 311),
 ('Roy', 308),
 ('#ma4t', 308),
 ('Immigration', 303),
 ('Moore', 301),
 ('Trump’s', 298),
 ('Media', 294),
 ('America', 289),
 ("Trump's", 281),
 ('FISA', 269),
 ('Says', 268),
 ('Border', 263),
 ('Aliens', 261),
 ('One', 254),
 ('Bill', 253),
 ('Democrat', 241),
 ('Soros', 237),
 ('get', 235),
 ('Trump:', 229),
 ('Adam', 228),
 ('Congress', 220),
 ('ille

In [33]:
sorted(domains_words['http://fb.me' ], key=lambda x: x[1], reverse=True)

[('Trump', 2899),
 ('de', 2851),
 ('new', 2233),
 ('Facebook', 1989),
 ('posted', 1901),
 ('photo', 1579),
 ('la', 1378),
 ('en', 1174),
 ('Retweeted', 1140),
 ('Obama', 1109),
 ('one', 1072),
 ('&amp;', 1031),
 ('like', 1012),
 ('yang', 1006),
 ('people', 986),
 ('New', 963),
 ('President', 908),
 ('Hillary', 899),
 ('US', 836),
 ('Clinton', 830),
 ('get', 811),
 ('know', 775),
 ('the...', 741),
 ('video', 715),
 ('would', 673),
 ('Muslim', 635),
 ('~JB', 612),
 ('love', 605),
 ('el', 593),
 ('see', 587),
 ('News', 579),
 ('One', 564),
 ('time', 562),
 ("I'm", 557),
 ('FBI', 548),
 ('White', 547),
 ('think', 547),
 ('going', 543),
 ('want', 529),
 ('need', 522),
 ('People', 521),
 ('BREAKING:', 513),
 ('Daily', 508),
 ('American', 480),
 ('go', 471),
 ('del', 455),
 ('Black', 454),
 ('NsromaMedia', 452),
 ('says', 446),
 ('could', 443),
 ('Man', 438),
 ('NFL', 437),
 ('make', 432),
 ('say', 431),
 ('House', 430),
 ('Donald', 426),
 ('really', 421),
 ('us', 419),
 ('Muslims', 418),
 ('

In [31]:
sorted(domains_words['https://youtu.be' ], key=lambda x: x[1], reverse=True)

[('via', 33091),
 ('@YouTube', 32773),
 ('Trump', 2987),
 ('&amp;', 2975),
 ('...', 1421),
 ('FBI', 1172),
 ('Clinton', 1049),
 ('President', 950),
 ('Hillary', 949),
 ('Obama', 931),
 ('New', 899),
 ('2017', 698),
 ('Watch', 674),
 ('@realDonaldTrump', 663),
 ('BREAKING:', 656),
 ('State', 616),
 ('News', 613),
 ('Day', 611),
 ('US', 577),
 ('#QAnon', 553),
 ('FISA', 551),
 ('#MAGA', 527),
 ('CIA', 495),
 ('2018', 487),
 ('TRUMP', 481),
 ('Deep', 471),
 ('Donald', 463),
 ('#pedogate', 458),
 ('Dr.', 449),
 ('video', 440),
 ('BREAKING', 435),
 ('Secret', 435),
 ('House', 429),
 ('World', 427),
 ('Video', 424),
 ('America', 424),
 ('Memo', 417),
 ('White', 405),
 ('One', 404),
 ('David', 380),
 ('#ReleaseTheMemo', 358),
 ('@POTUS', 350),
 ('Full', 349),
 ('John', 346),
 ('Live', 345),
 ('Truth', 344),
 ('American', 341),
 ('NEW', 339),
 ('#pizzagate', 339),
 ('one', 333),
 ('CNN', 330),
 ('NEWS', 320),
 ('Part', 317),
 ('Tom', 309),
 ('like', 304),
 ('USA', 299),
 ('Year', 297),
 ('Muel

In [30]:
sorted(domains_words['http://worldtruth.tv' ], key=lambda x: x[1], reverse=True)

[('10', 9),
 ('Truth', 6),
 ('Common', 6),
 ('Headache', 6),
 ('Types', 6),
 ('Surprising', 6),
 ('Causes', 6),
 ('Karma', 5),
 ('Hear', 5),
 ('Mainstream', 5),
 ('Version', 5),
 ('American', 5),
 ('Cancer', 4),
 ('World', 4),
 ('Boy', 4),
 ('Foods', 4),
 ('Investigation', 3),
 ('Product', 3),
 ('Kills', 3),
 ('Viruses,', 3),
 ('Fungal', 3),
 ('Infections,', 3),
 ('Bacteria', 3),
 ('One', 3),
 ('First', 3),
 ('#Autism', 3),
 ('Big', 3),
 ('Pharma', 3),
 ('Obama', 2),
 ('Multimillion', 2),
 ('Dollar', 2),
 ('White', 2),
 ('House', 2),
 ('Parties', 2),
 ("Here's", 2),
 ('Much', 2),
 ('Couple', 2),
 ('Gets', 2),
 ('Benefits', 2),
 ('Weight', 2),
 ('Work', 2),
 ('Put', 2),
 ('Ice', 2),
 ('Drinks', 2),
 ('KFC,', 2),
 ("McDonald's", 2),
 ('Burger', 2),
 ('King', 2),
 ('Warned', 2),
 ('Restaurant', 2),
 ('Sell', 2),
 ('Human', 2),
 ('Meat', 2),
 ('Consumption', 2),
 ('Opens', 2),
 ('Japan', 2),
 ('FDA', 2),
 ('Outlawed', 2),
 ('Hemp', 2),
 ('Oil', 2),
 ('#1', 2),
 ('Treatment', 2),
 ('Germany

In [28]:
domains_sorted = sorted([x for x in domains_words.items()], key= lambda x: len(x[1]), reverse=True)

In [49]:
#print(domains_sorted[0])

for i in range(300):
    print(i)
    print(domains_sorted[i][0])
    print(len(domains_sorted[i][1]))
    print("\n")
#print(domains_sorted[0][1])

0
https://twitter.com
391174


1
http://fb.me
259164


2
https://fb.me
170180


3
https://youtu.be
130190


4
http://bit.ly
107929


5
https://goo.gl
79515


6
https://www.youtube.com
67582


7
http://ow.ly
57381


8
http://dlvr.it
55932


9
http://youtu.be
49803


10
http://www.breitbart.com
40052


11
http://www.thegatewaypundit.com
37850


12
http://ift.tt
37800


13
http://www.foxnews.com
28708


14
http://dailycaller.com
28564


15
http://beforeitsnews.com
25218


16
http://fxn.ws
23369


17
http://yournewswire.com
22585


18
https://www.instagram.com
21873


19
https://lnkd.in
21699


20
https://shar.es
21024


21
https://truepundit.com
20856


22
https://www.facebook.com
20473


23
http://dailym.ai
20150


24
http://dld.bz
18903


25
http://po.st
18581


26
http://ln.is
18426


27
http://thehill.com
17453


28
https://buff.ly
17179


29
https://www.rt.com
17081


30
http://www.infowars.com
16780


31
https://www.theguardian.com
16631


32
https://gab.ai
16039


33
https://www.ze

https://www.haaretz.com
2920




In [32]:
domains_set = domains_sorted[100:300]
selected_d = select_dimensions(domains_set, 150 , 50)
vecs = insts_to_vecs(domains_set, selected_d)

current instance: http://ijr.com
current instance: https://nyti.ms
current instance: https://constitution.com
current instance: http://abcn.ws
current instance: http://www.cbc.ca
current instance: http://conservativetribune.com
current instance: http://www.bbc.com
current instance: http://madworldnews.com
current instance: http://nyti.ms
current instance: http://Disclose.tv
current instance: https://en-volve.com
current instance: http://m.beforeitsnews.com
current instance: http://WorldTruth.Tv
current instance: https://nypost.com
current instance: https://pamelageller.com
current instance: http://truepundit.com
current instance: https://www.washingtontimes.com
current instance: http://dists.sytes.net
current instance: https://www.huffingtonpost.com
current instance: https://clashdaily.com
current instance: http://NaturalNews.com
current instance: http://politi.co
current instance: http://www.telegraph.co.uk
current instance: https://www.newsmax.com
current instance: http://nabdapp.com

current instance: https://www.haaretz.com
domains above cutoff:
 ['Texas', 'New', 'One', 'GOP', 'Hillary', 'Clinton', 'go', 'Russian', 'Trump', 'Obama', 'Former', 'National', 'Police', 'White', "Trump's", 'President', 'May', 'First', 'House', 'Good', 'make', '#MAGA', 'like', 'many', 'years', 'good', 'help', 'going', '&amp;', 'money', 'take', 'keep', 'York', 'Congress', 'Christmas', 'us', 'got', 'would', 'Get', 'want', 'US', 'States', 'News', 'Democrats', 'FBI', 'Bill', 'Media', 'North', 'Korea', 'People', 'John', 'need', 'think', 'even', 'Americans', 'people', 'last', 'get', 'Black', 'America', 'Government', 'new', 'American', 'one', 'media', 'Twitter', 'Top', 'Donald', 'Republican', 'Senate', 'Secret', 'via', 'Justice', 'said', 'State', 'California', 'Court', 'Time', 'Another', 'show', 'Paul', 'says', 'Great', 'United', 'Trump,', 'Man', '10', '2016', 'Department', 'stop', '@realDonaldTrump', 'Facebook', 'world', 'George', 'Washington', 'Russia', 'Trump’s', 'could', 'back', 'know', 're

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


building vector for user: https://nyti.ms
building vector for user: https://constitution.com
building vector for user: http://abcn.ws
building vector for user: http://www.cbc.ca
building vector for user: http://conservativetribune.com
building vector for user: http://www.bbc.com
building vector for user: http://madworldnews.com
building vector for user: http://nyti.ms
building vector for user: http://Disclose.tv
building vector for user: https://en-volve.com
building vector for user: http://m.beforeitsnews.com
building vector for user: http://WorldTruth.Tv
building vector for user: https://nypost.com
building vector for user: https://pamelageller.com
building vector for user: http://truepundit.com
building vector for user: https://www.washingtontimes.com
building vector for user: http://dists.sytes.net
building vector for user: https://www.huffingtonpost.com
building vector for user: https://clashdaily.com
building vector for user: http://NaturalNews.com
building vector for user: http:

building vector for user: https://www.truthrevolt.org
building vector for user: http://www.nnettle.com
building vector for user: http://cmun.it
building vector for user: http://canadafreepress.com
building vector for user: http://www.blabber.buzz
building vector for user: https://elpais.com
building vector for user: http://eheadlines.com
building vector for user: http://russia-insider.com
building vector for user: http://theantimedia.org
building vector for user: https://www.thestar.com
building vector for user: http://pinterest.com
building vector for user: https://fellowshipoftheminds.com
building vector for user: https://theintercept.com
building vector for user: http://www.independentsentinel.com
building vector for user: http://www.jpost.com
building vector for user: http://nynettle.com
building vector for user: https://thinkprogress.org
building vector for user: http://USFREEDOMARMY.COM
building vector for user: https://www.cnsnews.com
building vector for user: http://aol.it
buil

In [53]:
len(vecs[10])

2705

In [83]:
num_clusters = [2,3,4,10,20]

for k in num_clusters:
    kmeans = KMeans(n_clusters=k, random_state=5).fit(vecs)
    clusters = [[] for _ in range(k)]
    for d, l in zip([x[0] for x in domains_set], kmeans.labels_):
        clusters[l].append(d)
        
    print("\n\n******K =", k, "********")
    print("Size of resulting clusters:\n ")
    for cn, c in enumerate(clusters):
        print("Size of cluster ", cn, ":", len(c))
        
    print("The clusters are: ")
    for cn, c in enumerate(clusters):
        print("\n************cluster ", cn, ":")
        print("\n".join([ x for x in c]))
    print("\n\n", "-"*100)



******K = 2 ********
Size of resulting clusters:
 
Size of cluster  0 : 1
Size of cluster  1 : 199
The clusters are: 

************cluster  0 :
http://jwatch.us

************cluster  1 :
http://ijr.com
https://nyti.ms
https://constitution.com
http://abcn.ws
http://www.cbc.ca
http://conservativetribune.com
http://www.bbc.com
http://madworldnews.com
http://nyti.ms
http://Disclose.tv
https://en-volve.com
http://m.beforeitsnews.com
http://WorldTruth.Tv
https://nypost.com
https://pamelageller.com
http://truepundit.com
https://www.washingtontimes.com
http://dists.sytes.net
https://www.huffingtonpost.com
https://clashdaily.com
http://NaturalNews.com
http://politi.co
http://www.telegraph.co.uk
https://www.newsmax.com
http://nabdapp.com
https://sputniknews.com
http://truthfeed.com
https://www.bloomberg.com
https://www.reddit.com
https://freedomdaily.com
http://video.foxnews.com
http://www.lifenews.com
https://thehornnews.com
https://cards.twitter.com
https://en.wikipedia.org
http://therightsc



******K = 4 ********
Size of resulting clusters:
 
Size of cluster  0 : 197
Size of cluster  1 : 1
Size of cluster  2 : 1
Size of cluster  3 : 1
The clusters are: 

************cluster  0 :
http://ijr.com
https://nyti.ms
https://constitution.com
http://abcn.ws
http://www.cbc.ca
http://conservativetribune.com
http://www.bbc.com
http://madworldnews.com
http://nyti.ms
http://Disclose.tv
https://en-volve.com
http://m.beforeitsnews.com
http://WorldTruth.Tv
https://nypost.com
https://pamelageller.com
http://truepundit.com
https://www.washingtontimes.com
http://dists.sytes.net
https://www.huffingtonpost.com
https://clashdaily.com
http://NaturalNews.com
http://politi.co
http://www.telegraph.co.uk
https://www.newsmax.com
http://nabdapp.com
https://sputniknews.com
http://truthfeed.com
https://www.bloomberg.com
https://www.reddit.com
https://freedomdaily.com
http://video.foxnews.com
http://www.lifenews.com
https://thehornnews.com
https://cards.twitter.com
https://en.wikipedia.org
http://therigh



******K = 20 ********
Size of resulting clusters:
 
Size of cluster  0 : 142
Size of cluster  1 : 1
Size of cluster  2 : 1
Size of cluster  3 : 1
Size of cluster  4 : 1
Size of cluster  5 : 1
Size of cluster  6 : 3
Size of cluster  7 : 4
Size of cluster  8 : 1
Size of cluster  9 : 1
Size of cluster  10 : 2
Size of cluster  11 : 1
Size of cluster  12 : 1
Size of cluster  13 : 1
Size of cluster  14 : 1
Size of cluster  15 : 2
Size of cluster  16 : 1
Size of cluster  17 : 1
Size of cluster  18 : 1
Size of cluster  19 : 33
The clusters are: 

************cluster  0 :
https://nyti.ms
http://abcn.ws
http://www.cbc.ca
http://www.bbc.com
http://nyti.ms
http://Disclose.tv
https://nypost.com
https://www.huffingtonpost.com
http://NaturalNews.com
http://politi.co
http://www.telegraph.co.uk
http://nabdapp.com
https://sputniknews.com
https://www.bloomberg.com
https://www.reddit.com
https://thehornnews.com
https://cards.twitter.com
https://en.wikipedia.org
http://therightscoop.com
http://www.latime

In [54]:
selected_d

['!!',
 '!!!',
 '"A',
 '"I',
 '"The',
 '"Trump',
 '"We',
 '#AmericaFirst',
 '#BuildTheWall',
 '#DrainTheSwamp',
 '#FBI',
 '#FakeNews',
 '#MeToo',
 '#NotABot',
 '#Obama',
 '#ObamaGate',
 '#PJNET',
 '#QAnon',
 '#RedNationRising',
 '#ReleaseTheMemo',
 '#SOTU',
 '#SchumerShutdown',
 '#TCOT',
 '#Trump',
 '#TrumpTrain',
 '#ccot',
 '#feedly',
 '#ma4t',
 '#maga',
 '#pedogate',
 '#pizzagate',
 '#tcot',
 '$1',
 '$100',
 "'I",
 "'The",
 '(and',
 '--',
 '..',
 '...',
 '1,000',
 '100',
 '100%',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '20',
 '200',
 '2008',
 '2013',
 '2014',
 '2015',
 '2017,',
 '2020',
 '21',
 '22',
 '23',
 '24',
 '25',
 '27',
 '2nd',
 '30',
 '3rd',
 '40',
 '45',
 '50',
 '500',
 '60',
 '70',
 '9/11',
 '@BarackObama',
 '@CNN',
 '@DonaldJTrumpJr',
 '@FBI',
 '@FoxNews',
 '@GOP',
 '@HillaryClinton',
 '@NancyPelosi',
 '@POTUS',
 '@PrisonPlanet',
 '@RealJamesWoods',
 '@SpeakerRyan',
 '@TuckerCarlson',
 '@VP',
 '@WhiteHouse',
 '@nytimes',
 '@seanhannity',
 '

In [52]:
domains_sorted_string = []
for dwords in domains_sorted:
    newstr = "_".join(dwords[1])
    domains_sorted_string.append((dwords[0], newstr))

In [57]:
domains_sorted_string[1000]

('http://www.nbcwashington.com',
 '@jtblogs:_ammo_rounds_man_had_|_at_say_trump_nbc4_rt_police_of_hotel,_guns,_washington_90')

In [49]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(domains_sorted[0:20])

ValueError: could not convert string to float: 'https:/'

In [60]:
model = AgglomerativeClustering(n_clusters=2)
model.fit_predict(domains_sorted_string)

MemoryError: 

In [69]:
ex = [['hello', 'world'], ['hello', 'what'], ['cat', 'dog'], ['cat', 'mouse']]
ex2 = [[0, 1, 3, 0], [1, 1, 4, 1], [10, 1, 5, 9], [0, 3, 5, 1] ]
ex3 = [[4], [12], [3], [1] ]
model = AgglomerativeClustering(n_clusters=2)
model.fit_predict(ex)

ValueError: could not convert string to float: 'hello'

In [None]:
import nltk

nltk.download('wordnet')

In [None]:
from nltk.corpus import wordnet as wn

In [None]:
wn.synsets('dog')

In [None]:
wn.synset('dog.n.01').definition()

In [None]:
wn.synset('dog.n.01').examples()

In [None]:
wn.synsets('puppy')

In [None]:
d_syn = wn.synset('president.n.01')
p_syn = wn.synset('government.n.01')
d_syn.wup_similarity(p_syn)

In [21]:
from nltk.tag import StanfordNERTagger

In [22]:
import os
java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
os.environ['JAVAHOME'] = java_path
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz")
stmodel = ("stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz")
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz")
stjar = ("stanford-ner-2017-06-09/stanford-ner.jar")

In [23]:
st = StanfordNERTagger(stmodel, stjar)

In [27]:
%%time
testsentence = "Donald Trump is the President of the U.S. He Won the Elections After Defeating Hillary Clinton. The FBI invetigation May Have Played a Role"
print("\n".join([str(x) for x in st.tag(testsentence.lower().split())]))
#print("\n".join([str(x) for x in st.tag(testsentence.split())]))
print(testsentence.lower())

('donald', 'O')
('trump', 'O')
('is', 'O')
('the', 'O')
('president', 'O')
('of', 'O')
('the', 'O')
('u.s.', 'MISC')
('he', 'O')
('won', 'O')
('the', 'O')
('elections', 'O')
('after', 'O')
('defeating', 'O')
('hillary', 'O')
('clinton.', 'O')
('the', 'O')
('fbi', 'O')
('invetigation', 'O')
('may', 'O')
('have', 'O')
('played', 'O')
('a', 'O')
('role', 'O')
donald trump is the president of the u.s. he won the elections after defeating hillary clinton. the fbi invetigation may have played a role
Wall time: 2.51 s


In [80]:
testsentence = random.choices(population=heavy, k=1)[0]
print(testsentence)

Dan Rather Thinks The Media Should Publicly Shame Trump Supporters https://fb.me/DR74QnRP


In [81]:
%%time
print("\n".join([str(x) for x in st.tag(testsentence.split())]))

('Dan', 'PERSON')
('Rather', 'O')
('Thinks', 'O')
('The', 'O')
('Media', 'ORGANIZATION')
('Should', 'ORGANIZATION')
('Publicly', 'ORGANIZATION')
('Shame', 'ORGANIZATION')
('Trump', 'ORGANIZATION')
('Supporters', 'ORGANIZATION')
('https:fb.meDR74QnRP', 'O')
Wall time: 3.53 s


In [228]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [265]:
print(ne_chunk(pos_tag(word_tokenize(testsentence))))

(S
  ICE/NNP
  Director/NNP
  Wants/NNP
  To/TO
  Lock/VB
  (PERSON Up/NNP California/NNP)
  Gov/NNP
  ./.
  (PERSON Jerry/NNP Brown/NNP)
  http/NN
  :/:
  //usalibertypress.com/2018/01/17/ice-director-wants-lock-california-gov-jerry-brown//JJ)


In [28]:
import spotlight

In [37]:
annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate',
                                'd trump defeated clinton, fbi may have played a role',
                                 confidence=0.7, support=20)

In [38]:
print(annotations)

[{'URI': 'http://dbpedia.org/resource/Trump', 'support': 542, 'types': '', 'surfaceForm': 'trump', 'offset': 2, 'similarityScore': 0.8737726427560173, 'percentageOfSecondRank': 0.1410833206527077}, {'URI': 'http://dbpedia.org/resource/Federal_Bureau_of_Investigation', 'support': 15865, 'types': '', 'surfaceForm': 'fbi', 'offset': 26, 'similarityScore': 0.9999907221834919, 'percentageOfSecondRank': 5.371277967872109e-06}]
