In [1]:
import sklearn
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import random

In [2]:
%%time
suspect_timelines = pickle.load(open("sus_timelines_pt1.p", "rb"))

Wall time: 3min 12s


In [3]:
len(suspect_timelines)

1420

In [4]:
users_to_remove = []
for u in suspect_timelines:
    if len(suspect_timelines[u]) <3:
        users_to_remove.append(u)
        
for u in users_to_remove:
    del suspect_timelines[u]
        
print(len(suspect_timelines))

1394


In [5]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
def get_domain(u):
    endmarker = u[8:].find("/")
    if endmarker == -1:
        return u
    else:
        return u[0:8+endmarker]

def get_links_domains_count_for_user(tweets):
    links = defaultdict(lambda : 0)
    domains = defaultdict(lambda : 0)
    for t in tweets:
        if len(t[5]) == 0:
            continue
        else:
            for urlobj in t[5]:
                #url = urlobj['expanded_url']
                url = urlobj
                domain = get_domain(url)
                links[url] += 1
                domains[domain] += 1
    return(links, domains)


def get_domain_counts_for_users(timelines):
    domains = dict()
    for user, tweets in timelines.items():
        userlinks, userdomains = get_links_domains_count_for_user(tweets)
        domain_counts = [(d, c) for d,c in userdomains.items()]
        domains[user] = domain_counts
    return domains
        


def get_links_domains_for_users(sets_of_tweets):
    domains = defaultdict(lambda: [])
    links = defaultdict(lambda: [])
    for tweets in sets_of_tweets:
        userlinks, userdomains = get_links_domains_count_for_user(tweets)
        for l in userlinks:
            links[l].append(userlinks[l])
        for d in userdomains:
            domains[d].append(userdomains[d])
    return (links, domains)    
    

def get_links_domains_associated_words(sets_of_tweets):
    links = defaultdict(lambda : defaultdict(lambda: 0))
    domains = defaultdict(lambda: defaultdict(lambda: 0))
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                continue
            text = t[4].split()
            text_filtered = []
            for word in text:
                if not word.startswith('http') and not word.lower() in stop and len(word)>1:
                    text_filtered.append(word)
            for urlobj in t[5]:
                #url = urlobj['expanded_url']
                url = urlobj
                for w in text_filtered:
                    links[url][w] +=1
                domain = get_domain(url)
                for w in text_filtered:
                    domains[domain][w] +=1
                
    links_to_word_count = dict()
    for k, d in links.items():
        word_counts = []
        for w,c in d.items():
            word_counts.append((w, c))
        links_to_word_count[k] = word_counts
      
    domains_to_word_count = dict()
    for k, d in domains.items():
        word_counts = []
        for w, c in d.items():
            word_counts.append((w,c))
        domains_to_word_count[k] = word_counts
    
    return links_to_word_count, domains_to_word_count

                

def characterize_links_by_comment(sets_of_tweets):
    url_count = 0
    nourl_count = 0
    uncommented = 0
    commented_light = []
    commented_heavy = []
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                nourl_count += 1
            else:
                url_count +=  1
                text = t[4].split()
                words = [w.strip() for w in text if not w.startswith("http")]
                where_to_append = ""
                if len(words) == 0:
                    uncommented += 1
                    continue
                elif len(words) <= 4:
                    where_to_append = commented_light
                else: #len(words)>4
                    where_to_append =commented_heavy
                reconstrcuted_tweet = " ".join(words)
                for url in t[5]:
                    reconstrcuted_tweet += " " + url
                where_to_append.append(reconstrcuted_tweet)
    totalnumber = nourl_count + url_count
    print("Total number of tweets: ", totalnumber)
    print("Tweets without urls: ", nourl_count, " = ", (nourl_count/totalnumber)*100, "%")
    print("Tweets with urls: ", url_count, " = ", (url_count/totalnumber)*100, "%")
    print("URL tweets containing no comment: ", uncommented, " = ", str((uncommented/url_count)*100), "%")
    print("URL tweets containing 4 words or fewer: ", len(commented_light), " = ", (len(commented_light)/url_count)*100, "%")
    print("URL tweets containing more than 4 words: ", len(commented_heavy), " = ", (len(commented_heavy)/url_count)*100, "%")
    return commented_light, commented_heavy


In [7]:
%%time
links, domains = get_links_domains_for_users(suspect_timelines.values())
print("count of links found: ", len(links))
print("count of domains found: ", len(domains))

count of links found:  1812550
count of domains found:  52566
Wall time: 3min 51s


In [8]:
sorted_links = sorted([x for x in links.items()], key= lambda x: len(x[1]), reverse=True)
for l in sorted_links[0:100]:
    print("\n", l[0], "\nCount of users: ", str(len(l[1])), " -- Total occurances: ", str(sum(l[1])))


 http://USFREEDOMARMY.COM 
Count of users:  232  -- Total occurances:  1894

 https://truepundit.com/exclusive-six-u-s-agencies-conspired-to-illegally-wiretap-trump-british-intel-used-as-front-to-spy-on-campaign-for-nsa/ 
Count of users:  218  -- Total occurances:  369

 http://www.foxnews.com/politics/2018/02/07/more-texts-between-strzok-and-page-uncovered-lead-to-more-questions.html 
Count of users:  212  -- Total occurances:  365

 http://bigliebook.com 
Count of users:  211  -- Total occurances:  335

 http://jwatch.us/3Savgi 
Count of users:  209  -- Total occurances:  346

 http://jwatch.us/8zpFUN 
Count of users:  198  -- Total occurances:  303

 https://twitter.com/comey/status/959498570532577285 
Count of users:  183  -- Total occurances:  297

 http://jwatch.us/BMQ6hK 
Count of users:  183  -- Total occurances:  303

 http://thehill.com/homenews/administration/372861-uranium-one-informant-makes-clinton-allegations-in-testimony 
Count of users:  176  -- Total occurances:  262

In [10]:
sorted_domains = sorted([x for x in domains.items()], key= lambda x: len(x[1]), reverse=True)
for d in sorted_domains[0:100]:
    print(d[0], "\tCount of users: ", str(len(d[1])), " -- Total occurances: ", str(sum(d[1])))

https://twitter.com 	Count of users:  1238  -- Total occurances:  326762
https://youtu.be 	Count of users:  1182  -- Total occurances:  80719
http://bit.ly 	Count of users:  1164  -- Total occurances:  60403
https://www.youtube.com 	Count of users:  1138  -- Total occurances:  32160
https://fb.me 	Count of users:  922  -- Total occurances:  225228
http://www.foxnews.com 	Count of users:  919  -- Total occurances:  15502
http://www.breitbart.com 	Count of users:  911  -- Total occurances:  21731
http://dailycaller.com 	Count of users:  904  -- Total occurances:  17235
http://www.thegatewaypundit.com 	Count of users:  897  -- Total occurances:  35587
https://goo.gl 	Count of users:  891  -- Total occurances:  34498
http://thehill.com 	Count of users:  887  -- Total occurances:  7976
http://fxn.ws 	Count of users:  877  -- Total occurances:  12239
http://ow.ly 	Count of users:  869  -- Total occurances:  25602
https://www.facebook.com 	Count of users:  846  -- Total occurances:  6483
http

In [11]:
#sorted_domains = sorted([x for x in domains.items()], key= lambda x: len(x[1]), reverse=True)
for d in sorted_domains[500:600]:
    print(d[0], "\tCount of users: ", str(len(d[1])), " -- Total occurances: ", str(sum(d[1])))

http://www.americanpatriotdaily.com 	Count of users:  79  -- Total occurances:  637
https://www.prageru.com 	Count of users:  79  -- Total occurances:  173
https://www.libertyheadlines.com 	Count of users:  79  -- Total occurances:  464
http://www.pacificpundit.com 	Count of users:  79  -- Total occurances:  382
https://aim4truth.org 	Count of users:  79  -- Total occurances:  210
http://www.wsj.com 	Count of users:  79  -- Total occurances:  102
https://talkingpointsmemo.com 	Count of users:  78  -- Total occurances:  141
http://ref.gl 	Count of users:  78  -- Total occurances:  206
http://constitution.com 	Count of users:  78  -- Total occurances:  1196
https://assets.documentcloud.org 	Count of users:  78  -- Total occurances:  95
https://af-mg.com 	Count of users:  78  -- Total occurances:  106
http://www.truthrevolt.org 	Count of users:  77  -- Total occurances:  444
https://docs.google.com 	Count of users:  77  -- Total occurances:  112
http://rightwingnews.com 	Count of users:  

In [12]:
%%time
dc = get_domain_counts_for_users(suspect_timelines)

Wall time: 57.8 s


In [13]:
users_sorted_by_domain_count = sorted([x for x in dc.items()], key=lambda x: len(x[1]), reverse=True)

In [14]:
rand_user = users_sorted_by_domain_count[-10]
print("username: ", rand_user[0])
print(len(rand_user[1]), "domains")
print("\n".join([str(x) for x in sorted(rand_user[1], key= lambda x: x[1], reverse=True)]))

username:  JockyBoyes
3 domains
('http://www.palmerreport.com', 2)
('http://abcn.ws', 1)
('https://goo.gl', 1)


In [15]:
rand_user = random.choices(population=[x for x in dc.items()], k=1)[0]
print("username: ", rand_user[0])
print(len(rand_user[1]), "domains")
print("\n".join([str(x) for x in sorted(rand_user[1], key= lambda x: x[1], reverse=True)]))

username:  LucyATorres2
349 domains
('https://www.americasfreedomfighters.com', 248)
('https://ilovemyfreedom.org', 227)
('https://youtu.be', 146)
('http://a.msn.com', 137)
('https://twitter.com', 120)
('http://fxn.ws', 36)
('http://bit.ly', 35)
('http://www.thegatewaypundit.com', 34)
('https://conservativeflag.com', 31)
('http://45.wh.gov', 23)
('http://redstatewatcher.com', 23)
('https://freedom-daily.com', 21)
('https://instagram.com', 20)
('https://veteranaf.com', 18)
('https://www.americanow.com', 17)
('https://freedomdaily.com', 16)
('https://libertywriters.com', 13)
('http://dailycaller.com', 11)
('http://www.rightwingtribune.com', 11)
('http://go.shr.lc', 10)
('http://thehill.com', 10)
('https://topalertnews.com', 10)
('https://conservativetribune.com', 9)
('http://www.washingtonexaminer.com', 9)
('https://patriotbeat.com', 9)
('https://www.whitehouse.gov', 9)
('https://daggernews.com', 9)
('https://www.rushlimbaugh.com', 8)
('https://mediaconservative.com', 8)
('http://www.bre

In [16]:
%%time
light, heavy = characterize_links_by_comment(suspect_timelines.values())

Total number of tweets:  3655718
Tweets without urls:  1290351  =  35.296787115417544 %
Tweets with urls:  2365367  =  64.70321288458246 %
URL tweets containing no comment:  324575  =  13.721972108345131 %
URL tweets containing 4 words or fewer:  174253  =  7.36684835799265 %
URL tweets containing more than 4 words:  1866539  =  78.91117953366222 %
Wall time: 2min 41s


In [17]:
print("\n\n".join(random.choices(population=light, k=15)))

Shooting Blanks  #shooting http://ow.ly/fUEX30hkNV9

🗣Amen🙏 https://twitter.com/Mildred06963744/status/953057738602532864

London https://twitter.com/RNcat50/status/950504954191519744

Señales de los Tiempos http://ow.ly/MMQv30hvyCr

Why Trump Matters: http://eepurl.com/bqD9tj

アメブロを更新しました。 『美しさがUPする！過去生アチューンメント♪』 #アチューンメント http://ameblo.jp/crystalchildrensomehow/entry-11755508373.html

Useful idiots. http://fb.me/4dbWIvaZg

Sad association! https://twitter.com/JackPosobiec/status/950554093197713408

Big Division Coming https://gitardood.wordpress.com/2017/12/17/big-division-coming/

Time Ragazine too. https://twitter.com/iannjohnsonnn/status/934674598196543488

Fact or fiction...? http://fb.me/1DMelyCjv

Pharma manager roles http://www.occuworld.org/news/3953602

#ReleaseTheMemo https://twitter.com/MOVEFORWARDHUGE/status/955179806924070912

For you @FukushimaExposd https://www.youtube.com/watch?v=xDJO9-tZEpw

Heartbreaking. https://twitter.com/Relationshllp/status/842339500982394882


In [18]:
print("\n\n".join(random.choices(population=heavy, k=15)))

Judicial Watch released 78 pages of new Hillary Clinton docs from the State Dept containing emails that include even more classified info &amp; that show Clinton had knowledge about the serious security problems w/ her unsecured non-State(dot)gov email system. http://jwatch.us/3Savgi

Well bless their little communist soulless fake news hearts, the Media is going down next, and hey NOBODY believes you anymore!! http://www.breitbart.com/big-government/2018/02/03/16-nunes-memo-bombshells-media-not-want-know/

Yet another conspiracy theory is revealed as fact. http://www.zerohedge.com/news/2017-11-01/twitter-admits-it-buried-leaked-clinton-email-tweets-last-two-months-campaign

#WomensMarch2018 HAH! TRUMP Trolls Women's Marchers - "Get Out There and Celebrate - Lowest Female Unemployment in 18 Years!" http://www.thegatewaypundit.com/2018/01/omg-trump-trolls-womens-marchers-get-celebrate-lowest-female-unemployment-18-years/

Retweeted Astrid (@annastolichnaya): Remember kids, it’s only rap

In [23]:
def select_dimensions(users, max_cutoff, min_cutoff):
    domains = defaultdict(lambda: 0)
    for ddict in users:
        current_user = ddict[0]
        print("current instance:", current_user)
        for (d, c) in ddict[1]:
            domains[d] +=1
    print("\nfeatures above cutoff:\n", len([d for d in domains if domains[d]>max_cutoff]))
    print("features below cutoff:\n",  len([d for d in domains if domains[d]<min_cutoff]), "\n\n")
    selected_domains = sorted([x for x in domains if domains[x]>=min_cutoff and domains[x]<= max_cutoff])
    return selected_domains


def insts_to_vecs(insts, dimensions):
    vecs = []
    for u, udomains in insts:
        print("building vector for instance:", u)
        vec = []
        for d in dimensions:
            ud = [x for x in udomains if x[0]==d]
            if ud == []:
                vec.append(0)
            else:
                assert len(ud) == 1
                vec.append(ud[0][1])
        
        vecs.append(vec)
    return vecs        
        

In [24]:
users_set = users_sorted_by_domain_count[200:300]
selected_d = select_dimensions(users_set, 80, 10)
vecs = insts_to_vecs(users_set, selected_d)

current instance: chp3871
current instance: pamelava10
current instance: TruthnotFM
current instance: Shasha44373856
current instance: rafawkes
current instance: kevensets
current instance: NanalovesAub
current instance: BrendaAnders2
current instance: KuhnKat1
current instance: 1humanagenda
current instance: alvejz_59
current instance: Traveldealguy
current instance: EtsiSusi
current instance: RajivMessage
current instance: Patriottechsan
current instance: uneedtoknow1
current instance: soft63389
current instance: AlanVernon3
current instance: PALucier
current instance: catoletters
current instance: whatzmatteryou
current instance: CFT1
current instance: RAndrewOhge
current instance: rugburndaddy
current instance: HeidiKole
current instance: cosemote333
current instance: yorkieartist
current instance: Kombinatke
current instance: SIMikeSr
current instance: BreacherAlec1
current instance: Brialalexi
current instance: slouise2004
current instance: statepsyops
current instance: vaxen_var

In [25]:
#length of the feature vector
veclen = len(vecs[0])
print("length of features vector: ", veclen)
for v in vecs:
    assert len(v) == veclen

length of features vector:  505


In [26]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(vecs)

In [27]:
c0 = c1 = []
for u, l in zip([x[0] for x in users_set], kmeans.labels_):
    if l == 0:
        c0.append(u)
    elif l == 1:
        c1.append(u)
    else:
        print("Unknown value for cluster: ", l)

print("Size of cluster 0: ", len(c0))
print("Size of cluster 1: ", len(c1))
print("\n*********************\nCluster O:")
print("\n".join(["http://www.twitter.com/" + x for x in c0]))
print("\n*********************\nCluster 1:")
print("\n".join(["http://www.twitter.com/" + x for x in c1]))

Size of cluster 0:  100
Size of cluster 1:  100

*********************
Cluster O:
http://www.twitter.com/chp3871
http://www.twitter.com/pamelava10
http://www.twitter.com/TruthnotFM
http://www.twitter.com/Shasha44373856
http://www.twitter.com/rafawkes
http://www.twitter.com/kevensets
http://www.twitter.com/NanalovesAub
http://www.twitter.com/BrendaAnders2
http://www.twitter.com/KuhnKat1
http://www.twitter.com/1humanagenda
http://www.twitter.com/alvejz_59
http://www.twitter.com/Traveldealguy
http://www.twitter.com/EtsiSusi
http://www.twitter.com/RajivMessage
http://www.twitter.com/Patriottechsan
http://www.twitter.com/uneedtoknow1
http://www.twitter.com/soft63389
http://www.twitter.com/AlanVernon3
http://www.twitter.com/PALucier
http://www.twitter.com/catoletters
http://www.twitter.com/whatzmatteryou
http://www.twitter.com/CFT1
http://www.twitter.com/RAndrewOhge
http://www.twitter.com/rugburndaddy
http://www.twitter.com/HeidiKole
http://www.twitter.com/cosemote333
http://www.twitter.com/

In [28]:
#looking at the features: the dimensions that survived the max and min cutoffs
print("\n".join([x for x in selected_d]))

http://100percentfedup.com
http://45.wh.gov
http://DCWhispers.com
http://NaturalNews.com
http://Newsmax.com
http://Support45.com
http://USFREEDOMARMY.COM
http://a.msn.com
http://abc13.com
http://abc7.com
http://abcn.ws
http://abcnews.go.com
http://aclj.us
http://agendaofevil.com
http://aje.io
http://allnewspipeline.com
http://americafans.com
http://americanlookout.com
http://amp.dailycaller.com
http://amp.washingtontimes.com
http://amzn.to
http://aol.it
http://apne.ws
http://archive.is
http://barenakedislam.com
http://bb4sp.com
http://bbc.in
http://beforeitsnews.com
http://bigliebook.com
http://breaking911.com
http://buff.ly
http://canadafreepress.com
http://cbsloc.al
http://cbsn.ws
http://cnb.cx
http://cnn.it
http://cnnmon.ie
http://conservative101.com
http://conservativefighters.com
http://conservativetribune.com
http://crwd.fr
http://dailysign.al
http://dailysignal.com
http://deadline.com
http://dennismichaellynch.com
http://diamondandsilk.breakinginfoalert.com
http://diamondandsilk

In [31]:
%%time
links_words, domains_words = get_links_domains_associated_words(suspect_timelines.values())

Wall time: 7min 21s


In [32]:
sorted(domains_words['http://www.breitbart.com'], key=lambda x: x[1], reverse=True)

[('@BreitbartNews', 2672),
 ('Breitbart', 2591),
 ('Trump', 2533),
 ('via', 2008),
 ('#MAGA', 1011),
 ('&amp;', 983),
 ('#AAG', 731),
 ('FBI', 689),
 ('Democrats', 634),
 ('#feedly', 632),
 ('Illegal', 586),
 ('@realDonaldTrump', 563),
 ('#Patriot', 563),
 ('Donald', 555),
 ('#news', 550),
 ('President', 538),
 ('Obama', 530),
 ('U.S.', 479),
 ('State', 465),
 ('DACA', 464),
 ('Americans', 431),
 ('Clinton', 416),
 ('GOP', 389),
 ('House', 387),
 ('New', 384),
 ('Memo', 376),
 ('Amnesty', 375),
 ('American', 359),
 ('Report:', 337),
 ('Hillary', 328),
 ('like', 324),
 ('White', 322),
 ('John', 321),
 ('News', 320),
 ('Dossier', 311),
 ('Roy', 308),
 ('#ma4t', 308),
 ('Immigration', 303),
 ('Moore', 301),
 ('Trump’s', 298),
 ('Media', 294),
 ('America', 289),
 ("Trump's", 281),
 ('FISA', 269),
 ('Says', 268),
 ('Border', 263),
 ('Aliens', 261),
 ('One', 254),
 ('Bill', 253),
 ('Democrat', 241),
 ('Soros', 237),
 ('get', 235),
 ('Trump:', 229),
 ('Adam', 228),
 ('Congress', 220),
 ('ille

In [33]:
sorted(domains_words['http://fb.me' ], key=lambda x: x[1], reverse=True)

[('Trump', 6648),
 ('new', 5294),
 ('Facebook', 4296),
 ('de', 4050),
 ('posted', 3980),
 ('&amp;', 3395),
 ('photo', 3098),
 ('one', 2671),
 ('like', 2551),
 ('people', 2528),
 ('New', 2412),
 ('la', 2392),
 ('Obama', 2345),
 ('Retweeted', 2241),
 ('get', 2154),
 ('know', 2008),
 ('President', 1978),
 ('the...', 1971),
 ('Hillary', 1951),
 ('Clinton', 1919),
 ('US', 1878),
 ('video', 1787),
 ('would', 1727),
 ('en', 1629),
 ('love', 1589),
 ('time', 1585),
 ('see', 1468),
 ('One', 1466),
 ('need', 1421),
 ('News', 1377),
 ('going', 1339),
 ('want', 1335),
 ('think', 1328),
 ('2017', 1317),
 ("I'm", 1305),
 ('White', 1257),
 ('day', 1253),
 ('good', 1216),
 ('Time', 1206),
 ('us', 1183),
 ('go', 1167),
 ('and...', 1164),
 ('great', 1156),
 ('Black', 1156),
 ('World', 1143),
 ('make', 1128),
 ('yang', 1125),
 ('American', 1124),
 ('Donald', 1122),
 ('...', 1115),
 ('Twitter', 1111),
 ('People', 1108),
 ('Muslim', 1084),
 ('many', 1083),
 ('BREAKING:', 1082),
 ('say', 1080),
 ('Please', 

In [34]:
sorted(domains_words['https://youtu.be' ], key=lambda x: x[1], reverse=True)

[('via', 52302),
 ('@YouTube', 51866),
 ('&amp;', 5521),
 ('Trump', 4880),
 ('...', 2261),
 ('FBI', 1911),
 ('Clinton', 1804),
 ('New', 1653),
 ('President', 1576),
 ('Hillary', 1568),
 ('Obama', 1560),
 ('@realDonaldTrump', 1196),
 ('2017', 1166),
 ('Watch', 1082),
 ('BREAKING:', 1049),
 ('News', 1027),
 ('State', 1006),
 ('US', 966),
 ('TRUMP', 877),
 ('video', 870),
 ('FISA', 819),
 ('Donald', 813),
 ('#MarkRobbinsNetwork', 798),
 ('2018', 797),
 ('Day', 763),
 ('Video', 755),
 ('World', 755),
 ('#MAGA', 746),
 ('#QAnon', 739),
 ('Deep', 721),
 ('America', 716),
 ('CIA', 711),
 ('Memo', 683),
 ('@POTUS', 680),
 ('Twitter', 669),
 ('Secret', 658),
 ('House', 643),
 ('BREAKING', 639),
 ('Full', 638),
 ('One', 633),
 ('Dr.', 626),
 ('White', 613),
 ('Truth', 597),
 ('John', 584),
 ('American', 582),
 ('one', 582),
 ('NEW', 581),
 ('Live', 579),
 ('people', 572),
 ('#ReleaseTheMemo', 562),
 ('CNN', 546),
 ('Mueller', 538),
 ('like', 528),
 ('de', 528),
 ('David', 525),
 ('Media', 518),


In [35]:
sorted(domains_words['http://worldtruth.tv' ], key=lambda x: x[1], reverse=True)

[('10', 12),
 ('Cancer', 12),
 ('Truth', 7),
 ('Causes', 7),
 ('&amp;', 7),
 ('Mainstream', 6),
 ('Common', 6),
 ('Headache', 6),
 ('Types', 6),
 ('Surprising', 6),
 ('Found', 6),
 ('Karma', 5),
 ('Hear', 5),
 ('Version', 5),
 ('First', 5),
 ('World', 5),
 ('American', 5),
 ('Big', 5),
 ('Foods', 5),
 ('Rothschild', 5),
 ('Investigation', 4),
 ('Product', 4),
 ('Kills', 4),
 ('One', 4),
 ('Get', 4),
 ('Boy', 4),
 ('Pharma', 4),
 ('Water', 4),
 ('Drugs', 4),
 ('Viruses,', 3),
 ('Fungal', 3),
 ('Infections,', 3),
 ('Bacteria', 3),
 ('Work', 3),
 ('Ice', 3),
 ('Human', 3),
 ('FDA', 3),
 ('Baking', 3),
 ('Soda', 3),
 ('Study', 3),
 ('Humans', 3),
 ('Toxins', 3),
 ('Cells', 3),
 ('Discovered', 3),
 ('#Autism', 3),
 ('Russia', 3),
 ('Whole', 3),
 ('History', 3),
 ('Satanic', 3),
 ('Really', 3),
 ('Death', 3),
 ('Obama', 2),
 ('Multimillion', 2),
 ('Dollar', 2),
 ('White', 2),
 ('House', 2),
 ('Parties', 2),
 ("Here's", 2),
 ('Much', 2),
 ('Couple', 2),
 ('Gets', 2),
 ('Benefits', 2),
 ('Weig

In [36]:
domains_sorted = sorted([x for x in domains_words.items()], key= lambda x: len(x[1]), reverse=True)

In [37]:
#print(domains_sorted[0])

for i in range(300):
    print(i)
    print(domains_sorted[i][0])
    print("number of words in tweets that shared this domain: ", len(domains_sorted[i][1]))
    print("\n")
#print(domains_sorted[0][1])

0
https://twitter.com
number of words in tweets that shared this domain:  391174


1
http://fb.me
number of words in tweets that shared this domain:  259164


2
https://fb.me
number of words in tweets that shared this domain:  170180


3
https://youtu.be
number of words in tweets that shared this domain:  130190


4
http://bit.ly
number of words in tweets that shared this domain:  107929


5
https://goo.gl
number of words in tweets that shared this domain:  79515


6
https://www.youtube.com
number of words in tweets that shared this domain:  67582


7
http://ow.ly
number of words in tweets that shared this domain:  57381


8
http://dlvr.it
number of words in tweets that shared this domain:  55932


9
http://youtu.be
number of words in tweets that shared this domain:  49803


10
http://www.breitbart.com
number of words in tweets that shared this domain:  40052


11
http://www.thegatewaypundit.com
number of words in tweets that shared this domain:  37850


12
http://ift.tt
number of word

number of words in tweets that shared this domain:  3369


262
http://snpy.tv
number of words in tweets that shared this domain:  3369


263
https://www.timesofisrael.com
number of words in tweets that shared this domain:  3369


264
http://www.gopusa.com
number of words in tweets that shared this domain:  3364


265
https://www.themaven.net
number of words in tweets that shared this domain:  3355


266
http://americanlookout.com
number of words in tweets that shared this domain:  3323


267
https://www.hotmomsclub.com
number of words in tweets that shared this domain:  3306


268
https://www.truthrevolt.org
number of words in tweets that shared this domain:  3301


269
http://www.nnettle.com
number of words in tweets that shared this domain:  3285


270
http://cmun.it
number of words in tweets that shared this domain:  3283


271
http://canadafreepress.com
number of words in tweets that shared this domain:  3268


272
http://www.blabber.buzz
number of words in tweets that shared this 

In [38]:
domains_set = domains_sorted[100:300]
selected_d = select_dimensions(domains_set, 160 , 40)
vecs = insts_to_vecs(domains_set, selected_d)

current instance: http://ijr.com
current instance: https://nyti.ms
current instance: https://constitution.com
current instance: http://abcn.ws
current instance: http://www.cbc.ca
current instance: http://conservativetribune.com
current instance: http://www.bbc.com
current instance: http://madworldnews.com
current instance: http://nyti.ms
current instance: http://Disclose.tv
current instance: https://en-volve.com
current instance: http://m.beforeitsnews.com
current instance: http://WorldTruth.Tv
current instance: https://nypost.com
current instance: https://pamelageller.com
current instance: http://truepundit.com
current instance: https://www.washingtontimes.com
current instance: http://dists.sytes.net
current instance: https://www.huffingtonpost.com
current instance: https://clashdaily.com
current instance: http://NaturalNews.com
current instance: http://politi.co
current instance: http://www.telegraph.co.uk
current instance: https://www.newsmax.com
current instance: http://nabdapp.com

building vector for instance: https://constitution.com
building vector for instance: http://abcn.ws
building vector for instance: http://www.cbc.ca
building vector for instance: http://conservativetribune.com
building vector for instance: http://www.bbc.com
building vector for instance: http://madworldnews.com
building vector for instance: http://nyti.ms
building vector for instance: http://Disclose.tv
building vector for instance: https://en-volve.com
building vector for instance: http://m.beforeitsnews.com
building vector for instance: http://WorldTruth.Tv
building vector for instance: https://nypost.com
building vector for instance: https://pamelageller.com
building vector for instance: http://truepundit.com
building vector for instance: https://www.washingtontimes.com
building vector for instance: http://dists.sytes.net
building vector for instance: https://www.huffingtonpost.com
building vector for instance: https://clashdaily.com
building vector for instance: http://NaturalNews.c

building vector for instance: https://www.frontpagemag.com
building vector for instance: https://worldpoliticus.com
building vector for instance: http://abcnews.go.com
building vector for instance: http://thebea.st
building vector for instance: https://interc.pt
building vector for instance: http://snpy.tv
building vector for instance: https://www.timesofisrael.com
building vector for instance: http://www.gopusa.com
building vector for instance: https://www.themaven.net
building vector for instance: http://americanlookout.com
building vector for instance: https://www.hotmomsclub.com
building vector for instance: https://www.truthrevolt.org
building vector for instance: http://www.nnettle.com
building vector for instance: http://cmun.it
building vector for instance: http://canadafreepress.com
building vector for instance: http://www.blabber.buzz
building vector for instance: https://elpais.com
building vector for instance: http://eheadlines.com
building vector for instance: http://russi

In [39]:
print("length of feature vector: ", len(vecs[0]))
for v in vecs:
    assert len(v) == len(vecs[0])

length of feature vector:  3793


In [40]:
num_clusters = [2,3,4,10,20]

for k in num_clusters:
    kmeans = KMeans(n_clusters=k, random_state=5).fit(vecs)
    clusters = [[] for _ in range(k)]
    for d, l in zip([x[0] for x in domains_set], kmeans.labels_):
        clusters[l].append(d)
        
    print("\n\n******K =", k, "********")
    print("Size of resulting clusters:\n ")
    for cn, c in enumerate(clusters):
        print("Size of cluster ", cn, ":", len(c))
        
    print("The clusters are: ")
    for cn, c in enumerate(clusters):
        print("\n************cluster ", cn, ":")
        print("\n".join([ x for x in c]))
    print("\n\n", "-"*100)



******K = 2 ********
Size of resulting clusters:
 
Size of cluster  0 : 1
Size of cluster  1 : 199
The clusters are: 

************cluster  0 :
http://jwatch.us

************cluster  1 :
http://ijr.com
https://nyti.ms
https://constitution.com
http://abcn.ws
http://www.cbc.ca
http://conservativetribune.com
http://www.bbc.com
http://madworldnews.com
http://nyti.ms
http://Disclose.tv
https://en-volve.com
http://m.beforeitsnews.com
http://WorldTruth.Tv
https://nypost.com
https://pamelageller.com
http://truepundit.com
https://www.washingtontimes.com
http://dists.sytes.net
https://www.huffingtonpost.com
https://clashdaily.com
http://NaturalNews.com
http://politi.co
http://www.telegraph.co.uk
https://www.newsmax.com
http://nabdapp.com
https://sputniknews.com
http://truthfeed.com
https://www.bloomberg.com
https://www.reddit.com
https://freedomdaily.com
http://video.foxnews.com
http://www.lifenews.com
https://thehornnews.com
https://cards.twitter.com
https://en.wikipedia.org
http://therightsc



******K = 4 ********
Size of resulting clusters:
 
Size of cluster  0 : 178
Size of cluster  1 : 1
Size of cluster  2 : 1
Size of cluster  3 : 20
The clusters are: 

************cluster  0 :
https://nyti.ms
http://abcn.ws
http://www.cbc.ca
http://www.bbc.com
http://nyti.ms
http://Disclose.tv
http://m.beforeitsnews.com
https://nypost.com
https://pamelageller.com
https://www.washingtontimes.com
https://www.huffingtonpost.com
http://NaturalNews.com
http://politi.co
http://www.telegraph.co.uk
https://www.newsmax.com
http://nabdapp.com
https://sputniknews.com
http://truthfeed.com
https://www.bloomberg.com
https://www.reddit.com
http://video.foxnews.com
https://thehornnews.com
https://cards.twitter.com
https://en.wikipedia.org
http://therightscoop.com
http://www.latimes.com
https://www.nbcnews.com
https://www.rawstory.com
https://www.jihadwatch.org
http://www.businessinsider.com
http://dennismichaellynch.com
http://www.westernjournalism.com
http://washex.am
https://www.forbes.com
https://o



******K = 20 ********
Size of resulting clusters:
 
Size of cluster  0 : 7
Size of cluster  1 : 1
Size of cluster  2 : 16
Size of cluster  3 : 1
Size of cluster  4 : 4
Size of cluster  5 : 1
Size of cluster  6 : 1
Size of cluster  7 : 1
Size of cluster  8 : 1
Size of cluster  9 : 2
Size of cluster  10 : 1
Size of cluster  11 : 1
Size of cluster  12 : 1
Size of cluster  13 : 1
Size of cluster  14 : 1
Size of cluster  15 : 1
Size of cluster  16 : 152
Size of cluster  17 : 2
Size of cluster  18 : 1
Size of cluster  19 : 4
The clusters are: 

************cluster  0 :
http://politi.co
https://www.rawstory.com
http://washex.am
http://hill.cm
https://demu.gr
https://www.democraticunderground.com
https://shareblue.com

************cluster  1 :
http://jwatch.us

************cluster  2 :
http://ijr.com
https://constitution.com
http://conservativetribune.com
http://madworldnews.com
https://en-volve.com
http://truepundit.com
http://dists.sytes.net
http://truthfeed.com
https://freedomdaily.com
ht

In [41]:
#the features vector
selected_d

['!!',
 '!!!',
 '"A',
 '"I',
 '"It',
 '"The',
 '"This',
 '"Trump',
 '"We',
 '"a',
 '"the',
 '#1',
 '#AmericaFirst',
 '#BuildTheWall',
 '#CNN',
 '#Clinton',
 '#DACA',
 '#DeepState',
 '#Democrats',
 '#DrainTheSwamp',
 '#FBI',
 '#FakeNews',
 '#FollowTheWhiteRabbit',
 '#GOP',
 '#GreatAwakening',
 '#Hillary',
 '#ISIS',
 '#MAGA',
 '#MSM',
 '#MeToo',
 '#NoDACA',
 '#NotABot',
 '#Obama',
 '#ObamaGate',
 '#PJNET',
 '#PresidentTrump',
 '#QAnon',
 '#Qanon',
 '#RedNationRising',
 '#ReleaseTheMemo',
 '#Russia',
 '#SOTU',
 '#SchumerShutdown',
 '#TCOT',
 '#TheStorm',
 '#ThursdayThoughts',
 '#Trump',
 '#TrumpTrain',
 '#USA',
 '#UraniumOne',
 '#WednesdayWisdom',
 '#ccot',
 '#feedly',
 '#ma4t',
 '#maga',
 '#news',
 '#pedogate',
 '#pizzagate',
 '#tcot',
 '$1',
 '$1,000',
 '$100',
 '$20',
 "'I",
 "'The",
 "'We",
 '(VIDEO)',
 '(and',
 '--',
 '..',
 '...',
 '....',
 '1,000',
 '1.',
 '100',
 '100%',
 '11',
 '12',
 '13',
 '14',
 '15',
 '150',
 '16',
 '17',
 '18',
 '19',
 '1st',
 '20',
 '200',
 '200,000',
 '200

In [None]:
domains_sorted_string = []
for dwords in domains_sorted:
    newstr = "_".join(dwords[1])
    domains_sorted_string.append((dwords[0], newstr))

In [None]:
domains_sorted_string[1000]

In [None]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(domains_sorted[0:20])

In [None]:
model = AgglomerativeClustering(n_clusters=2)
model.fit_predict(domains_sorted_string)

In [None]:
ex = [['hello', 'world'], ['hello', 'what'], ['cat', 'dog'], ['cat', 'mouse']]
ex2 = [[0, 1, 3, 0], [1, 1, 4, 1], [10, 1, 5, 9], [0, 3, 5, 1] ]
ex3 = [[4], [12], [3], [1] ]
model = AgglomerativeClustering(n_clusters=2)
model.fit_predict(ex)

In [42]:
import nltk

nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\Anas
[nltk_data]     Elghafari\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [43]:
from nltk.corpus import wordnet as wn

In [44]:
wn.synsets('dog')

[Synset('dog.n.01'),
 Synset('frump.n.01'),
 Synset('dog.n.03'),
 Synset('cad.n.01'),
 Synset('frank.n.02'),
 Synset('pawl.n.01'),
 Synset('andiron.n.01'),
 Synset('chase.v.01')]

In [45]:
wn.synset('dog.n.01').definition()

'a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds'

In [46]:
wn.synset('dog.n.01').examples()

['the dog barked all night']

In [47]:
wn.synsets('puppy')

[Synset('puppy.n.01'), Synset('puppy.n.02')]

In [48]:
d_syn = wn.synset('president.n.01')
p_syn = wn.synset('government.n.01')
d_syn.wup_similarity(p_syn)

0.11764705882352941

In [49]:
from nltk.tag import StanfordNERTagger

In [50]:
import os
java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
os.environ['JAVAHOME'] = java_path
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz")
stmodel = ("stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz")
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz")
stjar = ("stanford-ner-2017-06-09/stanford-ner.jar")

In [51]:
st = StanfordNERTagger(stmodel, stjar)

In [62]:
%%time
testsentence = "Donald Trump is the President of the U.S. He Won the Elections After Defeating Hillary Clinton. Russia May Have Played a Role"
print("\n".join([str(x) for x in st.tag(testsentence.lower().split())]))
print("\n".join([str(x) for x in st.tag(testsentence.split())]))
#print(testsentence.lower())

('donald', 'O')
('trump', 'O')
('is', 'O')
('the', 'O')
('president', 'O')
('of', 'O')
('the', 'O')
('u.s.', 'MISC')
('he', 'O')
('won', 'O')
('the', 'O')
('elections', 'O')
('after', 'O')
('defeating', 'O')
('hillary', 'O')
('clinton.', 'O')
('russia', 'O')
('may', 'O')
('have', 'O')
('played', 'O')
('a', 'O')
('role', 'O')
('Donald', 'PERSON')
('Trump', 'PERSON')
('is', 'O')
('the', 'O')
('President', 'O')
('of', 'O')
('the', 'O')
('U.S.', 'LOCATION')
('He', 'O')
('Won', 'O')
('the', 'O')
('Elections', 'O')
('After', 'O')
('Defeating', 'O')
('Hillary', 'PERSON')
('Clinton.', 'O')
('Russia', 'LOCATION')
('May', 'O')
('Have', 'O')
('Played', 'O')
('a', 'O')
('Role', 'O')
Wall time: 5.8 s


In [53]:
testsentence = random.choices(population=heavy, k=1)[0]
print(testsentence)

What did 1969 astronauts hear on dark side of moon? via @worldnetdaily http://po.st/ZKGQKw


In [54]:
%%time
print("\n".join([str(x) for x in st.tag(testsentence.split())]))

('What', 'O')
('did', 'O')
('1969', 'O')
('astronauts', 'O')
('hear', 'O')
('on', 'O')
('dark', 'O')
('side', 'O')
('of', 'O')
('moon?', 'O')
('via', 'O')
('@worldnetdaily', 'O')
('http:po.stZKGQKw', 'O')
Wall time: 3.08 s


In [55]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [56]:
print(ne_chunk(pos_tag(word_tokenize(testsentence))))

(S
  What/WP
  did/VBD
  1969/CD
  astronauts/NNS
  hear/VBP
  on/IN
  dark/JJ
  side/NN
  of/IN
  moon/NN
  ?/.
  via/IN
  @/NNP
  worldnetdaily/RB
  http/VBZ
  :/:
  //po.st/ZKGQKw/NN)


In [63]:
import spotlight

In [64]:
annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate',
                                'trump defeated clinton.',
                                 confidence=0.7, support=20)

SpotlightException: No Resources found in spotlight response: {'@text': 'trump defeated clinton, russia may have interefered', '@confidence': '0.7', '@support': '20', '@types': '', '@sparql': '', '@policy': 'whitelist'}

In [None]:
print(annotations)