In [1]:
import sklearn
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import random
import time
import numpy as np
import sys

In [2]:
%%time
# Load the pickled timelines. Later cells treat the result as a mapping from a
# user to that user's collection of tweets.
# The context manager closes the file handle, which the bare open() call leaked.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open("sus_timelines_pt1.p", "rb") as timelines_file:
    suspect_timelines = pickle.load(timelines_file)

Wall time: 24.8 s


In [3]:
len(suspect_timelines)

1420

In [4]:
# Drop every user whose timeline holds fewer than 3 tweets.
# The keys are collected first because a dict must not change size while it is
# being iterated.
users_to_remove = [
    user for user, timeline in suspect_timelines.items() if len(timeline) < 3
]

for user in users_to_remove:
    suspect_timelines.pop(user)

print(len(suspect_timelines))

1394


In [5]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
def get_domain(u):
    """Return the scheme-and-host prefix of URL `u` (everything before the path).

    The original implementation skipped a fixed 8 characters before looking for
    the first path slash, which is only right for an "https://"-length prefix:
    short hosts behind other schemes (``ftp://a/b/c``) and scheme-less URLs
    (``a.com/b/c/d``) were cut at the wrong slash. The scheme separator is now
    located explicitly.

    Args:
        u: URL string, with or without a ``scheme://`` prefix.

    Returns:
        `u` truncated just before the first "/" that follows the host, or `u`
        unchanged when there is no such slash. Case is preserved.
    """
    scheme_end = u.find("://")
    # Only treat "://" as the scheme separator when it contains the very first
    # slash of the string; otherwise it belongs to the path or query of a
    # scheme-less URL.
    if scheme_end != -1 and u.find("/") == scheme_end + 1:
        host_start = scheme_end + 3
    else:
        host_start = 0
    path_start = u.find("/", host_start)
    if path_start == -1:
        return u
    return u[:path_start]
    

def get_domain_drop_protocol(u):
    """Return the lowercased host of URL `u` without scheme and leading "www" label.

    Fixes over the original:
      * a URL without a scheme no longer loses its first two characters
        (the original computed ``find(":") + 3`` even when ":" was absent);
      * the "www" label is stripped from the host only, so a "." that appears
        in the path can no longer move the start marker past the host;
      * the label is kept when stripping it would leave a single bare label
        (e.g. ``wwwhatever.com`` no longer collapses to ``com``).

    Args:
        u: URL string, with or without a ``scheme://`` prefix.

    Returns:
        The host part in lowercase, e.g. ``"washingtonpost.com"`` for
        ``"https://www.washingtonpost.com/graphics/"``.
    """
    u = u.lower()
    scheme_end = u.find("://")
    # "://" only counts as the scheme separator when it holds the first slash.
    if scheme_end != -1 and u.find("/") == scheme_end + 1:
        host_start = scheme_end + 3
    else:
        host_start = 0
    path_start = u.find("/", host_start)
    host = u[host_start:] if path_start == -1 else u[host_start:path_start]

    if host.startswith("www"):
        remainder = host[host.find(".") + 1:] if "." in host else ""
        # Strip the first label only if a dotted name is left afterwards.
        if "." in remainder:
            host = remainder
    return host
    
    
    

def get_links_domains_count_for_user(tweets):
    """Count how often one collection of tweets cites each URL and each domain.

    Every tweet is indexed positionally; element 5 is iterated as that tweet's
    collection of URL strings.

    Returns:
        A ``(links, domains)`` pair of defaultdicts mapping URL -> number of
        occurrences and ``get_domain(url)`` -> number of occurrences.
    """
    url_counts = defaultdict(int)
    domain_counts = defaultdict(int)
    for tweet in tweets:
        if len(tweet[5]) > 0:
            for link in tweet[5]:
                site = get_domain(link)
                url_counts[link] += 1
                domain_counts[site] += 1
    return (url_counts, domain_counts)


def get_domain_counts_for_users(timelines):
    """Map each user to a list of ``(domain, count)`` tuples for their tweets.

    ``timelines`` is a mapping of user -> tweets; the per-user domain counts
    come from ``get_links_domains_count_for_user`` (its link counts are unused).
    """
    return {
        user: list(get_links_domains_count_for_user(tweets)[1].items())
        for user, tweets in timelines.items()
    }
        


def get_links_domains_for_users(sets_of_tweets):
    """Gather, per URL and per domain, the counts contributed by each tweet set.

    Each element of ``sets_of_tweets`` is passed to
    ``get_links_domains_count_for_user``; a URL or domain that occurs in a set
    gets that set's count appended to its list.

    Returns:
        ``(links, domains)``: defaultdicts mapping a URL / domain to the list of
        per-set counts.
    """
    counts_per_domain = defaultdict(list)
    counts_per_link = defaultdict(list)
    for tweets in sets_of_tweets:
        user_links, user_domains = get_links_domains_count_for_user(tweets)
        for link, count in user_links.items():
            counts_per_link[link].append(count)
        for domain, count in user_domains.items():
            counts_per_domain[domain].append(count)
    return (counts_per_link, counts_per_domain)
    

def get_links_domains_associated_words(sets_of_tweets):
    """Collect, per URL and per domain, the words of the tweets that shared it.

    For every tweet that has at least one URL (element 5), the text (element 4)
    is split on whitespace and filtered: tokens starting with ``http``, tokens
    whose lowercase form is in the module-level ``stop`` collection, and
    single-character tokens are dropped. Every remaining token occurrence is
    counted once under each URL of the tweet and once under that URL's
    ``get_domain_drop_protocol`` domain.

    Args:
        sets_of_tweets: iterable of tweet collections.

    Returns:
        ``(links_to_word_count, domains_to_word_count)``: two plain dicts that
        map a URL / domain to a list of ``(word, count)`` tuples.
    """
    # Set membership is O(1); the original scanned the `stop` list once per word.
    stop_words = set(stop)

    links = defaultdict(lambda: defaultdict(int))
    domains = defaultdict(lambda: defaultdict(int))
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                continue
            text_filtered = [
                word for word in t[4].split()
                if not word.startswith('http')
                and word.lower() not in stop_words
                and len(word) > 1
            ]
            # Without any surviving word nothing is recorded, so the URL and
            # domain must not be created as empty entries either.
            if not text_filtered:
                continue
            for url in t[5]:
                link_words = links[url]
                domain_words = domains[get_domain_drop_protocol(url)]
                for w in text_filtered:
                    link_words[w] += 1
                    domain_words[w] += 1

    # Freeze the nested counters into plain (word, count) lists.
    links_to_word_count = {k: list(d.items()) for k, d in links.items()}
    domains_to_word_count = {k: list(d.items()) for k, d in domains.items()}
    return links_to_word_count, domains_to_word_count

                

def characterize_links_by_comment(sets_of_tweets):
    """Classify URL tweets by how much text accompanies the link(s) and print a summary.

    A tweet's URLs are read from element 5 and its text from element 4. The
    text is split on whitespace and tokens starting with ``http`` are ignored;
    the remaining tokens are the "comment".

    Args:
        sets_of_tweets: iterable of tweet collections.

    Returns:
        ``(commented_light, commented_heavy)``: lists of reconstructed tweets
        (comment words followed by the tweet's URLs, space separated) with 1-4
        comment words and with more than 4 comment words respectively. URL
        tweets without any comment word are only counted.

    Percentages whose denominator is zero (no tweets at all, or no URL tweets)
    are reported as 0.0 instead of raising ZeroDivisionError.
    """
    def percent(part, whole):
        # Guard the empty-denominator case; otherwise same formula as before.
        return (part / whole) * 100 if whole else 0.0

    url_count = 0
    nourl_count = 0
    uncommented = 0
    commented_light = []
    commented_heavy = []
    for tweets in sets_of_tweets:
        for t in tweets:
            if len(t[5]) == 0:
                nourl_count += 1
                continue
            url_count += 1
            words = [w for w in t[4].split() if not w.startswith("http")]
            if len(words) == 0:
                uncommented += 1
                continue
            bucket = commented_light if len(words) <= 4 else commented_heavy
            reconstructed_tweet = " ".join(words)
            for url in t[5]:
                reconstructed_tweet += " " + url
            bucket.append(reconstructed_tweet)
    totalnumber = nourl_count + url_count
    print("Total number of tweets: ", totalnumber)
    print("Tweets without urls: ", nourl_count, " = ", percent(nourl_count, totalnumber), "%")
    print("Tweets with urls: ", url_count, " = ", percent(url_count, totalnumber), "%")
    print("URL tweets containing no comment: ", uncommented, " = ", str(percent(uncommented, url_count)), "%")
    print("URL tweets containing 4 words or fewer: ", len(commented_light), " = ", percent(len(commented_light), url_count), "%")
    print("URL tweets containing more than 4 words: ", len(commented_heavy), " = ", percent(len(commented_heavy), url_count), "%")
    return commented_light, commented_heavy



def get_all_tweets_with_links(sets_of_tweets):
    """Index every tweet that carries at least one URL by its element 3.

    Element 5 of a tweet is its URL collection. When several tweets share the
    same key, the one encountered last wins.
    """
    return {
        tweet[3]: tweet
        for tweet_set in sets_of_tweets
        for tweet in tweet_set
        if len(tweet[5]) > 0
    }

In [7]:
%%time
# Aggregate link/domain usage over all remaining users. Each key maps to a list
# with one count per timeline that contained it, so len() below is the number
# of distinct links / domains.
links, domains = get_links_domains_for_users(suspect_timelines.values())
print("count of links found: ", len(links))
print("count of domains found: ", len(domains))

count of links found:  1812550
count of domains found:  52566
Wall time: 30.4 s


In [8]:
# Spot-check get_domain_drop_protocol on 1000 randomly chosen links: print each
# link followed by the domain extracted from it.
for l in random.choices(population = list(links.keys()), k=1000):
    print(l)
    print(get_domain_drop_protocol(l), "\n\n")

https://fb.me/1tqkVZX16
fb.me 


http://fb.me/2HOVEquHe
fb.me 


https://twitter.com/erickaelizabth/status/953040090711937029
twitter.com 


https://fb.me/3jQaaOLfg
fb.me 


http://lnk.ms/bMJZh
lnk.ms 


https://gab.ai/SueG52/posts/19030447
gab.ai 


https://fb.me/EJWjNelt
fb.me 


http://fb.me/8gkGajKbY
fb.me 


https://twitter.com/AntiGlobalist__/status/937727582950133760
twitter.com 


http://fb.me/8ePbMChS4
fb.me 


https://twitter.com/JoshuaMacias/status/960955033125904384
twitter.com 


https://ilovemyfreedom.org/watch-politically-correct-students-get-owned-comedian-epic-rant/
ilovemyfreedom.org 


http://nyp.st/2fxN0zG
nyp.st 


http://bit.ly/2s9EPRP
bit.ly 


http://dailycaller.com/2018/02/09/three-democrats-rouhani-louis-farrakhan/?utm_campaign=atdailycaller&utm_source=Twitter&utm_medium=Social
dailycaller.com 


http://cmun.it/pvgpnob
cmun.it 


https://twitter.com/justice69hall/status/960630711244877824
twitter.com 


http://truthfeednews.com/breaking-another-roy-moore-accus

https://fb.me/8VagSTrbL
fb.me 


https://townhall.com/tipsheet/mattvespa/2018/01/04/make-america-great-again-strong-jobs-report-catapults-dow-jones-past-25000-n2430181
townhall.com 


http://thebea.st/23MoSLw
thebea.st 


http://youtu.be/oXUKx8TBc_A?a
youtu.be 


https://fb.me/91CEtBtCD
fb.me 


https://www.washingtonpost.com/graphics/2017/world/national-security/donald-trump-pursues-vladimir-putin-russian-election-hacking/?utm_term=.eea992eaa121
washingtonpost.com 


https://fb.me/Dx8D2Q77
fb.me 


https://www.washingtonpost.com/local/people-here-live-in-fear-ms-13-menaces-a-community-seven-miles-from-the-white-house/2017/12/20/6cebf318-d956-11e7-b859-fb0995360725_story.html?utm_term=.87160a70d14f&wpisrc=al_trending_now__alert-local--alert-national&wpmk=1
washingtonpost.com 


http://fb.me/IJawgB5Y
fb.me 


http://video.foxnews.com/v/5730933577001/
video.foxnews.com 


http://thedisclosenews.com/1076-2/
thedisclosenews.com 


https://www.zsss.si/iz-delavske-enotnosti-st-30-leto-2017-s

In [9]:
def select_dimensions(instances, max_cutoff_ratio, min_cutoff_ratio):
    """Select the features that occur in a moderate share of the instances.

    Args:
        instances: sequence of ``(name, [(feature, count), ...])`` entries.
        max_cutoff_ratio: features listed by more than this fraction of
            ``len(instances)`` are dropped.
        min_cutoff_ratio: features listed by fewer than this fraction of
            ``len(instances)`` are dropped.

    Returns:
        Sorted list of the features whose number of listings lies within
        ``[min_cutoff, max_cutoff]`` (both bounds inclusive).

    Progress and summary statistics are printed. The summary line now says
    "more than" / "fewer than", matching the inclusive bounds actually applied
    (it previously claimed "or more" / "or fewer").
    """
    # Number of instance lists in which each feature appears.
    features = defaultdict(int)
    for ddict in instances:
        current_instance = ddict[0]
        print("current instance:", current_instance)
        for (d, c) in ddict[1]:
            features[d] += 1

    max_cutoff = len(instances) * max_cutoff_ratio
    min_cutoff = len(instances) * min_cutoff_ratio
    print("\n\nNumber of instances: ", len(instances))
    print("Total number of possible features: ", len(features))
    print("All features occurring in more than", max_cutoff, "instances, and all features occurring in fewer than",
          min_cutoff, "instances, will be dropped")
    features_above_cutoff = [f for f, seen in features.items() if seen > max_cutoff]
    features_below_cutoff = [f for f, seen in features.items() if seen < min_cutoff]
    selected_features = sorted(f for f, seen in features.items() if min_cutoff <= seen <= max_cutoff)
    print("\nfeatures above cutoff:\n", len(features_above_cutoff))
    print("Examples: 20 features that were above the cutoff:\n", features_above_cutoff[:20])
    print("\nfeatures below cutoff:\n",  len(features_below_cutoff))
    print("Examples: 20 features that were below the cutoff:\n", features_below_cutoff[:20], "\n\n")
    print("\nNumber of remaining features: ", len(selected_features), "\n\n")
    print("Examples: 20 features remaining:\n", selected_features[:20], "\n\n")
    return selected_features


def insts_to_vecs(insts, dimensions):
    """Turn ``(name, [(feature, count), ...])`` instances into a dense count matrix.

    Args:
        insts: sequence of ``(name, features)`` pairs; ``features`` is an
            iterable of entries whose first two elements are feature and count.
        dimensions: sequence of features; its order defines the column order.

    Returns:
        ``np.ndarray`` of shape ``(len(insts), len(dimensions))`` and dtype
        ``'uint'``. Cell ``[i, j]`` is the count instance ``i`` lists for
        ``dimensions[j]``, or 0 when the instance does not list that feature.

    The name of each instance and the time spent on it are printed.
    """
    # Resolve feature -> column(s) once. Filling a row then costs time
    # proportional to the instance's own feature list instead of a scan over
    # every dimension for every instance.
    columns_of = {}
    for col, d in enumerate(dimensions):
        columns_of.setdefault(d, []).append(col)

    vecs = np.zeros(shape=[len(insts), len(dimensions)], dtype='uint')
    for i, (inst, words) in enumerate(insts):
        t_start = time.time()
        print("building vector for instance:", inst)
        # A feature listed more than once keeps its last count.
        words_dict = {w[0]: w[1] for w in words}
        cols = []
        vals = []
        for feature, count in words_dict.items():
            for col in columns_of.get(feature, ()):
                cols.append(col)
                vals.append(count)
        if cols:
            # Array-to-array assignment keeps the casting of the row-wise copy
            # used before.
            vecs[i, cols] = np.array(vals)
        t = time.time() - t_start
        print("finished in ", t, "\n")
    return vecs

In [10]:
# Scratch check: collect the keys of a list of (key, value) pairs into a set and
# emit a 0 for every candidate that is absent from it (only 'd' here).
e = [('a', 5), ('b', 8), ('c', 10)]
eset = {x[0] for x in e}
e2 = ['b', 'c', 'd']

[0 for x in e2 if x not in eset]

[0]

In [11]:
t1 = time.time()

In [12]:
time.time()-t1

0.02506732940673828

In [13]:
edict = {x[0]:x[1] for x in e}
edict

{'a': 5, 'b': 8, 'c': 10}

In [14]:
# Scratch check of NumPy slice assignment: on this 5-row array the row slice
# 0:8:2 covers rows 0, 2 and 4, and the 8-element list is written into each of
# them (see the second printed array).
n = np.zeros(shape=[5, 8])
print(n)
n[0:8:2] = [4,3,67,1,1,1,5,8]
print(n)

[[ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.]]
[[  4.   3.  67.   1.   1.   1.   5.   8.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  4.   3.  67.   1.   1.   1.   5.   8.]
 [  0.   0.   0.   0.   0.   0.   0.   0.]
 [  4.   3.  67.   1.   1.   1.   5.   8.]]


In [15]:
%%time
links_words, domains_words = get_links_domains_associated_words(suspect_timelines.values())

Wall time: 5min 31s


In [17]:
sorted(domains_words['breitbart.com'], key=lambda x: x[1], reverse=True)

[('@BreitbartNews', 2675),
 ('Breitbart', 2592),
 ('Trump', 2533),
 ('via', 2011),
 ('#MAGA', 1011),
 ('&amp;', 983),
 ('#AAG', 731),
 ('FBI', 689),
 ('Democrats', 634),
 ('#feedly', 632),
 ('Illegal', 586),
 ('@realDonaldTrump', 563),
 ('#Patriot', 563),
 ('Donald', 555),
 ('#news', 550),
 ('President', 538),
 ('Obama', 533),
 ('U.S.', 479),
 ('State', 465),
 ('DACA', 464),
 ('Americans', 431),
 ('Clinton', 417),
 ('GOP', 389),
 ('House', 387),
 ('New', 384),
 ('Memo', 379),
 ('Amnesty', 375),
 ('American', 360),
 ('Report:', 337),
 ('Hillary', 329),
 ('like', 326),
 ('White', 322),
 ('John', 321),
 ('News', 320),
 ('Dossier', 311),
 ('Roy', 308),
 ('#ma4t', 308),
 ('Immigration', 303),
 ('Moore', 301),
 ('Trump’s', 298),
 ('Media', 294),
 ('America', 289),
 ("Trump's", 281),
 ('FISA', 269),
 ('Says', 268),
 ('Border', 263),
 ('Aliens', 261),
 ('One', 254),
 ('Bill', 253),
 ('Soros', 243),
 ('Democrat', 241),
 ('get', 235),
 ('Trump:', 229),
 ('Adam', 228),
 ('Congress', 220),
 ('ille

In [18]:
sorted(domains_words['infowars.com' ], key=lambda x: x[1], reverse=True)

[('via', 4177),
 ('@realalexjones', 4113),
 ('Trump', 1727),
 ('&amp;', 641),
 ('Clinton', 576),
 ('FBI', 529),
 ('FISA', 410),
 ('#QAnon', 361),
 ('#MAGA', 356),
 ('Memo', 345),
 ('@realDonaldTrump', 343),
 ('#Qanon8chan', 341),
 ('Hillary', 331),
 ('Bill', 329),
 ('QAnon', 327),
 ('President', 324),
 ('Says', 321),
 ('State', 308),
 ('Obama', 303),
 ('Report:', 294),
 ('Video:', 274),
 ('Alex', 266),
 ('says', 265),
 ('Deep', 252),
 ('#Trump', 250),
 ('Watch', 237),
 ('Lynch', 234),
 ('US', 231),
 ('CNN', 227),
 ('House', 212),
 ('Russian', 212),
 ('Backup', 212),
 ('#Infowars', 205),
 ('Infowars', 204),
 ('@MarilynLavala', 203),
 ('Trump’s', 198),
 ('White', 196),
 ('New', 184),
 ('LIVE', 181),
 ('@POTUS', 166),
 ('Schiff', 165),
 ('Live:', 160),
 ('M-F', 160),
 ('war', 159),
 ('Plan', 157),
 ('Twitter', 157),
 ('Tarmac', 156),
 ('Video', 155),
 ('Seat', 154),
 ('Tune', 150),
 ('News', 150),
 ('Russia', 148),
 ('Scalia’s', 147),
 ('Jones', 147),
 ('MURDERED', 146),
 ('CIA', 145),
 (

In [19]:
sorted(domains_words['bbc.com' ], key=lambda x: x[1], reverse=True)

[('BBC', 799),
 ('News', 726),
 ('US', 120),
 ('Trump', 103),
 ('says', 50),
 ('de', 50),
 ('&amp;', 44),
 ('Syria', 37),
 ('people', 35),
 ('shows', 35),
 ('via', 34),
 ('story', 32),
 ('women', 31),
 ('one', 31),
 ('child', 31),
 ('man', 30),
 ('Germany', 29),
 ('time', 29),
 ('FBI', 29),
 ('like', 29),
 ('President', 29),
 ('attack', 29),
 ('killed', 28),
 ('Islamic', 28),
 ('say', 28),
 ('New', 26),
 ('country', 26),
 ('State', 26),
 ('left', 26),
 ('Korea', 26),
 ('North', 26),
 ('could', 24),
 ('First', 23),
 ('money', 23),
 ('never', 23),
 ('disaster', 22),
 ('relief', 22),
 ('sex', 22),
 ('abuse', 22),
 ('new', 21),
 ('UK', 21),
 ('saw', 21),
 ('looted', 21),
 ('plundered', 21),
 ('shambles.', 21),
 ('covered', 21),
 ("Trump's", 21),
 ('South', 21),
 ('en', 21),
 ('arrested', 20),
 ('leaving', 20),
 ('now,', 20),
 ('pension', 20),
 ('tell', 20),
 ('good.', 20),
 ('Russian', 20),
 ('la', 20),
 ('Iran', 19),
 ('really', 19),
 ('McCabe', 19),
 ('later', 19),
 ('planned.', 19),
 ('

In [20]:
sorted(domains_words['worldtruth.tv' ], key=lambda x: x[1], reverse=True)

[('via', 460),
 ('@WorldTruthTV', 366),
 ('&amp;', 290),
 ('Found', 259),
 ('Man', 231),
 ('New', 208),
 ('People', 198),
 ('Cancer', 194),
 ('World', 186),
 ('Old', 161),
 ('Year', 154),
 ('One', 142),
 ('Woman', 140),
 ('Dead', 127),
 ('Years', 124),
 ('Secret', 122),
 ('Water', 117),
 ('See', 116),
 ('10', 115),
 ('Know', 112),
 ('Scientists', 112),
 ('Discovered', 108),
 ('Reveals', 94),
 ('Human', 93),
 ('Never', 92),
 ('US', 92),
 ('Ancient', 88),
 ('Children', 88),
 ('Million', 88),
 ('Find', 87),
 ('Elite', 86),
 ('Pedophile', 85),
 ('@https://twitter.com/WorldTruthTV', 85),
 ('Inside', 84),
 ('Use', 84),
 ('Body', 83),
 ('Life', 82),
 ('CIA', 80),
 ('First', 80),
 ('Government', 79),
 ('Make', 77),
 ('Truth', 76),
 ('Mysterious', 75),
 ('Death', 74),
 ("Here's", 73),
 ('War', 72),
 ('Hidden', 72),
 ('Facebook', 71),
 ('Says', 71),
 ('Could', 70),
 ('Ring', 69),
 ('Shocking', 69),
 ('Vegas', 69),
 ('Health', 68),
 ('Video', 68),
 ('Police', 68),
 ('Want', 67),
 ('Time', 67),
 (

In [21]:
# (domain, word_counts) pairs ordered by vocabulary size, largest first.
domains_sorted = sorted(
    domains_words.items(),
    key=lambda item: len(item[1]),
    reverse=True,
)

In [22]:
# List the (up to) 500 domains with the largest vocabularies. Slicing instead
# of indexing over range(500) avoids an IndexError when fewer than 500 domains
# are available. The printed figure is the number of distinct words recorded
# for the domain (the length of its (word, count) list).
for i, (domain, word_counts) in enumerate(domains_sorted[:500]):
    print(i)
    print(domain)
    print("number of words in tweets that shared this domain: ", len(word_counts))
    print("\n")

0
twitter.com
number of words in tweets that shared this domain:  391195


1
fb.me
number of words in tweets that shared this domain:  356204


2
youtu.be
number of words in tweets that shared this domain:  152522


3
bit.ly
number of words in tweets that shared this domain:  108002


4
goo.gl
number of words in tweets that shared this domain:  84185


5
youtube.com
number of words in tweets that shared this domain:  71734


6
ow.ly
number of words in tweets that shared this domain:  57381


7
dlvr.it
number of words in tweets that shared this domain:  55932


8
thegatewaypundit.com
number of words in tweets that shared this domain:  40334


9
breitbart.com
number of words in tweets that shared this domain:  40105


10
ift.tt
number of words in tweets that shared this domain:  37800


11
foxnews.com
number of words in tweets that shared this domain:  28790


12
dailycaller.com
number of words in tweets that shared this domain:  28567


13
instagram.com
number of words in tweets that sh

number of words in tweets that shared this domain:  3216


263
russia-insider.com
number of words in tweets that shared this domain:  3202


264
independentsentinel.com
number of words in tweets that shared this domain:  3192


265
naturalblaze.com
number of words in tweets that shared this domain:  3182


266
newstarget.com
number of words in tweets that shared this domain:  3177


267
thestar.com
number of words in tweets that shared this domain:  3151


268
eheadlines.com
number of words in tweets that shared this domain:  3146


269
theantimedia.org
number of words in tweets that shared this domain:  3138


270
conservativepost.com
number of words in tweets that shared this domain:  3138


271
slate.com
number of words in tweets that shared this domain:  3108


272
thinkprogress.org
number of words in tweets that shared this domain:  3107


273
fellowshipoftheminds.com
number of words in tweets that shared this domain:  3087


274
theintercept.com
number of words in tweets that sha

In [23]:
%%time
# Keep the 1000 domains with the largest vocabularies, select the words that
# pass the 0.5% / 99.5% cutoffs of select_dimensions, and build one count
# vector per domain over those words.
domains_set = domains_sorted[0:1000]
selected_d = select_dimensions(domains_set, 0.995 , 0.005)
vecs = insts_to_vecs(domains_set, selected_d)

current instance: twitter.com
current instance: fb.me
current instance: youtu.be
current instance: bit.ly
current instance: goo.gl
current instance: youtube.com
current instance: ow.ly
current instance: dlvr.it
current instance: thegatewaypundit.com
current instance: breitbart.com
current instance: ift.tt
current instance: foxnews.com
current instance: dailycaller.com
current instance: instagram.com
current instance: beforeitsnews.com
current instance: zerohedge.com
current instance: lnkd.in
current instance: truepundit.com
current instance: infowars.com
current instance: fxn.ws
current instance: shar.es
current instance: buff.ly
current instance: yournewswire.com
current instance: facebook.com
current instance: dailym.ai
current instance: dld.bz
current instance: po.st
current instance: ln.is
current instance: conservativetribune.com
current instance: paper.li
current instance: thehill.com
current instance: dailywire.com
current instance: rt.com
current instance: theguardian.com
curre

current instance: pacificpundit.com
current instance: humansarefree.com
current instance: stripes.com
current instance: theneonnettle.com
current instance: informationliberation.com
current instance: petrescuereport.com
current instance: bigleaguepolitics.com
current instance: spectator.org
current instance: dcstatesman.com
current instance: goodreads.com
current instance: clkme.in
current instance: news.com.au
current instance: m.huffpost.com
current instance: yhoo.it
current instance: lasvegassun.com
current instance: exchangle.co
current instance: conta.cc
current instance: liveaction.org
current instance: drudge.tw
current instance: allnewspipeline.com
current instance: libya24.tv
current instance: presstv.com
current instance: hasrdaily.com
current instance: naver.me
current instance: tacticalinvestor.com
current instance: truthuncensored.net
current instance: americanactionnews.com
current instance: thetimes.co.uk
current instance: secure.actblue.com
current instance: talkingpoin

current instance: rare.us
current instance: ed.gr
current instance: liveleak.com
current instance: petitions.moveon.org
current instance: thetrumpet.com
current instance: mashable.com
current instance: vets4childrescue.org
current instance: halturnerradioshow.com
current instance: subjectpolitics.com
current instance: asheepnomore.net
current instance: cbn.com
current instance: pbs.twimg.com
current instance: neon-nettle.com
current instance: rightobserver.com
current instance: wn.nr
current instance: theage.com.au
current instance: blocnotesimma.wordpress.com
current instance: allenbwest.americanewshub.com
current instance: spreaker.com
current instance: gopthedailydose.com
current instance: l.facebook.com
current instance: mondoweiss.net
current instance: dangerous.com
current instance: geopolitics.co
current instance: lawnewz.com
current instance: la.eonline.com
current instance: heritage.org
current instance: conventionofstates.com
current instance: cis.org
current instance: napava

building vector for instance: youtu.be
finished in  0.11562204360961914 

building vector for instance: bit.ly
finished in  0.08472561836242676 

building vector for instance: goo.gl
finished in  0.06567740440368652 

building vector for instance: youtube.com
finished in  0.06317400932312012 

building vector for instance: ow.ly
finished in  0.05783700942993164 

building vector for instance: dlvr.it
finished in  0.055150508880615234 

building vector for instance: thegatewaypundit.com
finished in  0.05142998695373535 

building vector for instance: breitbart.com
finished in  0.048668861389160156 

building vector for instance: ift.tt
finished in  0.04612231254577637 

building vector for instance: foxnews.com
finished in  0.04916787147521973 

building vector for instance: dailycaller.com
finished in  0.04311537742614746 

building vector for instance: instagram.com
finished in  0.04189014434814453 

building vector for instance: beforeitsnews.com
finished in  0.04311633110046387 

bu

finished in  0.029740333557128906 

building vector for instance: elpais.com
finished in  0.029230117797851562 

building vector for instance: nnettle.com
finished in  0.030079126358032227 

building vector for instance: canadafreepress.com
finished in  0.030086278915405273 

building vector for instance: vox.com
finished in  0.02907586097717285 

building vector for instance: cmun.it
finished in  0.031082630157470703 

building vector for instance: mintpressnews.com
finished in  0.029581308364868164 

building vector for instance: louderwithcrowder.com
finished in  0.03258919715881348 

building vector for instance: mobile.nytimes.com
finished in  0.038103342056274414 

building vector for instance: tmblr.co
finished in  0.03158402442932129 

building vector for instance: commun.it
finished in  0.02907848358154297 

building vector for instance: russia-insider.com
finished in  0.0350954532623291 

building vector for instance: independentsentinel.com
finished in  0.031083106994628906 

finished in  0.030079126358032227 

building vector for instance: freedom-daily.com
finished in  0.030080556869506836 

building vector for instance: nymag.com
finished in  0.029150009155273438 

building vector for instance: guysandgoodhealth.com
finished in  0.030046701431274414 

building vector for instance: sgtreport.com
finished in  0.031083345413208008 

building vector for instance: rumormillnews.com
finished in  0.03113722801208496 

building vector for instance: nova24tv.si
finished in  0.02906179428100586 

building vector for instance: articles.mercola.com
finished in  0.029105424880981445 

building vector for instance: yesimright.com
finished in  0.029590129852294922 

building vector for instance: smh.com.au
finished in  0.033089399337768555 

building vector for instance: thepetitionsite.com
finished in  0.030080318450927734 

building vector for instance: breaking911.com
finished in  0.029616117477416992 

building vector for instance: rightwingnews.com
finished in  0.

building vector for instance: secure.actblue.com
finished in  0.029111862182617188 

building vector for instance: talkingpointsmemo.com
finished in  0.028110504150390625 

building vector for instance: on.ktla.com
finished in  0.029077529907226562 

building vector for instance: palmerreport.com
finished in  0.027677297592163086 

building vector for instance: huff.to
finished in  0.02904367446899414 

building vector for instance: ntknetwork.com
finished in  0.029111146926879883 

building vector for instance: onecrankyoldman.newsninjaa.com
finished in  0.02958226203918457 

building vector for instance: jonrappoport.wordpress.com
finished in  0.031110525131225586 

building vector for instance: paulcraigroberts.org
finished in  0.028070688247680664 

building vector for instance: crooksandliars.com
finished in  0.029111385345458984 

building vector for instance: ascensionwithearth.com
finished in  0.029552698135375977 

building vector for instance: quora.com
finished in  0.0290787

finished in  0.029076576232910156 

building vector for instance: support45.com
finished in  0.029114961624145508 

building vector for instance: trumptrainnews.com
finished in  0.02907705307006836 

building vector for instance: downtrend.com
finished in  0.02911233901977539 

building vector for instance: politicalvelcraft.org
finished in  0.030081748962402344 

building vector for instance: stiltonsplace.blogspot.com
finished in  0.02864360809326172 

building vector for instance: indiegogo.com
finished in  0.030113697052001953 

building vector for instance: renewedright.com
finished in  0.029076814651489258 

building vector for instance: angrypatriotmovement.com
finished in  0.02911233901977539 

building vector for instance: christianpost.com
finished in  0.029077529907226562 

building vector for instance: aclu.org
finished in  0.032085418701171875 

building vector for instance: magapill.com
finished in  0.028675079345703125 

building vector for instance: libertyonenews.com
f

finished in  0.03609871864318848 

building vector for instance: blocnotesimma.wordpress.com
finished in  0.032623291015625 

building vector for instance: allenbwest.americanewshub.com
finished in  0.029078245162963867 

building vector for instance: spreaker.com
finished in  0.029109716415405273 

building vector for instance: gopthedailydose.com
finished in  0.029082775115966797 

building vector for instance: l.facebook.com
finished in  0.02907848358154297 

building vector for instance: mondoweiss.net
finished in  0.02907729148864746 

building vector for instance: dangerous.com
finished in  0.02958059310913086 

building vector for instance: geopolitics.co
finished in  0.029619455337524414 

building vector for instance: lawnewz.com
finished in  0.028069019317626953 

building vector for instance: la.eonline.com
finished in  0.03108358383178711 

building vector for instance: heritage.org
finished in  0.028682231903076172 

building vector for instance: conventionofstates.com
fin

finished in  0.029108285903930664 

building vector for instance: pyrrhicchange.blogspot.com
finished in  0.030199527740478516 

building vector for instance: seattletimes.com
finished in  0.03062152862548828 

building vector for instance: operationrescue.org
finished in  0.02907562255859375 

building vector for instance: reclaimourrepublic.wordpress.com
finished in  0.03308892250061035 

building vector for instance: comicallyincorrect.com
finished in  0.031085729598999023 

building vector for instance: mercurynews.com
finished in  0.028287410736083984 

building vector for instance: electronicintifada.net
finished in  0.02807307243347168 

building vector for instance: greatamericanrepublic.com
finished in  0.02961587905883789 

building vector for instance: worldtoday365.info
finished in  0.02768707275390625 

building vector for instance: usalibertypress.com
finished in  0.02810835838317871 

building vector for instance: directorblue.blogspot.com
finished in  0.0280745029449462

In [24]:
# Sanity check: every row must be as long as the first one.
feature_count = len(vecs[0])
print("length of feature vector: ", feature_count)
assert all(len(row) == feature_count for row in vecs)

length of feature vector:  124390


In [25]:
sys.getsizeof(vecs)/(1024*1024)

474.5102996826172

In [26]:
from sklearn.metrics.pairwise import pairwise_distances

In [27]:
%%time
euc = pairwise_distances(vecs, metric='euclidean')

Wall time: 11.8 s


In [28]:
%%time
cos = pairwise_distances(vecs, metric='cosine')

Wall time: 12.2 s


In [29]:
%%time
city = pairwise_distances(vecs, metric='cityblock')

Wall time: 2min 53s


In [30]:
%%time
canberra = pairwise_distances(vecs, metric='canberra')

Wall time: 1min 26s


In [31]:
%%time
chebyshev = pairwise_distances(vecs, metric='chebyshev')

Wall time: 1min 26s


In [32]:
%%time
bray = pairwise_distances(vecs, metric='braycurtis')

Wall time: 1min 26s


In [33]:
%%time
jaccard = pairwise_distances(vecs, metric='jaccard')



Wall time: 1min 23s


In [34]:
%%time
corr = pairwise_distances(vecs, metric='correlation')

Wall time: 1min 28s


In [35]:
%%time
# NOTE(review): despite its name, `mahalanobis` holds Minkowski distances (the
# metric requested here is 'minkowski', and the metrics dict in a later cell
# labels it "minkowski"). The name is kept because later cells reference it.
mahalanobis =  pairwise_distances(vecs, metric='minkowski')

Wall time: 53min 42s


In [36]:
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import sigmoid_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel
from sklearn.metrics.pairwise import chi2_kernel

In [37]:
%%time
poly = polynomial_kernel(vecs)

Wall time: 17.2 s


In [38]:
%%time
poly10 = polynomial_kernel(vecs, degree=10)

Wall time: 7.4 s


In [39]:
%%time 
sigmoid = sigmoid_kernel(vecs)

Wall time: 3.98 s


In [40]:
%%time
rbf = rbf_kernel(vecs)

Wall time: 5.4 s


In [41]:
%%time 
laplacian = laplacian_kernel(vecs)

Wall time: 3min 44s


In [42]:
chi2 = chi2_kernel(vecs)

In [43]:
def get_nearest_n(instance, instances_list, matrix, n, is_kernel):
    """Return the ``n`` entries of ``instances_list`` ranked closest to ``instance``.

    Args:
        instance: name to look up in ``instances_list``.
        instances_list: list of names; position ``j`` is paired with entry
            ``j`` of the matrix row selected for ``instance``.
        matrix: pairwise matrix indexed as ``matrix[i][j]``.
        n: maximum number of results.
        is_kernel: True to rank by descending value (larger value first),
            False to rank by ascending value (smaller value first).

    Returns:
        List of ``(name, value)`` tuples, best first; the row's entry for
        ``instance`` itself is part of the ranking. An empty list is returned
        (after printing a message) when ``instance`` is not in
        ``instances_list``.
    """
    # list.index raises ValueError rather than returning -1, so the original
    # "not found" check could never trigger and the lookup crashed instead.
    try:
        i = instances_list.index(instance)
    except ValueError:
        print(instance, " not found!")
        return []
    neighbors = matrix[i]
    neighbors_indexed = [(instances_list[j], neighbors[j]) for j in range(len(instances_list))]
    neighbors_sorted = sorted(neighbors_indexed, key=lambda x: x[1], reverse=is_kernel)
    return neighbors_sorted[0:n]
    

In [44]:
domains_list = [x[0] for x in domains_set[:1000]]

In [46]:
# Every pairwise matrix computed above, keyed by a display name. A name that
# contains "kernel" makes get_nearest_n rank by descending value.
metrics = {
    "euclidean": euc,
    "cosine": cos,
    "cityblock": city,
    "canberra": canberra,
    "chebyshev": chebyshev,
    "bray-curtis": bray,
    "jaccard": jaccard,
    "correlation": corr,
    "minkowski": mahalanobis,
    "poly-kernel": poly,
    "poly-kernel-degree10": poly10,
    "sigmoid-kernel": sigmoid,
    "rbf-kernel": rbf,
    "laplacian-kernel": laplacian,
    "chi-squared-kernel": chi2,
}

# Show the 30 nearest neighbours of nytimes.com under each metric.
for metricname, metric in metrics.items():
    print("******** Current metric: ", metricname)
    is_kernel = "kernel" in metricname
    print("\n".join(map(str, get_nearest_n("nytimes.com", domains_list, metric, 30, is_kernel))))
    print("\n" * 4)

******** Current metric:  euclidean
('nytimes.com', 0.0)
('nyti.ms', 676.71190324982467)
('cnn.com', 739.7851039322162)
('a.msn.com', 752.62208312007431)
('wapo.st', 755.83728407640751)
('newsweek.com', 761.37966875928601)
('nypost.com', 767.08278562355963)
('google.com', 771.41169293704638)
('rawstory.com', 795.93278610696768)
('washingtonpost.com', 797.4289937041417)
('usatoday.com', 800.76213696702723)
('dailykos.com', 801.07303037862914)
('politico.com', 805.97704681957293)
('gab.ai', 807.37475808945135)
('hotair.com', 812.53799911142619)
('disq.us', 817.80132061522136)
('independent.co.uk', 818.93650547524135)
('redstate.com', 819.13979270940069)
('axios.com', 829.12845808113468)
('americanthinker.com', 831.3657438215746)
('occuworld.org', 832.53108050090236)
('blabber.buzz', 835.0491003527876)
('mobile.nytimes.com', 836.34382881683291)
('frontpagemag.com', 836.6415002855166)
('conservativeinstitute.org', 837.27534300252751)
('shareblue.com', 838.58154045984099)
('dmlnews.com', 83

In [None]:
metrics = {"euclidean": euc, "cosine": cos, "cityblock": city, "canberra": canberra, "chebyshev": chebyshev, "bray-curtis":bray,
           "jaccard": jaccard, "correlation": corr, "minkowski": mahalanobis, 
           "poly-kernel": poly, "poly-kernel-degree10":poly10, "sigmoid-kernel": sigmoid, "rbf-kernel": rbf, 
           "laplacian-kernel": laplacian, "chi-squared-kernel": chi2}

# Show the 30 nearest neighbours of the BBC domain under each metric.
# get_domain_drop_protocol strips the leading "www." label, so the domain names
# are stored as "bbc.com" (the key used for domains_words above); the former
# query "www.bbc.com" could never be found in domains_list.
for metricname, metric in metrics.items():
    print("******** Current metric: ", metricname)
    is_kernel = (metricname.find("kernel") > -1)
    print("\n".join([str(x) for x in get_nearest_n("bbc.com", domains_list, metric, 30, is_kernel)]))
    print("\n"*4)

In [None]:
# The leading "www." label is stripped when domains are extracted, so the list
# entry is "nytimes.com"; "www.nytimes.com" could never be found.
# NOTE(review): `poly` comes from polynomial_kernel, yet is_kernel=False ranks
# by ascending value -- confirm that this ordering is intended.
get_nearest_n("nytimes.com", domains_list, poly, 1000, False)

In [None]:
# Show the 30 nearest neighbours of madworldnews.com under each metric; uses
# the `metrics` dict built in an earlier cell.
for metricname, metric in metrics.items():
    print("******** Current metric: ", metricname)
    is_kernel = "kernel" in metricname
    print("\n".join(map(str, get_nearest_n("madworldnews.com", domains_list, metric, 30, is_kernel))))
    print("\n" * 4)

In [None]:
import os
# NOTE(review): machine-specific absolute path to the Java executable; it has
# to be adapted on any other system.
java_path = "C:/ProgramData/Oracle/Java/javapath/java.exe"
# presumably read by NLTK to locate Java for the Stanford tagger -- TODO confirm
os.environ['JAVAHOME'] = java_path
# Exactly one classifier file is active (the CoNLL 4-class model); the 3-class
# and 7-class alternatives are kept commented out.
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.all.3class.distsim.crf.ser.gz")
stmodel = ("stanford-ner-2017-06-09/classifiers/english.conll.4class.distsim.crf.ser.gz")
#stmodel = ("stanford-ner-2017-06-09/classifiers/english.muc.7class.distsim.crf.ser.gz")
# Path to the tagger jar, relative to the working directory.
stjar = ("stanford-ner-2017-06-09/stanford-ner.jar")

In [None]:
# StanfordNERTagger is not imported in any visible cell, so this line raised
# NameError on a fresh kernel; import it next to its only use.
from nltk.tag import StanfordNERTagger

# Tagger built from the classifier file and jar configured in the previous cell.
st = StanfordNERTagger(stmodel, stjar)

In [None]:
%%time
# Tag the same sentence twice -- first lowercased, then with its original
# casing -- and print one (token, tag) result per line for comparison.
testsentence = "Donald Trump is the President of the U.S. He Won the Elections After Defeating Hillary Clinton. Russia May Have Played a Role"
print("\n".join([str(x) for x in st.tag(testsentence.lower().split())]))
print("\n".join([str(x) for x in st.tag(testsentence.split())]))
#print(testsentence.lower())

In [None]:
# NOTE(review): `heavy` is not assigned in any visible cell. Presumably it is
# the second return value of characterize_links_by_comment (the tweets with
# more than 4 comment words) -- TODO confirm and add the call that creates it.
# Picks one random element of `heavy` as the next sentence to tag.
testsentence = random.choices(population=heavy, k=1)[0]
print(testsentence)

In [None]:
%%time
print("\n".join([str(x) for x in st.tag(testsentence.split())]))

In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk

In [None]:
print(ne_chunk(pos_tag(word_tokenize(testsentence))))

In [None]:
import spotlight

In [None]:
annotations = spotlight.annotate('http://model.dbpedia-spotlight.org/en/annotate',
                                'trump defeated clinton.',
                                 confidence=0.7, support=20)

In [None]:
print(annotations)