In [2]:
import sklearn
import pickle
from collections import defaultdict
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import random, string, itertools
import numpy as np

In [3]:
#reading in the fake news tweets
fn_tweets = pickle.load(open("fn_seen.p", "rb"))
len(fn_tweets)

20600

In [11]:
#reading in the MSM tweets:
msm_tweets = pickle.load(open('msm_seen.p',"rb"))
print(len(msm_tweets))
msm_tweets_sample = random.choices(population= list(msm_tweets.values()), k=50000)
print(len(msm_tweets_sample))

446100
50000


In [12]:
def process(tweet, operations, cgrams, wgrams):
    r = tweet
    grams = []
    
    if 'cap' in operations:
        r = tweet.lower()
        
    if 'spaces' in operations:
        r = r.split()
        r = " ".join([w.strip() for w in r])
        
    if 'rt' in operations:
        r = r.split()
        r = " ".join([w for w in r if not w.lower() == "rt"])
    
    if 'mentions' in operations:
        r = r.split()
        r = " ".join([w for w in r if not w.startswith("@")])
        
    if 'urls' in operations:
        r = r.split()
        r = " ".join([w for w in r if not w.startswith("http")])
        
    if 'poses' in operations:
        r = r.replace("'s", "") #removes posessives
        
    if 'punct' in operations:
        r = "".join([c for c in r if not c in string.punctuation])
        
    if 'boundries' in operations:
        r = "< " + r + " >"
        
    if 'char-grams' in operations:
        for leng in cgrams:
            grams.extend([r[i:i+leng] for i in range(len(r)-leng+1)])
            
    if 'word-grams' in operations:
        r = r.split()
        for leng in wgrams:
            lgrams = [r[i:i+leng] for i in range(len(r)-leng+1)]
            grams.extend([" ".join(g) for g in lgrams])
    
    if grams== []:
        return r
    else:
        return grams
    


In [13]:
alltweets = [x for x in itertools.chain(fn_tweets.values(), msm_tweets_sample)]
print("number of all tweets: ", len(alltweets))

number of all tweets:  70600


In [14]:
alllinks = [y for x in itertools.chain(fn_tweets.values(), msm_tweets.values()) for y in x[5]]
print("number of all links: ", len(alllinks))

number of all links:  489259


In [15]:
from re import sub, MULTILINE
def remove_url_corpus(text):
    new_text = sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', text, flags=MULTILINE)
    return new_text

In [None]:
#testing URL removal
for t in alltweets[0:200]: 
    l = t[5][0]
    print("tweet before removing url:")
    print(t[4])
    print("\ntweet after removing url:")
    t_after = remove_url_corpus(t[4]) 
    print(t_after)
    print(len(t[4]), len(t_after))
    assert len(t[4]) != len(t_after)
    print("*"*50)

In [16]:
operations = ['punct', 'spaces', 'mentions', 'urls', 'char-grams', 'boundries',  'word-grams']
process(alltweets[16][4], operations, [1], [2,3])

['<',
 ' ',
 '🎥',
 ' ',
 'G',
 'o',
 't',
 't',
 'a',
 ' ',
 'G',
 'i',
 'v',
 'e',
 ' ',
 't',
 'h',
 'i',
 's',
 ' ',
 'w',
 'o',
 'r',
 'l',
 'd',
 ' ',
 'b',
 'a',
 'c',
 'k',
 ' ',
 't',
 'o',
 ' ',
 'G',
 'o',
 'd',
 ' ',
 'o',
 'f',
 ' ',
 'A',
 'b',
 'r',
 'a',
 'h',
 'a',
 'm',
 ' ',
 'I',
 's',
 'a',
 'a',
 'c',
 ' ',
 'a',
 'm',
 'p',
 ' ',
 'J',
 'a',
 'c',
 'o',
 'b',
 ' ',
 '✝',
 '🙏',
 '✡',
 '️',
 ' ',
 '>',
 '< 🎥',
 '🎥 Gotta',
 'Gotta Give',
 'Give this',
 'this world',
 'world back',
 'back to',
 'to God',
 'God of',
 'of Abraham',
 'Abraham Isaac',
 'Isaac amp',
 'amp Jacob',
 'Jacob ✝🙏✡️',
 '✝🙏✡️ >',
 '< 🎥 Gotta',
 '🎥 Gotta Give',
 'Gotta Give this',
 'Give this world',
 'this world back',
 'world back to',
 'back to God',
 'to God of',
 'God of Abraham',
 'of Abraham Isaac',
 'Abraham Isaac amp',
 'Isaac amp Jacob',
 'amp Jacob ✝🙏✡️',
 'Jacob ✝🙏✡️ >']

In [103]:
%%time
vecs_to_ids = dict()
operations = ['spaces', 'mentions', 'urls', 'punct', 'poses', 'rt', 'word-grams']
for t in fn_tweets.values():
    v = process(t[4], operations, [], [2,3])
    vecs_to_ids[tuple(v)] = ('fake', t[3])
    
for t in msm_tweets_sample:
    v = process(t[4], operations, [], [2,3])
    vecs_to_ids[tuple(v)] = ('msm', t[3])

Wall time: 19 s


In [75]:
%%time
#constructing vectors of words only
grams1_to_ids = dict()
operations = ['punct', 'poses', 'spaces', 'mentions', 'urls', 'word-grams']
for t in fn_tweets.values():
    v = process(t[4], operations, [], [1])
    grams1_to_ids[tuple(v)] = ('fake', t[3])
    
for t in msm_tweets_sample:
    v = process(t[4], operations, [], [1])
    grams1_to_ids[tuple(v)] = ('msm', t[3])

Wall time: 14 s


In [62]:
print(len(alltweets))
print(len(vecs_to_ids))
print("count of fake news tweets after removing duplicates: ", len([x for x in vecs_to_ids.values() if x[0] == 'fake']))
print("count of msm tweets after removing duplicates: ", len([x for x in vecs_to_ids.values() if x[0] == 'msm']))

70600
47116
count of fake news tweets after removing duplicates:  10431
count of msm tweets after removing duplicates:  36685


In [104]:
def select_dimensions(instances, max_cutoff_ratio, min_cutoff_ratio):
    features = defaultdict(lambda: 0)
    for current_instance in instances:
        for d in set(current_instance):
            features[d] +=1
            
    max_cutoff = len(instances)*max_cutoff_ratio
    min_cutoff = len(instances)*min_cutoff_ratio
    print("\n\nNumber of instances: ", len(instances))
    print("Total number of possible features: ", len(features))
    print("All features occuring in", max_cutoff, "instances or more, and all features occuring in", min_cutoff, 
          "instances or fewer, will be dropped")
    features_above_cutoff = [f for f in features if features[f]>max_cutoff]
    features_below_cutoff =  [f for f in features if features[f]<min_cutoff]
    selected_features = sorted([f for f in features if features[f]>=min_cutoff and features[f]<= max_cutoff])
    print("\nfeatures above cutoff:\n", len(features_above_cutoff))
    print("Examples: 20 features that were above the cutoff:\n", features_above_cutoff[:20])
    print("\nfeatures below cutoff:\n",  len(features_below_cutoff))
    print("Examples: 20 features that were below the cutoff:\n", features_below_cutoff[:20], "\n\n")
    print("\nNumber of remaining features: ", len(selected_features), "\n\n")
    print("Examples: 20 features remaining:\n", selected_features[:20], "\n\n")
    return selected_features


def insts_to_vecs(insts, dimensions):
    vecs = np.zeros(shape = [len(insts), len(dimensions)], dtype='uint')
    for i, (inst, words) in enumerate(insts):
        t_start = time.time()
        print("building vector for instance:", inst)
        words_dict = {w[0]:w[1] for w in words}
        vec = np.array([0 if not d in words_dict else words_dict[d] for d in dimensions])
        vecs[i] = (vec)
        t = time.time()- t_start
        print("finished in ", t, "\n")
    return vecs    

In [105]:
%%time
vecs = []
labels = []
ids = []

for v, l in vecs_to_ids.items():
    vecs.append(v)
    labels.append(l[0])
    ids.append(l[1])
    
sg = select_dimensions(vecs, 1, 0)
print("\n\nnumber of dimensions: ", len(sg))



Number of instances:  46278
Total number of possible features:  829458
All features occuring in 46278 instances or more, and all features occuring in 0 instances or fewer, will be dropped

features above cutoff:
 0
Examples: 20 features that were above the cutoff:
 []

features below cutoff:
 0
Examples: 20 features that were below the cutoff:
 [] 



Number of remaining features:  829458 


Examples: 20 features remaining:
 ['0', '0 0', '0 0 trading', '0 0Apple', '0 0Apple confirms', '0 Story', '0 Story w', '0 These', '0 These kids', '0 about', '0 about legal', '0 coverage', '0 coverage And', '0 data', '0 data Net', '0 dead', '0 trading', '0 trading PG', '000 Venezuela', '000 dollars'] 




number of dimensions:  829458
Wall time: 3.55 s


In [106]:
%%time
vecs = []
labels = []
ids = []

for v, l in grams1_to_ids.items():
    vecs.append(v)
    labels.append(l[0])
    ids.append(l[1])
    
grams1 = select_dimensions(vecs, 0.9, 0.0005)
print("\n\nnumber of dimensions: ", len(grams1))



Number of instances:  46305
Total number of possible features:  71597
All features occuring in 41674.5 instances or more, and all features occuring in 23.1525 instances or fewer, will be dropped

features above cutoff:
 0
Examples: 20 features that were above the cutoff:
 []

features below cutoff:
 67665
Examples: 20 features that were below the cutoff:
 ['cocaine', 'amppowerbut', 'Snopes', 'FBIDOJ', 'newsso', 'smuggling', 'tremendous', 'daughters', 'therefore', 'McGraw', 'anti2A', 'husband', 'Socialism', 'Patriot', 'Gary', 'Sinise', 'Highest', 'Civilians', 'Proud', 'Honor'] 



Number of remaining features:  3932 


Examples: 20 features remaining:
 ['1', '10', '100', '1000', '10000', '100000', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1st', '2', '20', '200', '2013'] 




number of dimensions:  3932
Wall time: 506 ms


In [107]:
def mutual_information(vecs, grams, labels):
    labels_count = {l:0 for l in labels}
    grams_count = {g:{l:0 for l in labels} for g in grams}
    #print(labels_count)
    #print(grams_count)
    for k, v in vecs.items():
        #print("key: ",k)
        #print("val: ", v)
        vset = set(k)
        label= v[0]
        #print("label: ", label)
        for gram in k:
            if not k in grams_count:
                continue
            grams_count[gram][label] +=1
        labels_count[label] +=1
    
    print("labels count: ", labels_count)
    labels_prop = {l:(labels_count[l]/len(vecs)) for l in labels}
    print("labels_prop: ", labels_prop)
    mi = dict()
    for g, gcounts in grams_count.items():
        gcountsum = sum([x for x in gcounts.values()])
        if gcountsum == 0:
            gcountsum = 1
        #print("gcountsum: ", gcountsum)
        gprops = {l:((gcounts[l]/gcountsum) / labels_prop[l]) for l in labels}
        #print("ggroups: ", gprops)
        mi[g] = gprops
        
    return grams_count, mi
        

In [108]:
gc, mi= mutual_information(vecs_to_ids, sg, ['msm', 'fake'])

labels count:  {'msm': 36042, 'fake': 10236}
labels_prop:  {'msm': 0.7788149876831324, 'fake': 0.22118501231686763}


In [113]:
gc1, mi1 = mutual_information(grams1_to_ids, grams1, ['msm', 'fake'])

labels count:  {'msm': 36066, 'fake': 10239}
labels_prop:  {'msm': 0.7788791707159054, 'fake': 0.2211208292840946}


In [114]:
mi1_top_fake = sorted([x for x in mi1.items()], key=lambda x: x[1]['fake'], reverse=True)
mi1_top_msm = sorted([x for x in mi1.items()], key=lambda x: x[1]['msm'], reverse=True)


In [116]:
print("\n\n".join([str(x) for x in mi1_top_fake[0:1000]]))

('1', {'msm': 0.0, 'fake': 0.0})

('10', {'msm': 0.0, 'fake': 0.0})

('100', {'msm': 0.0, 'fake': 0.0})

('1000', {'msm': 0.0, 'fake': 0.0})

('10000', {'msm': 0.0, 'fake': 0.0})

('100000', {'msm': 0.0, 'fake': 0.0})

('11', {'msm': 0.0, 'fake': 0.0})

('12', {'msm': 0.0, 'fake': 0.0})

('13', {'msm': 0.0, 'fake': 0.0})

('14', {'msm': 0.0, 'fake': 0.0})

('15', {'msm': 0.0, 'fake': 0.0})

('16', {'msm': 0.0, 'fake': 0.0})

('17', {'msm': 0.0, 'fake': 0.0})

('18', {'msm': 0.0, 'fake': 0.0})

('19', {'msm': 0.0, 'fake': 0.0})

('1st', {'msm': 0.0, 'fake': 0.0})

('2', {'msm': 0.0, 'fake': 0.0})

('20', {'msm': 0.0, 'fake': 0.0})

('200', {'msm': 0.0, 'fake': 0.0})

('2013', {'msm': 0.0, 'fake': 0.0})

('2014', {'msm': 0.0, 'fake': 0.0})

('2015', {'msm': 0.0, 'fake': 0.0})

('2016', {'msm': 0.0, 'fake': 0.0})

('2017', {'msm': 0.0, 'fake': 0.0})

('2018', {'msm': 0.0, 'fake': 0.0})

('2019', {'msm': 0.0, 'fake': 0.0})

('2020', {'msm': 0.0, 'fake': 0.0})

('21', {'msm': 0.0, 'fake': 0

In [115]:
print("\n\n".join([str(x) for x in mi1_top_msm[0:1000]]))

('1', {'msm': 0.0, 'fake': 0.0})

('10', {'msm': 0.0, 'fake': 0.0})

('100', {'msm': 0.0, 'fake': 0.0})

('1000', {'msm': 0.0, 'fake': 0.0})

('10000', {'msm': 0.0, 'fake': 0.0})

('100000', {'msm': 0.0, 'fake': 0.0})

('11', {'msm': 0.0, 'fake': 0.0})

('12', {'msm': 0.0, 'fake': 0.0})

('13', {'msm': 0.0, 'fake': 0.0})

('14', {'msm': 0.0, 'fake': 0.0})

('15', {'msm': 0.0, 'fake': 0.0})

('16', {'msm': 0.0, 'fake': 0.0})

('17', {'msm': 0.0, 'fake': 0.0})

('18', {'msm': 0.0, 'fake': 0.0})

('19', {'msm': 0.0, 'fake': 0.0})

('1st', {'msm': 0.0, 'fake': 0.0})

('2', {'msm': 0.0, 'fake': 0.0})

('20', {'msm': 0.0, 'fake': 0.0})

('200', {'msm': 0.0, 'fake': 0.0})

('2013', {'msm': 0.0, 'fake': 0.0})

('2014', {'msm': 0.0, 'fake': 0.0})

('2015', {'msm': 0.0, 'fake': 0.0})

('2016', {'msm': 0.0, 'fake': 0.0})

('2017', {'msm': 0.0, 'fake': 0.0})

('2018', {'msm': 0.0, 'fake': 0.0})

('2019', {'msm': 0.0, 'fake': 0.0})

('2020', {'msm': 0.0, 'fake': 0.0})

('21', {'msm': 0.0, 'fake': 0

In [109]:
mi_top_fake = sorted([x for x in mi.items()], key=lambda x: x[1]['fake'], reverse=True)

In [110]:
print("\n\n".join([str(x) for x in mi_top_fake[0:1000]]))

('0', {'msm': 0.0, 'fake': 0.0})

('0 0', {'msm': 0.0, 'fake': 0.0})

('0 0 trading', {'msm': 0.0, 'fake': 0.0})

('0 0Apple', {'msm': 0.0, 'fake': 0.0})

('0 0Apple confirms', {'msm': 0.0, 'fake': 0.0})

('0 Story', {'msm': 0.0, 'fake': 0.0})

('0 Story w', {'msm': 0.0, 'fake': 0.0})

('0 These', {'msm': 0.0, 'fake': 0.0})

('0 These kids', {'msm': 0.0, 'fake': 0.0})

('0 about', {'msm': 0.0, 'fake': 0.0})

('0 about legal', {'msm': 0.0, 'fake': 0.0})

('0 coverage', {'msm': 0.0, 'fake': 0.0})

('0 coverage And', {'msm': 0.0, 'fake': 0.0})

('0 data', {'msm': 0.0, 'fake': 0.0})

('0 data Net', {'msm': 0.0, 'fake': 0.0})

('0 dead', {'msm': 0.0, 'fake': 0.0})

('0 trading', {'msm': 0.0, 'fake': 0.0})

('0 trading PG', {'msm': 0.0, 'fake': 0.0})

('000 Venezuela', {'msm': 0.0, 'fake': 0.0})

('000 dollars', {'msm': 0.0, 'fake': 0.0})

('000 dollars à', {'msm': 0.0, 'fake': 0.0})

('000 femmes', {'msm': 0.0, 'fake': 0.0})

('000 femmes victimes', {'msm': 0.0, 'fake': 0.0})

('000 filles'

In [111]:
mi_top_msm = sorted([x for x in mi.items()], key=lambda x: x[1]['msm'], reverse=True)

In [112]:
print("\n\n".join([str(x) for x in mi_top_msm[0:1000]]))

('0', {'msm': 0.0, 'fake': 0.0})

('0 0', {'msm': 0.0, 'fake': 0.0})

('0 0 trading', {'msm': 0.0, 'fake': 0.0})

('0 0Apple', {'msm': 0.0, 'fake': 0.0})

('0 0Apple confirms', {'msm': 0.0, 'fake': 0.0})

('0 Story', {'msm': 0.0, 'fake': 0.0})

('0 Story w', {'msm': 0.0, 'fake': 0.0})

('0 These', {'msm': 0.0, 'fake': 0.0})

('0 These kids', {'msm': 0.0, 'fake': 0.0})

('0 about', {'msm': 0.0, 'fake': 0.0})

('0 about legal', {'msm': 0.0, 'fake': 0.0})

('0 coverage', {'msm': 0.0, 'fake': 0.0})

('0 coverage And', {'msm': 0.0, 'fake': 0.0})

('0 data', {'msm': 0.0, 'fake': 0.0})

('0 data Net', {'msm': 0.0, 'fake': 0.0})

('0 dead', {'msm': 0.0, 'fake': 0.0})

('0 trading', {'msm': 0.0, 'fake': 0.0})

('0 trading PG', {'msm': 0.0, 'fake': 0.0})

('000 Venezuela', {'msm': 0.0, 'fake': 0.0})

('000 dollars', {'msm': 0.0, 'fake': 0.0})

('000 dollars à', {'msm': 0.0, 'fake': 0.0})

('000 femmes', {'msm': 0.0, 'fake': 0.0})

('000 femmes victimes', {'msm': 0.0, 'fake': 0.0})

('000 filles'

In [None]:
from sklearn import linear_model
m = linear_model.LogisticRegression()