In [23]:
import sys, os, re, gzip, json, pickle, shutil, random, joblib

import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt

from collections import Counter

%matplotlib inline

In [2]:
# Directory that holds the pickled feature files (relative to the working directory).
data_path = '../data'

# Locations of the two feature pickles, built from data_path.
user_feature_path = f"{data_path}/user_features.pkl"
troll_feature_path = f"{data_path}/troll_features.pkl"

In [3]:
def getDataFromPickle(path):
    """Load and return the object stored in the pickle file at ``path``.

    Args:
        path: Filesystem path of the pickle file to read.

    Returns:
        The object that ``pickle.load`` reconstructs from the file.

    Raises:
        OSError: If the file cannot be opened.
        pickle.UnpicklingError: If the content is not a valid pickle.
    """
    # SECURITY: unpickling can execute arbitrary code -- only load trusted files.
    # The context manager closes the handle even when unpickling fails.
    with open(path, 'rb') as f:
        return pickle.load(f)

In [4]:
# Load the two pickled feature collections (one named for users, one for trolls).
# Later cells iterate over each collection and index every item by keys such as
# "text", "hashtags" and "entities".
user_feature_list = getDataFromPickle(user_feature_path)
troll_feature_list = getDataFromPickle(troll_feature_path)

In [5]:
# Tweet Length

def tweet_length_average(tag_list):
    """Return the mean character length of the "text" field over ``tag_list``.

    Args:
        tag_list: Sized collection of mappings, each with a "text" entry
            that supports ``len()``.

    Returns:
        The average of ``len(item["text"])`` as a float, or ``0.0`` when
        ``tag_list`` is empty (instead of raising ZeroDivisionError).
    """
    count = len(tag_list)
    if count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(len(item["text"]) for item in tag_list) / count

In [6]:
# Average tweet length for each collection.
print(f"User Length - {tweet_length_average(user_feature_list)}")
print(f"Troll Length - {tweet_length_average(troll_feature_list)}")

User Length - 78.6326408
Troll Length - 81.55916532433525


In [7]:
# Content Check

def tweet_ratio(tag_list, content):
    """Return the mean of the numeric field ``content`` over ``tag_list``.

    Args:
        tag_list: Sized collection of mappings.
        content: Key whose (numeric) value is read from every item,
            e.g. "emoji_ratio" or "link_ratio".

    Returns:
        The arithmetic mean of ``item[content]`` across all items, or
        ``0.0`` when ``tag_list`` is empty (instead of raising
        ZeroDivisionError).
    """
    count = len(tag_list)
    if count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    # The key is used exactly as given; the unused lower-cased copy was dropped.
    return sum(item[content] for item in tag_list) / count

In [8]:
# Mean "emoji_ratio" for each collection.
print(f"User emoji - {tweet_ratio(user_feature_list, 'emoji_ratio')}")
print(f"Troll emoji - {tweet_ratio(troll_feature_list, 'emoji_ratio')}")

User emoji - 4.1637813016123366e-05
Troll emoji - 0.002770416088366957


In [9]:
# Mean "link_ratio" for each collection.
print(f"User link - {tweet_ratio(user_feature_list, 'link_ratio')}")
print(f"Troll link - {tweet_ratio(troll_feature_list, 'link_ratio')}")

User link - 0.026402727728233225
Troll link - 0.06927286467383098


In [10]:
# Mean "user_ratio" (printed under the label "mentions") for each collection.
print(f"User mentions - {tweet_ratio(user_feature_list, 'user_ratio')}")
print(f"Troll mentions - {tweet_ratio(troll_feature_list, 'user_ratio')}")

User mentions - 0.029716354446216883
Troll mentions - 0.034544102036425044


In [11]:
# Mean "oov_ratio" for each collection.
print(f"User OOV - {tweet_ratio(user_feature_list, 'oov_ratio')}")
print(f"Troll OOV - {tweet_ratio(troll_feature_list, 'oov_ratio')}")

User OOV - 0.14487496356507615
Troll OOV - 0.12870352653639627


In [16]:
def tweet_content_count(tag_list, content):
    """Return the mean size of the field ``content`` over ``tag_list``.

    Args:
        tag_list: Sized collection of mappings.
        content: Key whose value supports ``len()`` in every item,
            e.g. 'oov_words' or 'hashtags'.

    Returns:
        The average of ``len(item[content])`` across all items, or ``0.0``
        when ``tag_list`` is empty (instead of raising ZeroDivisionError).
    """
    count = len(tag_list)
    if count == 0:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    # The key is used exactly as given; the unused lower-cased copy was dropped.
    return sum(len(item[content]) for item in tag_list) / count

In [18]:
# Average number of entries in 'oov_words' per item, for each collection.
print(f"User OOV Count - {tweet_content_count(user_feature_list, 'oov_words')}")
print(f"Troll OOV Count - {tweet_content_count(troll_feature_list, 'oov_words')}")

User OOV Count - 8.8745992
Troll OOV Count - 7.484375834981029


In [20]:
# Average number of entries in 'hashtags' per item, for each collection.
print(f"User Hashtag Count - {tweet_content_count(user_feature_list, 'hashtags')}")
print(f"Troll Hashtag Count - {tweet_content_count(troll_feature_list, 'hashtags')}")

User Hashtag Count - 0.163634
Troll Hashtag Count - 0.6883760143826675


In [31]:
def find_top_10_persons(tag_list):
    """Return the 10 most frequent PERSON entity names in ``tag_list``.

    Args:
        tag_list: Iterable of mappings, each with an 'entities' entry that
            is an iterable of strings. Only strings ending in ":PERSON"
            are counted.

    Returns:
        A list of at most 10 names (the entity string with its trailing
        ":PERSON" marker removed), most frequent first. Ties keep the order
        in which the names were first seen.
    """
    suffix = ":PERSON"
    # Strip only the trailing marker: str.replace would also delete any
    # ":PERSON" occurring inside the name itself.
    counts = Counter(
        entity[:-len(suffix)]
        for item in tag_list
        for entity in item['entities']
        if entity.endswith(suffix)
    )
    return [name for name, _ in counts.most_common(10)]

In [36]:
# Ten most frequent PERSON entities for each collection (a list of names is printed).
print(f"User Most Mentioned Person - {find_top_10_persons(user_feature_list)}")
print(f"Troll Most Mentioned Person - {find_top_10_persons(troll_feature_list)}")

User Most Mentioned Person - ['Lol', 'Obama', 'Michael_Jackson', 'Rihanna', 'Sarah_Palin', 'Austin', 'Chris', 'Lady_Gaga', 'Taylor_Swift', 'Wanna']
Troll Most Mentioned Person - ['Hillary', 'Donald_Trump', 'Clinton', 'Trump', 'Obama', 'Hillary_Clinton', 'Putin', 'Bill_Clinton', 'Ted_Cruz', "Donald_Trump_'s"]


In [33]:
def find_top_10_hashtags(tag_list):
    """Return the 10 most frequent hashtags found in ``tag_list``.

    Each item of ``tag_list`` is indexed with 'hashtags', and every element
    of that value is tallied. The result lists at most 10 hashtags, most
    frequent first; ties keep first-seen order.
    """
    tally = Counter(
        tag
        for record in tag_list
        for tag in record['hashtags']
    )
    return [tag for tag, _freq in tally.most_common(10)]

In [37]:
# Ten most frequent hashtags for each collection.
print(f"User Top 10 Hashtag - {find_top_10_hashtags(user_feature_list)}")
print(f"Troll Top 10 Hashtag - {find_top_10_hashtags(troll_feature_list)}")

User Top 10 Hashtag - ['#39;s', '#fb', '#FF', '#tcot', '#jobs', '#quote', '#followfriday', '#FollowFriday', '#1', '#39;']
Troll Top 10 Hashtag - ['#news', '#sports', '#politics', '#world', '#local', '#TopNews', '#MAGA', '#health', '#BlackLivesMatter', '#tcot']


In [35]:
def find_top_10_gpe(tag_list):
    """Return the 10 most frequent GPE entity names in ``tag_list``.

    Args:
        tag_list: Iterable of mappings, each with an 'entities' entry that
            is an iterable of strings. Only strings ending in ":GPE" are
            counted.

    Returns:
        A list of at most 10 names (the entity string with its trailing
        ":GPE" marker removed), most frequent first. Ties keep the order in
        which the names were first seen.
    """
    suffix = ":GPE"
    # Strip only the trailing marker: str.replace would also delete any
    # ":GPE" occurring inside the name itself.
    counts = Counter(
        entity[:-len(suffix)]
        for item in tag_list
        for entity in item['entities']
        if entity.endswith(suffix)
    )
    return [name for name, _ in counts.most_common(10)]

In [38]:
# Ten most frequent GPE entities for each collection.
# NOTE(review): the label says "Count", but find_top_10_gpe returns a list of names.
print(f"User GPE Count - {find_top_10_gpe(user_feature_list)}")
print(f"Troll GPE Count - {find_top_10_gpe(troll_feature_list)}")

User GPE Count - ['US', 'LA', 'Chicago', 'U.S.', 'New_York', 'Obama', 'America', 'San_Diego', 'Texas', 'Seattle']
Troll GPE Count - ['U.S.', 'US', 'America', 'Obama', 'Russia', 'Syria', 'Texas', 'China', 'Iran', 'California']
