In [1]:
%matplotlib inline

import matplotlib.pyplot as plt

In [2]:
import string
import gzip
import json
import re
import dill

In [3]:
import numpy as np
import pandas as pd

In [4]:
import scipy
from scipy import interpolate

In [5]:
import sklearn.cluster
import sklearn.feature_extraction 
import sklearn.feature_extraction.text
import sklearn.metrics
import sklearn.preprocessing

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import auc
from sklearn.metrics import average_precision_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [6]:
import imblearn.over_sampling
from imblearn.over_sampling import SMOTE

In [7]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as VS

[nltk_data] Downloading package stopwords to /home/clb617/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [8]:
import spacy

nlp = spacy.load('en_core_web_sm')

In [9]:
# PATHS & CONSTANTS
#==============================================================================
DATA_PATH = '/scratch/olympus/projects/lead_follow_state_legislation/'

In [10]:
# Read the policy training set, pinning dtypes for the columns used below.
policy_tweets_ = pd.read_csv(
    '{}/machine_learning/training_datasets/emily_ar_data_combined.csv'.format(DATA_PATH),
    dtype=dict(text=str, political_bin=int, source=str, clean_text=str),
)
print("Original Size:", policy_tweets_.shape)

# Discard tweets whose text is 20 characters or shorter, then renumber rows 0..n-1.
policy_tweets = policy_tweets_.loc[policy_tweets_["text"].str.len() > 20]
policy_tweets.index = list(range(len(policy_tweets)))
print("Restricted Size:", policy_tweets.shape)

# Raw text (model input) and the `political_bin` column (model target).
policy_tweet_texts = policy_tweets["text"]
policy_y_data = policy_tweets["political_bin"]

Original Size: (59810, 4)
Restricted Size: (59740, 4)


In [11]:
# Read the topic-coded tweets; the listed columns are forced to str so that
# identifiers such as tweet_id are not parsed as numbers.
topic_tweets_ = pd.read_csv(
    '{}/machine_learning/training_datasets/MEDIA-to-train-and-predict.csv'.format(DATA_PATH),
    dtype=dict(clean_text_cnn=str, coder=str, dataset=str, tweet_id=str, text=str),
)
print("Original Size:", topic_tweets_.shape)

Original Size: (2623346, 9)


In [12]:
# Maps each major-topic code (as a string) to a contiguous 0-based class index.
cross_table = {
    '1': 0,
    '10': 9,
    '12': 10,
    '13': 11,
    '14': 12,
    '15': 13,
    '16': 14,
    '17': 15,
    '18': 16,
    '19': 17,
    '2': 1,
    '20': 18,
    '21': 19,
    '23': 20,
    '3': 2,
    '4': 3,
    '5': 4,
    '6': 5,
    '7': 6,
    '8': 7,
    '9': 8
}

# Topic codes in ascending numeric order. sorted() makes the order explicit;
# iterating a set() only happened to yield ascending ints and is not guaranteed.
mt_list = sorted(int(code) for code in cross_table)

# Human-readable labels, listed in the same ascending-code order as mt_list.
topic_labels = ['Economy', 'Civil Rights','Health','Agriculture',
                'Labor and Employment', 'Education', 'Environment', 
                'Energy', 'Immigration', 'Transportation', 'Law and Crime',
                'Social Welfare', 'Housing', 'Finance & Domestic Commerce',
                'Defense', 'Science, Tech and Communications', 'Foreign Trade',
                'International Affairs and Aid', 'Government Operations',
                'Public Lands and Water', 'Arts and Entertainment']

# zip() silently truncates on a length mismatch, so fail loudly instead.
assert len(mt_list) == len(topic_labels), "topic codes and labels are out of sync"

# Integer topic code -> label.
topic_label_index = dict(zip(mt_list, topic_labels))

In [13]:
# Tally tweets per major-topic code, then attach a readable label to each code
# ("unknown" for codes missing from topic_label_index).
topic_counts = topic_tweets_["majortopic01"].value_counts().to_frame()
topic_counts["label"] = [
    topic_label_index.get(int(code), "unknown")
    for code in topic_counts.index
]

topic_counts

Unnamed: 0,majortopic01,label
20.0,68061,Government Operations
16.0,55569,Defense
21.0,49245,Public Lands and Water
3.0,39191,Health
19.0,33471,International Affairs and Aid
12.0,32474,Law and Crime
15.0,31535,Finance & Domestic Commerce
1.0,28497,Economy
10.0,25887,Transportation
18.0,20186,Foreign Trade


In [14]:
# Topic code treated as the positive class; 12 is paired with 'Law and Crime' in topic_label_index.
target_topic = 12

In [15]:
# Keep rows that (1) come from one of the selected source datasets,
# (2) have text longer than 20 characters and (3) carry a positive topic code.
topic_tweets = topic_tweets_.loc[
    topic_tweets_["dataset"].isin(["all_tweets", "tweets", "ar-db", "emily-db"])
]
topic_tweets = topic_tweets.loc[topic_tweets["text"].str.len() > 20]
topic_tweets = topic_tweets.loc[topic_tweets["majortopic01"] > 0]

# Renumber rows 0..n-1 so positions line up with freshly built Series.
topic_tweets.index = list(range(len(topic_tweets)))
print("Restricted Size:", topic_tweets.shape)

Restricted Size: (46620, 9)


In [16]:
# Raw text plus a binary target: 1 where the tweet's topic code equals target_topic, else 0.
topic_tweet_texts = topic_tweets.loc[:, "text"]
topic_y_data = pd.Series([
    int(int(code) == target_topic)
    for code in topic_tweets["majortopic01"]
])

In [17]:
# Start from NLTK's English stop words
enStop = stopwords.words('english')

# Also skip retweet signs, @ symbols, URL headers, HTML-entity leftovers,
# line breaks, and every individual punctuation character.
stopList = (
    enStop
    + ["http", "https", "rt", "@", ":", "t.co", "co", "amp", "&amp;", "...", "\n", "\r"]
    + list(string.punctuation)
)

In [18]:
# # def tokenizer_wrapper(text):
# #     return [t.lemma_ for t in nlp(text)]

# Single shared tokenizer instance, reused by every call below.
local_tokenizer = TweetTokenizer()
def tokenizer_wrapper(text):
    """Split `text` into tokens using the module-level TweetTokenizer."""
    tokens = local_tokenizer.tokenize(text)
    return tokens

In [19]:
# Generate Additional Features
# Shared VADER analyzer (nltk's SentimentIntensityAnalyzer, imported as VS); used by other_features().
sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Count the URLs, @-mentions and #hashtags in a piece of text.

    Runs of whitespace are collapsed to a single space first. Then each kind
    of object is replaced, in order, by a placeholder:
    1) urls with URLHERE
    2) mentions with MENTIONHERE
    3) hashtags with HASHTAGHERE

    Replacing URLs first means that '@' or '#' characters inside a URL are
    not counted as mentions or hashtags. This gives standardized counts
    without caring about the specific people or links mentioned.

    The counts are the number of substitutions actually performed, so text
    that already contains the literal word "URLHERE" (etc.) is not miscounted,
    and a URL that directly follows '@' or '#' is still counted as a URL.

    Parameters:
        text_string (str): raw tweet text.

    Returns:
        tuple: (num_urls, num_mentions, num_hashtags)
    """
    # Raw strings: '\s', '\w' and '\(' are invalid escapes in normal literals.
    space_pattern = r'\s+'
    giant_url_regex = (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        r'[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = r'@[\w\-]+'
    hashtag_regex = r'#[\w\-]+'

    parsed_text = re.sub(space_pattern, ' ', text_string)
    # re.subn returns (new_text, number_of_substitutions).
    parsed_text, num_urls = re.subn(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text, num_mentions = re.subn(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text, num_hashtags = re.subn(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return (num_urls, num_mentions, num_hashtags)

## Taken from Davidson et al.
def other_features(tweet_text):
    """Build the hand-crafted (non-TF-IDF) feature vector for one text.

    The returned list is ordered to match `other_features_names`:
        num_chars        - total characters across all tokens
        num_chars_total  - length of the raw text
        num_terms        - number of whitespace-separated terms
        num_words        - number of tokens from the tweet tokenizer
        num_unique_terms - number of distinct lower-cased tokens
        neg / pos / neu / compound - VADER polarity scores
        hashtag, mention and URL counts from count_twitter_objs()

    Parameters:
        tweet_text (str): raw text of a tweet (or any short document).

    Returns:
        list: 12 numeric features.
    """

    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet_text)

    words = local_tokenizer.tokenize(tweet_text) #Get text only

    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet_text)
    num_terms = len(tweet_text.split())
    num_words = len(words)
    num_unique_terms = len({w.lower() for w in words})

    # The capital-letter count/ratio that used to be computed here was never
    # returned, and its division by len(tweet_text) raised ZeroDivisionError
    # on empty text; it has been removed.

    # count_twitter_objs returns (urls, mentions, hashtags)
    twitter_objs = count_twitter_objs(tweet_text) #Count #, @, and http://

    features = [num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], 
                sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0]
               ]

    return features

# Column names for the list returned by other_features(), in the same order.
other_features_names = [
    "num_chars",
    "num_chars_total",
    "num_terms",
    "num_words",
    "num_unique_words",
    "vader neg",
    "vader pos",
    "vader neu",
    "vader compound",
    "num_hashtags",
    "num_mentions",
    "num_urls",
]

In [20]:
# Load a previously fitted vectorizer from disk (presumably TF-IDF, given the file name and the later use of .idf_).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only load trusted artifacts.
# NOTE(review): absolute, user-specific path; consider defining it next to DATA_PATH.
vectorizer_ = joblib.load("/home/clb617/Development/trecis/models/2013to2016_tfidf_vectorizer_20190109.pkl")

# tmp_vect = vectorizer
tmp_vect = vectorizer_



In [21]:
# term -> column index, following the vectorizer's feature order
vocab = dict(
    (term, col) for col, term in enumerate(tmp_vect.get_feature_names())
)
# IDF weight per column, as stored on the fitted vectorizer
idf_vals = tmp_vect.idf_
# column index -> IDF score
idf_dict = dict((col, idf_vals[col]) for col in vocab.values())

In [22]:
# Fixed seed for NumPy's global random state, so runs are repeatable.
r_state = 1337

np.random.seed(r_state)

In [23]:
# Vectorize the policy tweets; toarray() materialises the full dense matrix in memory.
policy_tfidf = tmp_vect.transform(policy_tweet_texts).toarray()
# One row of hand-crafted features per tweet, computed by other_features().
policy_other_ftr_data = np.array([other_features(tweet) for tweet in policy_tweet_texts])

In [24]:
# Final design matrix: vectorizer columns first, hand-crafted feature columns after.
policy_X_data = np.concatenate((policy_tfidf, policy_other_ftr_data), axis=1)

In [25]:
# Keyword arguments passed to BernoulliNB when the policy classifier is built.
policy_nb_params = dict(
    alpha=0.05134305647695325,
    binarize=0.045909955637688404,
    fit_prior=True,
)

In [26]:
# Build a Bernoulli naive Bayes classifier from policy_nb_params and fit it on the
# combined feature matrix. Its `binarize` threshold is applied to every column,
# including the hand-crafted count and sentiment features.
policy_fitted_model = BernoulliNB(**policy_nb_params)
policy_fitted_model.fit(policy_X_data, policy_y_data)

BernoulliNB(alpha=0.05134305647695325, binarize=0.045909955637688404,
      class_prior=None, fit_prior=True)

In [27]:
# Convert JSON to a tweet dict, returning None on fail for filtering
def json_str_to_tweet(json_str):
    """Parse one line of JSON into a tweet dict.

    Parameters:
        json_str (str): a single JSON document.

    Returns:
        dict or None: the decoded object, or None when the input cannot be
        parsed or does not decode to a JSON object. Callers filter out the
        None values before reading tweet fields.
    """
    try:
        tweet = json.loads(json_str)
    except (TypeError, ValueError, RecursionError):
        # Malformed, non-string or pathologically nested input: skip the record.
        # (json.JSONDecodeError is a ValueError subclass.)
        return None

    # Valid JSON that is not an object (e.g. a bare number or a list) cannot
    # be a tweet and would break the field lookups downstream.
    if not isinstance(tweet, dict):
        return None

    return tweet

def get_tweet_text(tweet):
    """Return (tweet id, text), preferring "full_text" over "text" when present."""
    text_key = "full_text" if "full_text" in tweet else "text"
    text = tweet[text_key]
    return (tweet["id"], text)

In [28]:
# Broadcast the fitted vectorizer; vectorize_tweets() reads it back via vect_broad.value.
# NOTE(review): `sc` is not defined anywhere in the visible notebook; presumably a
# SparkContext injected by the kernel. Confirm, or create it explicitly.
vect_broad = sc.broadcast(tmp_vect)

In [29]:
# from utils.tokenizer import tokenizer_wrapper as tw

In [61]:
def vectorize_tweets(local_texts):
    """Turn a list of texts into the dense feature matrix used by the classifier.

    Columns are the broadcast vectorizer's output followed by the
    hand-crafted features from other_features(), one row per text.
    `local_texts` is iterated twice, so it must be re-iterable (e.g. a list).
    """
    # Vectorizer columns, densified.
    tfidf_block = vect_broad.value.transform(local_texts).toarray()

    # Hand-crafted columns, one row per text.
    extra_block = np.array([other_features(text) for text in local_texts])

    return np.concatenate((tfidf_block, extra_block), axis=1)

def classify_tweets(tweet_iterator, clf):
    """Label every (id, text) pair from an iterator with `clf`.

    Parameters:
        tweet_iterator: iterable of (id, text) pairs.
        clf: fitted classifier exposing predict().

    Returns:
        An iterator of (id, label) pairs; an empty iterator when the input
        yields nothing.
    """
    # Materialise the input first: all texts are vectorized in one batch.
    pairs = [(tid, ttext) for tid, ttext in tweet_iterator]

    if not pairs:
        return iter([])

    tweet_ids = [tid for tid, _ in pairs]
    local_texts = [ttext for _, ttext in pairs]

    local_X_data = vectorize_tweets(local_texts)
    labels = clf.predict(local_X_data).tolist()
    return zip(tweet_ids, labels)


In [None]:
# Parse each input line as tweet JSON, drop lines that fail to parse, extract
# (id, text) and label every partition with the fitted policy model.
# NOTE(review): this reads the *random* timeline sample, yet the labels from this
# pipeline are written to "policy_labels_twitter_political.csv" a few cells
# below. Confirm the intended input path.
poli_sample_rdd_ = sc.textFile("usertimeline/random_us_timelines_sample")\
    .map(json_str_to_tweet)\
    .filter(lambda t: t is not None)\
    .map(get_tweet_text)\
    .mapPartitions(lambda i: classify_tweets(i, policy_fitted_model))

In [None]:
# Run the pipeline and bring every (tweet_id, label) pair back to the driver as a list.
poli_sample_ = poli_sample_rdd_.collect()

In [None]:
# Tabulate the collected (tweet_id, label) pairs and save them without the row index.
policy_labels_df = pd.DataFrame(poli_sample_, columns=["tweet_id", "label"])
policy_labels_df.to_csv("policy_labels_twitter_political.csv", index=False)

In [None]:
# Number of tweets predicted for each label.
policy_labels_df["label"].value_counts()

In [31]:
# Parse each line of the random timeline sample as tweet JSON, drop lines that
# fail to parse, extract (id, text) and label every partition with the policy model.
poli_sample_rdd_ = sc.textFile("usertimeline/random_us_timelines_sample")\
    .map(json_str_to_tweet)\
    .filter(lambda t: t is not None)\
    .map(get_tweet_text)\
    .mapPartitions(lambda i: classify_tweets(i, policy_fitted_model))

In [32]:
# Run the pipeline and bring every (tweet_id, label) pair back to the driver as a list.
poli_sample_ = poli_sample_rdd_.collect()

In [33]:
# Tabulate the collected (tweet_id, label) pairs and save them without the row index.
policy_labels_df = pd.DataFrame(poli_sample_, columns=["tweet_id", "label"])
policy_labels_df.to_csv("policy_labels_twitter_random.csv", index=False)

In [36]:
# Number of tweets predicted for each label.
policy_labels_df["label"].value_counts()

0    6142464
1    4386260
Name: label, dtype: int64

In [31]:
# Read CSV of tweets
# NOTE(review): hard-coded absolute path. The truncated output under this cell looks
# like the tail of a pandas warning, presumably about mixed column dtypes; consider
# passing explicit dtypes. Confirm.
ira_df = pd.read_csv("/scratch/olympus/projects/elections_integrity/ira/tweets/ira_tweets_csv_hashed_orig.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
# Peek at the first five (tweetid, tweet_text) pairs; t[0] is the row index.
[(t[1], t[2]) for t in ira_df[["tweetid", "tweet_text"]].head().itertuples()]

[(877919995476496385,
  'RT @ruopentwit: ⚡️У НАС НОВОЕ ВИДЕО! Американец: "Если бы не 27 миллионов русских, я бы сейчас говорил по-немецки" https://t.co/mAcCirn4o1…'),
 (492388766930444288, 'Серебром отколоколило http://t.co/Jaa4v4IFpM'),
 (719455077589721089, '@kpru С-300 в Иране https://t.co/elnu3qLUW7'),
 (536179342423105537,
  'Предлагаю судить их за поддержку нацизма, т.к. они отказались его осуждать!! #STOPNazi'),
 (841410788409630720,
  'Предостережение американского дипломата https://t.co/fKPBVgIoVc')]

In [35]:
# Build (tweetid, tweet_text) pairs on the driver, skipping rows where either
# value is missing, and distribute them as an RDD with 512 partitions.
ira_sample_rdd_ = sc.parallelize(
    [(t[1], t[2]) for t in ira_df[["tweetid", "tweet_text"]].dropna().itertuples()],
    512
)

In [None]:
# Label each partition with the policy model and collect all (tweet_id, label) pairs on the driver.
ira_poli_labels = ira_sample_rdd_\
    .mapPartitions(lambda i: classify_tweets(i, policy_fitted_model))\
    .collect()

In [37]:
# Tabulate the collected (tweet_id, label) pairs and save them without the row index.
ira_policy_labels_df = pd.DataFrame(ira_poli_labels, columns=["tweet_id", "label"])
ira_policy_labels_df.to_csv("policy_labels_twitter_ira.csv", index=False)

In [38]:
# Number of tweets predicted for each label.
ira_policy_labels_df["label"].value_counts()

1    7108631
0    1932675
Name: label, dtype: int64

## Apply This To Reddit

In [39]:
# Convert JSON to a dict, returning None on fail for filtering
def json_str_to_reddit_sub(json_str):
    """Parse one line of JSON into a Reddit submission dict.

    Parameters:
        json_str (str): a single JSON document.

    Returns:
        dict or None: the decoded submission, or None when the input cannot
        be parsed, is not a JSON object, or has no "is_self" field.
        Callers filter out the None values.
    """
    try:
        submission = json.loads(json_str)
    except (TypeError, ValueError, RecursionError):
        # Malformed, non-string or pathologically nested input: skip the record.
        # (json.JSONDecodeError is a ValueError subclass.)
        return None

    # Non-object JSON (numbers, lists, ...) used to slip through: the membership
    # test either raised inside the old bare `except` (leaving the parsed value
    # in place) or matched a list element. Reject it explicitly.
    if not isinstance(submission, dict):
        return None

    # Throw away comments. All submissions should have an is_self field
    if "is_self" not in submission:
        return None

    return submission


In [47]:
def get_reddit_text(sub):
    """Return (submission id, text) for a Reddit submission dict.

    The text is the title (blank when the title is "[deleted]"), followed by
    ". " and the selftext when a non-empty, non-"[deleted]" selftext exists.
    """
    sub_id = sub["id"]
    title = sub["title"]
    text = "" if title == "[deleted]" else title

    if "selftext" in sub:
        body = sub["selftext"]
        if len(body) > 0 and body != "[deleted]":
            text = ". ".join([text, body])

    return (sub_id, text)

In [63]:
# Label three Reddit corpora with the policy model. Each pair is
# (output CSV, input path). Note that policy_labels_df is overwritten on every
# pass, so only the last corpus remains in memory afterwards.
for out_csv, in_path in (
    ("policy_labels_reddit_random.csv", "usertimeline/reddit_random_sample"),
    ("policy_labels_reddit_poli.csv", "usertimeline/reddit_political_sample"),
    ("policy_labels_reddit_ira.csv", "reddit_troll_submissions"),
):
    print("Processing:", in_path)

    # Parse submissions, drop comments and unparseable lines, build (id, text),
    # skip empty texts, then classify partition by partition.
    local_sample_rdd = (
        sc.textFile(in_path)
        .map(json_str_to_reddit_sub)
        .filter(lambda t: t is not None)
        .map(get_reddit_text)
        .filter(lambda t: len(t[1]) > 0)
        .mapPartitions(lambda i: classify_tweets(i, policy_fitted_model))
    )

    local_sample_ = local_sample_rdd.collect()

    policy_labels_df = pd.DataFrame(local_sample_, columns=["sub_id", "label"])
    policy_labels_df.to_csv(out_csv, index=False)

    print(policy_labels_df["label"].value_counts())

Processing: reddit_troll_submissions
1    9121
0    4767
Name: label, dtype: int64


In [69]:
# Stop `sc` (presumably the SparkContext); cells that use `sc` cannot run after this.
sc.stop()

In [None]:
# Short link for each submission id, handy for spot-checking labels by hand.
policy_labels_df["url"] = policy_labels_df["sub_id"].map(
    lambda sub_id: "http://redd.it/" + sub_id
)

In [68]:
# Display the labelled submissions left in policy_labels_df by the Reddit loop.
policy_labels_df

Unnamed: 0,sub_id,label,url
0,k2446,0,http://redd.it/k2446
1,k3s6p,0,http://redd.it/k3s6p
2,k7dmo,0,http://redd.it/k7dmo
3,kc026,1,http://redd.it/kc026
4,kcz44,1,http://redd.it/kcz44
5,ki807,1,http://redd.it/ki807
6,sfp8h,1,http://redd.it/sfp8h
7,v22nt,1,http://redd.it/v22nt
8,wr1in,1,http://redd.it/wr1in
9,10bzl6,0,http://redd.it/10bzl6


In [70]:
# Inspect the dtype pandas inferred for each column of ira_df.
ira_df.dtypes

tweetid                       int64
userid                       object
user_display_name            object
user_screen_name             object
user_reported_location       object
user_profile_description     object
user_profile_url             object
follower_count                int64
following_count               int64
account_creation_date        object
account_language             object
tweet_language               object
tweet_text                   object
tweet_time                   object
tweet_client_name            object
in_reply_to_tweetid         float64
in_reply_to_userid           object
quoted_tweet_tweetid        float64
is_retweet                     bool
retweet_userid               object
retweet_tweetid             float64
latitude                    float64
longitude                   float64
quote_count                 float64
reply_count                 float64
like_count                  float64
retweet_count               float64
hashtags                    