# 1 Load packages used

In [1]:
# Libraries to be used

# Keras imports
from keras.models import Sequential
from keras.layers import SimpleRNN, Embedding, LSTM, Bidirectional
from keras.layers import Dense, TimeDistributed, Activation, Dropout
from keras import optimizers
from keras import backend as K

# Callbacks for training
from keras.callbacks import EarlyStopping

# Gensim models
from gensim.models import Word2Vec

# Ploting
import matplotlib.pyplot as plt
from matplotlib.pyplot import stem
from sklearn.manifold import TSNE
%matplotlib inline

# Misc imports #
import pandas as pd
import numpy as np
import copy
import math

import re



Using TensorFlow backend.


# 2 Read csv data

In [5]:
twitter_data = pd.read_csv('TrumpTweets.csv')
print("Shape of dataset: "+str(twitter_data.shape))
twitter_data.head(5)

Shape of dataset: (29246, 7)


Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,Wow so many Fake News stories today. No matter...,10-04-2017 11:29:43,9898.0,37312,False,9.155394e+17
1,Twitter for iPhone,A great day in Puerto Rico yesterday. While so...,10-04-2017 10:25:58,5493.0,28436,False,9.155234e+17
2,Twitter for iPhone,My Administration will continue to work around...,10-04-2017 00:53:10,9208.0,41079,False,9.153792e+17
3,Twitter for iPhone,RT @PressSec: .@POTUS and @FLOTUS meet w/ some...,10-04-2017 00:28:24,5631.0,0,True,9.15373e+17
4,Twitter for iPhone,RT @seanhannity: Tonight the truth about how d...,10-04-2017 00:27:11,7427.0,0,True,9.153727e+17


In [45]:
import re
def text_cleanup(text):
    # Define allowed characters in text
    cap = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    low = cap.lower()
    spc = '.,#@/!?:\' '
    nbr = '0123456789'
    allowed_chars = cap + low + spc + nbr
    
    # Iterate though text and keep only allowed characters
    new = ""
    for character in text:
        if character in allowed_chars:
            new += character
        else:
            new += " "
    # Remove consecutive spaces
    new = re.sub(" +"," " ,new)
    # Remove space at end of list
    if new[-1]==" ":
        new = new[0:-1]
    return new

# Formatting text
def text_format(text):
    text = add_space_after(text,"!")
    text = add_space_before(text,"#")
    text = add_space_before(text,"@")
    text = add_space_after(text,",");
    # Remove consecutive spaces
    text = re.sub(" +"," " ,text)
    # Remove space at end of list
    if text[-1]==" ":
        text = text[0:-1]
    return text

# Add space after selected sign in text
def add_space_after(text,sign):
    text = text.replace(sign,sign + " ")
    return text

# Add space before selected sign in text
def add_space_before(text,sign):
    text = text.replace(sign," "+ sign)
    return text

def add_spaces_around(text):
    space = ',!?.'
    for i in space:
        text = text.replace(i," " + i + " ")
    # Remove consecutive spaces
    text = re.sub(" +"," " ,text)
    # Remove space at end of list
    if text[-1]==" ":
        text = text[0:-1]   
    return text
        
# Extract and replace hashtags in list
def remove_hashtags(text):
    # Set hashtag holder
    hashtagholder = "#TAG"
    # Declare variable
    hashtags = [];
    text_out = "";
    # Split text by spaces
    text_vec = text.split(" ")
    
    # For all words in the text vector
    for word in text_vec:
        # If there exist an hashtag
        if word.find("#")>-1:
            # Append hashtag
            hashtags.append(word)
            
            # Replace hashtag with hashtagholder in text
            if len(text_out)<1:
                text_out = hashtagholder
            else:
                text_out = text_out + " " + hashtagholder
        else:
            # Add the word to the text vector
            if len(text_out)<1:
                text_out = word
            else:
                text_out = text_out + " " + word
                
    # Return hashtags and modified text
    return hashtags,text_out

In [46]:
def remove_ats(text):
    # Set @ holder
    atholder = "@PERSON"
    # Initiate variables
    ats = [];
    text_out = "";
    # Split text on spaces
    text_vec = text.split(" ")
    # For each word in text
    for word in text_vec:
        # If an @ exist
        if word.find("@")>-1:
            # Append @
            ats.append(word)
            # Replace at with atholder
            if len(text_out)<1:
                text_out = atholder
            else:
                text_out = text_out + " " + atholder
                
        # Else add the word to the output vector
        else:
            if len(text_out)<1:
                text_out = word
            else:
                text_out = text_out + " " + word
    # Return ats and text        
    return ats,text_out

def remove_links(text):
    # Define link holder
    linkholder = 'HTTPSLINK'
    # Initiate variables
    links = [];
    text_out = "";
    
    
    # Find index where http starts
    st = text.find('http')
    # Find https
    if text.find('http')>-1:

        
        # Extract link part from the remaining text
        link_part = text[st:]
        
        # Split link part on spaces
        lnk = link_part.split(" ")
        
        # For all links in linklist
        for link in lnk:
            # If link is true link, else the link is ignored
            if link.find("http")>-1:
                links.append(link)
        text_out = text[:st-1] + (" " + linkholder)*len(links)
    else:
        text_out = text
    # Add text and linkholders to text
    
    return links,text_out

In [None]:
def to_sentences(raw_text):
    #Input: list of texts(tweets)
    #Output: list of sentences
    length=len(raw_text)
    sentences=[]
    #every line in raw text
    for tweet in range(len(raw_text)):
        #split into sentences
        try:
            tweet_split=raw_text[tweet].split('.')
        except:
            print('Failed on:' ,raw_text[tweet],'index:',tweet)
        #append all sentences
        for sentence in tweet_split:
            sentences.append(sentence)
    return sentences
def to_word_list(filtered_sentences):
    #Input: list of sentences
    #Output: list of unique words
    vocab=[]
    for sentence in filtered_sentences:
        sentence=sentence.lower()
        sentence=sentence.split(' ') 
        for word in sentence:
            if word not in vocab:
                vocab.append(word)
    return vocab

In [None]:


line = 'RT @DonnaWR8: @realDonaldTrump I        wonder what this BRAVE American would give to stand on his OWN two legs just ONCE MORE for our #Anthem?â€¦'
line = 'RT @DonnaWR8: .@POTUS #TRUMPðŸ‡ºðŸ‡¸ &amp; I @FLOTUSðŸŒºWhen ALL seemed HOPELESS...YOU brought HOPE!You INSPIRE us ALL!#MAGA #Harvey @Scavino45 #USAâ€¦'
#line = 'Hillarys Two Official Favors To Morocco Resulted In $28 Million For Clinton Foundation #DrainTheSwamphttps://t.co/6qOO7FZSvF'
#line = 'RT @TwitterData: These are the 10 most Tweeted about world leaders during the first day of #UNGA General Debate https://t.co/HhlOlNAkDJ'
line = '@davidsidol: Great meeting @realDonaldTrump today!. #TrumpGolf #TrumpNationalCharlotte #1stclass http://t.co/KDKPgyANGV'

line = 'It is time to rebuild OUR country to bring back OUR jobs to restore OUR dreams &amp; yes to put #AmericaFirst! TY Oâ€¦ https://t.co/2b2bXwxGkA'
line = 'RT @PERSON Under POTUS @PERSON ??S&amp;P 500 38th?Record High ??NASDAQ 44th?Record High?? #TAG HTTPSLINK'
print(type(line))
line = text_cleanup(line)
print(line)
line = text_format(line);
print(line)
x,line = remove_links(line)
line = add_spaces_around(line)
print(line)
hashtags, line = remove_hashtags(line)
print(line)
ats, line = remove_ats(line)
print(line)



In [47]:
#print(twitter_data)

tweets = twitter_data['text'].values

filter_ats = [];
filter_hashtags =[];
filter_links = [];
filter_tweets = [];
error_index = []
for i in range(len(tweets)):
    
    #if i == 73:
        #print(i)
        tweet = tweets[i]
        if type(tweet) == str:
            tweet_out = text_cleanup(tweet)
            tweet_out = text_format(tweet_out)
            links, tweet_out = remove_links(tweet_out)
            tweet_out = add_spaces_around(tweet_out)
            hashtags, tweet_out = remove_hashtags(tweet_out)
            ats, tweet_out = remove_ats(tweet_out)
            filter_tweets.append(tweet_out)
            filter_ats.append(ats)
            filter_hashtags.append(hashtags)
            filter_links.append(links)
        else:
            filter_tweets.append('')
            filter_ats.append([])
            filter_hashtags.append([])
            filter_links.append([])
            error_index.append(i)
#print(filter_tweets)


In [48]:
# Cerate dataframe with id and labels
print(len(filter_tweets))
print((tweets),len(filter_tweets))
print(error_index)
d = {'Original_tweet': tweets, 'filtered_tweet' : filter_tweets}
df = pd.DataFrame(d)
# Save to csv file
df.to_csv('out.csv', index=False)


29246
[ 'Wow so many Fake News stories today. No matter what I do or say they will not write or speak truth. The Fake News Media is out of control!'
 'A great day in Puerto Rico yesterday. While some of the news coverage is Fake most showed great warmth and friendship.'
 'My Administration will continue to work around the clock with Governor @RicardoRossello &amp; his team. Great progress being made! #PRStrong🇵🇷 https://t.co/1aL9YrwTvC'
 ...,
 'Donald Trump reads Top Ten Financial Tips on Late Show with David Letterman: http://tinyurl.com/ooafwn - Very funny!'
 'Donald Trump will be appearing on The View tomorrow morning to discuss Celebrity Apprentice and his new book Think Like A Champion!'
 'Be sure to tune in and watch Donald Trump on Late Night with David Letterman as he presents the Top Ten List tonight!'] 29246
[28872]


In [None]:
filter_hashtags

In [77]:
dta = twitter_data['is_retweet'].values
a = []
for i in range(len(dta)):
    #print(dta[i])
    if dta[i]=='true':
        a.append(1)
    else:
        a.append(0)


sum(a)


410

In [31]:
a = 'abs\''
for i in a:
    print(i)
print(a)
len(a)

a
b
s
'
abs'


4

In [None]:
ks = twitter_data.keys()
print(ks)

In [None]:
def remove_links(sentence):
    sent = []
    for word in sentence.split(" "):
        a = word.find('http')
        if a:
            sent.append(word)  
    lst = ' '.join(sent)
    return lst

tweets = twitter_data['text'].values

print(tweets[2])
sent = remove_links(tweets[2])
print(sent)


In [None]:
tweets = twitter_data['text'].values
sentences = [];

for i in range(len(tweets)):
    tweet = tweets[i]
    
    try:
        tweet = remove_links(tweet)
        idx = tweet.find(".")
    except:
        print(i)
        continue
    a = tweet.split(".")
    sentences.extend(a)

In [None]:
print(len(sentences))
sentences[10]
print(len(str_list))
sentences[11]