# Generate Tweets using Deep Learning

# Frame

India recently demonetized the Rs.500 and Rs.1000 notes. People, in large numbers, tweeted using #demonetisation tag. Can we use those tweets to generate a new tweet - one that could go viral ? 

# Acquire

Data was obtained using `tweezer`

This code is modified from [here](https://github.com/rouseguy/DeepLearningNLP_Py/blob/master/notebooks/6.%20Generate%20Tweets%20%23europython.ipynb)

In [27]:
#import required libraries
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import pandas as pd
import pymongo
from pymongo import MongoClient
import json

In [5]:
client = MongoClient()

In [6]:
db = client.tweets

In [7]:
collection = db.demonetization

In [17]:
df = pd.DataFrame(list(collection.find()))

In [18]:
df.columns

Index(['_id', 'contributors', 'coordinates', 'created_at', 'entities',
       'extended_entities', 'favorite_count', 'favorited', 'geo', 'id',
       'id_str', 'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_status_id_str', 'in_reply_to_user_id',
       'in_reply_to_user_id_str', 'is_quote_status', 'lang', 'metadata',
       'place', 'possibly_sensitive', 'quoted_status', 'quoted_status_id',
       'quoted_status_id_str', 'retweet_count', 'retweeted',
       'retweeted_status', 'source', 'text', 'truncated', 'user'],
      dtype='object')

In [19]:
df.head()

Unnamed: 0,_id,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,58465e07f63f6071f5091ac4,,,Fri Dec 02 23:58:29 +0000 2016,"{'hashtags': [{'indices': [105, 120], 'text': ...",,0,False,,804837173493108738,...,,,,92,False,"{'lang': 'en', 'in_reply_to_status_id_str': No...","<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @dhume: .@prasannara: Modi may be the only ...,False,"{'lang': 'en', 'profile_text_color': '333333',..."
1,58465e07f63f6071f5091ac5,,,Fri Dec 02 23:57:50 +0000 2016,"{'hashtags': [{'indices': [17, 32], 'text': 'D...",,0,False,,804837006513672192,...,,,,3,False,"{'lang': 'en', 'in_reply_to_status_id_str': No...","<a href=""https://roundteam.co"" rel=""nofollow"">...",RT @datta_pavan: #Demonetization #Humor #Story...,False,"{'lang': 'en', 'profile_text_color': '000000',..."
2,58465e07f63f6071f5091ac6,,,Fri Dec 02 23:51:40 +0000 2016,"{'hashtags': [{'indices': [105, 120], 'text': ...",,0,False,,804835454109372416,...,,,,92,False,"{'lang': 'en', 'in_reply_to_status_id_str': No...","<a href=""http://twitter.com/download/android"" ...",RT @dhume: .@prasannara: Modi may be the only ...,False,"{'lang': 'en', 'profile_text_color': '790C30',..."
3,58465e07f63f6071f5091ac7,,,Fri Dec 02 23:46:12 +0000 2016,"{'hashtags': [{'indices': [0, 9], 'text': 'Ngu...",,0,False,,804834080277729280,...,,8.036114e+17,8.036113808886497e+17,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",#Ngultrum- few #BorderAreas coping capacity to...,False,"{'lang': 'en', 'profile_text_color': '333333',..."
4,58465e07f63f6071f5091ac8,,,Fri Dec 02 23:45:28 +0000 2016,"{'hashtags': [{'indices': [0, 15], 'text': 'De...",,0,False,,804833896252788736,...,,,,0,False,,"<a href=""http://www.facebook.com/twitter"" rel=...",#DeMonetization proves that Money is Just A No...,False,"{'lang': 'en', 'profile_text_color': '333333',..."


In [20]:
df.text.head()

0    RT @dhume: .@prasannara: Modi may be the only ...
1    RT @datta_pavan: #Demonetization #Humor #Story...
2    RT @dhume: .@prasannara: Modi may be the only ...
3    #Ngultrum- few #BorderAreas coping capacity to...
4    #DeMonetization proves that Money is Just A No...
Name: text, dtype: object

In [21]:
#Extract only those tweets whose retweets + favorites are greater than 100

In [24]:
df_viral = df[(df.retweet_count + df.favorite_count)>100].text

# Refine

In [28]:
import re

#The following is done to tokenize the tweets into its appropriate form
#In particular, we try to capture some emoticons, HTML tags, Twitter @usernames (@-mentions), Twitter #hashtags, 
#URLs, numbers, words with and without dashes and apostrophes

#Source : https://marcobonzanini.com/2015/03/09/mining-twitter-data-with-python-part-2/
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [35]:
#Converting JSON format to a string containing all the tweets, which is then used to train and generate text


complete_tweets = ""
text = ''
for tweet in df_viral:        
    tokens = preprocess(tweet)
        
    for index,element in enumerate(tokens):
            
        #Removing '#' 
        if('#' in element):
                
            del tokens[index]
            text = text + ""
            continue
            
            
        #Removing the 'RT' tag
        elif('RT' in element):
                
            del tokens[index]
            text = text + ""
            continue
            
        #This character usually follows the 'RT' tag, so we remove it
        elif(':' in element):
                
            del tokens[index]
            text = text + ""
            continue
                
        text = text + " " + tokens[index]
    text = text + '\n'

In [37]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 146


In [38]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 120
step = 1
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

nb sequences: 858085


In [39]:
print('Vectorization...')
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Vectorization...


In [40]:
# build the model: 2 stacked LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [41]:
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [42]:
%%time

# train the model, output generated text after each iteration
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(X, y, batch_size=128, nb_epoch=2)

    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        print('\nGenerated')
        sys.stdout.write(generated)

        for i in range(20):
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 1
Epoch 1/2
 36224/858085 [>.............................] - ETA: 3257s - loss: 2.5988

KeyboardInterrupt: 