In [None]:
from itertools import product
import random
import pickle
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textblob import TextBlob 
import numpy as np
import pandas as pd
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.models import load_model
from keras.utils import plot_model
from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
import sys
import io
from keras.callbacks import ModelCheckpoint
from twython import Twython
from twython import TwythonStreamer
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from datetime import datetime
import time
import random

In [None]:
class PreProcessor:
    """Text-cleaning pipeline: noise removal, stopword removal, normalization,
    stemming/lemmatization and regex tokenization.

    Every public method takes a string and returns a string, except
    ``wordTokenize`` which returns a list of words.
    """

    # Words removed in addition to NLTK's English stopword list.
    EXTRA_STOPWORDS = (
        'that', 'thats', 'oh', 'aww', 'mr', 'r', 'what', 'etc', 'hey',
        'within', 'foi', 'yeah', 'www', 'wa', 'em', 'am', 'i', 'me',
        'dialmformurderjpg',
    )

    def __init__(self, noise_chars):
        # Iterable of substrings deleted at the very end of removeNoise().
        # '@' and '#' may be listed here: mentions are stripped by regex
        # *before* these characters are deleted.
        self.noise_chars = noise_chars

    def removeNoise(self, string):
        """
        Lower-case the text and strip mojibake, character entity references,
        links, "(utc)" markers, @mentions and the configured noisy characters.

        Returns the cleaned string.
        """
        string = string.lower()
        # Mojibake is removed first: the first pattern ends in "&lt;", which
        # the entity stripping below would otherwise delete, making the
        # pattern unmatchable.
        string = re.sub("â€\x9d&lt;", "", string)
        string = re.sub("â€œ:", "", string)
        string = re.sub(r'&[a-zA-Z]+;', '', string)  # character entity references
        string = re.sub(r'http\S+', '', string)  # links
        string = re.sub(r'www\S+', '', string)
        string = re.sub(r"\xa0·", " ", string)
        # The text is already lower-cased and the parentheses must be literal;
        # the former pattern "(UTC)" was an upper-case regex group and could
        # never match.
        string = re.sub(r"\(utc\)", " ", string)
        # Leading "b" followed by whitespace (presumably residue of a bytes
        # literal in the raw data -- TODO confirm).
        string = re.sub(r'^b\s+', '', string)
        string = re.sub(r'@[A-Za-z0-9]+', '', string)  # @mentions
        # Word boundaries keep "please" from turning into "pleasee" and stop
        # the rewrite from firing inside longer words.
        string = re.sub(r'\bpleas\b', 'please', string)
        string = re.sub(r'\bdont\b', 'do not', string)
        for char in self.noise_chars:
            string = string.replace(char, '')
        return string

    def removeStopwords(self, string):
        """
        Remove stopwords (NLTK English corpus plus ``EXTRA_STOPWORDS``).

        The text is split on single spaces and re-joined with single spaces.
        """
        # A set gives O(1) membership tests instead of scanning a list per word.
        stop_words = set(stopwords.words('english'))
        stop_words.update(self.EXTRA_STOPWORDS)
        return " ".join(w for w in string.split(" ") if w not in stop_words)

    @staticmethod
    def _correct_spelling(words):
        """Return the TextBlob spelling correction of every word in *words*."""
        return [str(TextBlob(word).correct()) for word in words]

    def textNormalization(self, string):
        """
        Normalize text by shortening elongated character runs and correcting
        the spelling of each token.
        """
        # reduce_len=True is what actually shortens repeated characters; the
        # default TweetTokenizer() leaves them untouched.
        tokenizer = nltk.tokenize.TweetTokenizer(reduce_len=True)
        tokens = tokenizer.tokenize(string)
        # Spell-check afterwards: length reduction can still leave misspellings.
        return " ".join(self._correct_spelling(tokens))

    def stemWords(self, string):
        """
        Stem every space-separated word with the Porter stemmer, then
        spell-correct the stems (stemming can leave truncated words).
        """
        stemmer = PorterStemmer()
        stems = [stemmer.stem(w) for w in string.split(" ")]
        return " ".join(self._correct_spelling(stems))

    def lemmatizeWords(self, string):
        """
        Lemmatize every space-separated word with NLTK's WordNetLemmatizer,
        then spell-correct the result.
        """
        lemmatizer = WordNetLemmatizer()
        lemmas = [lemmatizer.lemmatize(w) for w in string.split(" ")]
        return " ".join(self._correct_spelling(lemmas))

    def wordTokenize(self, string):
        """
        Tokenize with a regular expression: return the list of maximal runs
        of ASCII letters found in *string*.
        """
        return re.findall(r"[a-zA-Z]+", string)

    def process(self, string):
        """
        Apply the whole pipeline and return the preprocessed string.

        Order: noise removal -> stopwords -> normalization -> stemming ->
        regex tokenization -> second noise/stopword pass (stemming and
        spelling correction can re-introduce both).
        """
        cleaned = self.removeNoise(string)
        cleaned_stopwords = self.removeStopwords(cleaned)
        normalized = self.textNormalization(cleaned_stopwords)
        # lemmatizeWords(normalized) is the alternative to stemming here.
        stemmed = self.stemWords(normalized)
        tokenized = " ".join(self.wordTokenize(stemmed))
        cleaned = self.removeNoise(tokenized)
        return self.removeStopwords(cleaned)

In [None]:
class Classifier:
    """Binary text classifier: preprocessing, vectorization and a
    bidirectional-LSTM Keras model restored from saved weights.
    """

    # Weights file loaded by predict() and evaluate().
    WEIGHTS_PATH = 'weights_base.best.hdf5'

    def __init__(self, pre_processor=None, max_features=20000, maxlen=100, tokenizer=None):
        """
        pre_processor: object exposing ``process(str) -> str``; a default
            PreProcessor is built when omitted.
        max_features: vocabulary size for the tokenizer and embedding layer.
        maxlen: length every sequence is padded/truncated to.
        tokenizer: optional already-fitted Keras tokenizer. When omitted,
            vectorize() fits a fresh one on its input (original behaviour).
        """
        self.preProcessor = pre_processor if pre_processor is not None else PreProcessor("#@,.?!¬-\''=()")
        self.max_features = max_features
        self.maxlen = maxlen
        self.tokenizer = tokenizer

    def prepare_data(self, X):
        """
        Preprocess and vectorize *X*.

        *X* is either a single string (one record) or an iterable of strings.
        For an iterable, the preprocessed texts are also saved to
        'preprocessed_data.pickle'.
        """
        # Explicit type check: iterating a str yields characters without
        # raising, so a try/except cannot tell one record from many.
        if isinstance(X, str):
            return self.vectorize([self.preProcessor.process(X)])

        preprocessed = [self.preProcessor.process(comment) for comment in X]
        with open('preprocessed_data.pickle', 'wb') as handle:
            pickle.dump(preprocessed, handle)
        return self.vectorize(preprocessed)

    def vectorize(self, X):
        """
        Turn preprocessed text into a zero-padded matrix of word indexes with
        one row per text and ``maxlen`` columns.

        A bare string is treated as a single text, not as a sequence of
        characters.
        """
        sentences = [X] if isinstance(X, str) else list(X)
        tokenizer = self.tokenizer
        if tokenizer is None:
            # NOTE(review): a tokenizer fitted here assigns word indexes from
            # the data being vectorized; they only agree with the saved
            # weights if training used the same vocabulary. Pass the training
            # tokenizer to __init__ to avoid this -- confirm with the
            # training notebook.
            tokenizer = text.Tokenizer(num_words=self.max_features)
            tokenizer.fit_on_texts(sentences)
        sequences = tokenizer.texts_to_sequences(sentences)
        return sequence.pad_sequences(sequences, maxlen=self.maxlen)

    def get_model(self):
        """
        Build and compile the network.

        Architecture: Input(maxlen) -> Embedding -> Bidirectional LSTM(50) ->
        GlobalMaxPool1D -> Dropout -> Dense(50, relu) -> Dropout ->
        Dense(1, sigmoid). Compiled with binary cross-entropy, Adam and the
        accuracy metric.
        """
        embed_size = 128
        inp = Input(shape=(self.maxlen, ))
        x = Embedding(self.max_features, embed_size)(inp)
        x = Bidirectional(LSTM(50, return_sequences=True))(x)
        x = GlobalMaxPool1D()(x)  # keeps the highest activation per feature
        x = Dropout(0.1)(x)  # regularization
        x = Dense(50, activation="relu")(x)
        x = Dropout(0.1)(x)
        # One output unit for the single class; change with the number of classes.
        x = Dense(1, activation="sigmoid")(x)
        model = Model(inputs=inp, outputs=x)
        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        return model

    def _restore_model(self):
        """Build the network, load WEIGHTS_PATH into it, store it on self.model."""
        self.model = self.get_model()
        self.model.load_weights(self.WEIGHTS_PATH)
        return self.model

    def predict(self, X):
        """
        Return the model's predictions for *X*.

        *X* must already be vectorized (see vectorize()/prepare_data()); the
        model is rebuilt and its weights reloaded on every call.
        """
        return self._restore_model().predict(X)

    def summarize(self):
        """
        Print a summary of the model architecture.
        """
        self.get_model().summary()

    def evaluate(self, X, y):
        """
        Evaluate the restored model on already-vectorized *X* against labels
        *y* and print the accuracy as a percentage.
        """
        loss, acc = self._restore_model().evaluate(X, y, verbose=2)
        print('Restored model, accuracy: {:5.2f}%'.format(100 * acc))

    def plot_history(self, history):
        """
        Plot training vs. validation accuracy and loss per epoch.

        *history* is the object returned by ``model.fit``; it is also stored
        on ``self.history``.
        """
        # Imported here because the notebook's import cell never binds `plt`.
        import matplotlib.pyplot as plt

        self.history = history
        for metric in ('accuracy', 'loss'):
            plt.plot(self.history.history[metric])
            plt.plot(self.history.history['val_' + metric])
            plt.title('model ' + metric)
            plt.ylabel(metric)
            plt.xlabel('epoch')
            plt.legend(['train', 'test'], loc='upper left')
            plt.show()

    def plot_model(self):
        """
        Save a plot of the model.
        """
        # Inside the method body this name resolves to the module-level
        # keras.utils.plot_model, not to this method.
        plot_model(self.get_model())

In [None]:
class Generative:
    """Character-level text generator that seeds a saved Keras model with
    windows of a corpus and posts one generated tweet.
    """

    def __init__(self):
        pass

    @staticmethod
    def _load_pickle(path):
        """Load one pickled object from *path*, closing the file afterwards."""
        # NOTE: pickle must only be used on trusted files.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

    def get_info(self, text):
        """
        Slice *text* into overlapping windows and load the character
        vocabulary from 'chars.pickle', 'char_indices.pickle' and
        'indices_char.pickle'.

        Returns ``(text, chars, char_indices, indices_char, maxlen, sentences)``
        where ``sentences`` are the 40-character windows taken every 3
        characters.
        """
        maxlen = 40
        step = 3
        sentences = [text[i: i + maxlen] for i in range(0, len(text) - maxlen, step)]
        chars = self._load_pickle('chars.pickle')
        char_indices = self._load_pickle('char_indices.pickle')
        indices_char = self._load_pickle('indices_char.pickle')
        return text, chars, char_indices, indices_char, maxlen, sentences

    def sample(self, preds, temperature=0.2):
        """
        Pick the index of the next character from the probability
        distribution *preds*, sharpened (temperature < 1) or flattened
        (temperature > 1) before drawing.
        """
        preds = np.asarray(preds).astype('float64')
        # A probability of exactly 0 is valid input: log gives -inf, exp maps
        # it back to 0. Only the divide-by-zero warning is silenced.
        with np.errstate(divide='ignore'):
            preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)
        return np.argmax(probas)

    def generate_tweets(self, corpus, char_to_idx, idx_to_char, chars, maxlen, n_tweets=10, verbose=1, n_chars=100):
        """
        Generate *n_tweets* texts and post one of them, chosen at random, via
        the module-level ``twitter`` client.

        corpus: text the random seed windows are cut from.
        char_to_idx / idx_to_char: character <-> index mappings.
        chars: vocabulary; its length is the one-hot width.
        maxlen: seed/window length fed to the model.
        n_tweets: number of texts to generate.
        verbose: when truthy, print seeds and generated texts.
        n_chars: number of characters generated after each seed.

        Returns the list of generated texts.
        """
        self.model = load_model('GenerativeModel_compiled_v2')
        self.model.load_weights('weights_E_v2.hdf5')
        # Kept as a module global so the result stays visible at module level
        # after the call, as before.
        global tweets
        tweets = []

        # Clamp so a corpus shorter than maxlen + 1 (e.g. a short tweet) seeds
        # from position 0 instead of making randint raise ValueError.
        last_start = max(len(corpus) - maxlen - 1, 0)

        for i in range(1, n_tweets + 1):
            begin = random.randint(0, last_start)
            window = corpus[begin:begin + maxlen]
            pieces = [window]
            if verbose:
                print('Tweet no. %03d' % i)
                print('=' * 13)
                print('Generating with seed:')
                print(window)
                print('_' * len(window))
            for _ in range(n_chars):
                # One-hot encode the current window.
                x = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(window):
                    x[0, t, char_to_idx[char]] = 1.0

                preds = self.model.predict(x, verbose=0)[0]
                # self.sample, not the global `generative` instance.
                next_char = idx_to_char[self.sample(preds)]

                pieces.append(next_char)
                # Slide the window; a seed shorter than maxlen grows up to it.
                window = (window + next_char)[-maxlen:]
            tweet = ''.join(pieces)
            if verbose:
                print(tweet)
                print()
            tweets.append(tweet)

        # Choose among the tweets actually generated (the former hard-coded
        # randrange(10) raised IndexError whenever n_tweets < 10).
        if tweets:
            twitter.update_status(status=random.choice(tweets))

        return tweets

In [None]:
def do_geocode(address):
    """
    Geocode a free-text user location with Nominatim.

    Returns ``(latitude, longitude)``, or ``None`` when *address* is empty or
    ``None``, when nothing matches, or when the geocoder times out.
    """
    # No location given: skip the network request entirely.
    if not address:
        return None

    geolocator = Nominatim(user_agent='Tweet_locator')
    try:
        location = geolocator.geocode(address, exactly_one=True, language='en')
    except GeocoderTimedOut:
        return None

    if location is None:
        return None
    # The former reverse lookup (country code) was computed but never used;
    # it cost a second request per tweet and could raise on results that
    # lack a country code.
    return location.latitude, location.longitude

In [None]:
def remove_html(string):
    """
    Return *string* with every ``<...>`` tag removed; the text between tags
    is kept.
    """
    return re.sub(r'<.*?>', '', string)

In [None]:
def to_datetime(dtime):
    """
    Convert a Twitter timestamp such as 'Fri Dec 04 01:12:20 +0000 2020' to
    the string 'YYYY-MM-DD HH:MM:SS'.

    Raises ValueError when *dtime* does not match the expected format (the
    '+0000' offset is matched literally).
    """
    parsed = datetime.strptime(dtime, '%a %b %d %H:%M:%S +0000 %Y')
    return parsed.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
import os

# Twitter API credentials. They are read from the environment so secrets do
# not have to be saved inside the notebook; each falls back to '' (the former
# hard-coded value) when its variable is not set.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

In [None]:
# Shared pipeline objects used by the streamer below.
preprocessor = PreProcessor("#@,.?!¬-''=()")
# Reuse the same preprocessor instead of letting Classifier build a second
# one from its own copy of the noise-character literal.
classifier = Classifier(pre_processor=preprocessor)
generative = Generative()

In [None]:
class MyStreamer(TwythonStreamer):
    """Stream handler: classifies each extended tweet and, when the
    classifier flags it, generates and posts a reply-style tweet.

    Relies on the module-level ``preprocessor``, ``classifier`` and
    ``generative`` objects.
    """

    def on_success(self, data):
        """
        Handle one streamed message.

        Messages without an 'extended_tweet' entry are ignored. For the rest
        the timestamp, geocoded user location and client name are printed,
        then the full text is preprocessed and classified.
        """
        if 'extended_tweet' not in data:
            return

        date_time = to_datetime(data['created_at'])
        print(date_time)
        tweet_text = data['extended_tweet']['full_text']
        # One geocoding call per tweet; do_geocode already returns None when
        # nothing is found.
        location = do_geocode(data['user']['location'])
        print(location)
        clean_source = remove_html(data['source'])
        print(clean_source)

        text_prep = preprocessor.process(tweet_text)
        # Wrap in a list so the tweet is vectorized as ONE text; a bare string
        # would be iterated character by character.
        text_vectorized = classifier.vectorize([text_prep])
        prediction = classifier.predict(text_vectorized)
        class_prediction = (prediction > 0.5).astype("int32")
        if class_prediction.any():
            corpus, chars, char_indices, indices_char, maxlen, _ = generative.get_info(text_prep)
            generative.generate_tweets(corpus, char_indices, indices_char, chars, maxlen)

    def on_error(self, status_code, data, headers=None):
        """
        Print the HTTP status code of a failed stream request.

        ``headers`` is optional so the handler works whether or not the
        installed Twython passes response headers.
        """
        print(status_code)
        # NOTE(review): Twython appears to ignore this return value; call
        # self.disconnect() here if the stream should stop on error -- confirm
        # against the Twython streaming docs.
        return False

In [None]:
# REST client used to post the generated tweet (see Generative.generate_tweets).
# Keyword arguments make explicit which credential fills which OAuth1 slot.
twitter = Twython(
    app_key=consumer_key,
    app_secret=consumer_secret,
    oauth_token=access_token,
    oauth_token_secret=access_token_secret,
)

In [None]:
# Open the streaming connection with the same OAuth1 credentials and start
# receiving tweets that match the filter; each message is handed to
# MyStreamer.on_success.
stream = MyStreamer(
    app_key=consumer_key,
    app_secret=consumer_secret,
    oauth_token=access_token,
    oauth_token_secret=access_token_secret,
)
stream.statuses.filter(
    track='#covid19',
    language="en",
    mode="extended",
)

2020-12-04 01:12:20
(32.3293809, -83.1137366)
Twitter for Android
Tweet no. 001
Generating with seed:
hip god sing god thursdaythought scriptu
________________________________________
hip god sing god thursdaythought scripture contact coronaviru lockdown shorn active said protect social distand mandatory face mass children

Tweet no. 002
Generating with seed:
 worship god sing god thursdaythought sc
________________________________________
 worship god sing god thursdaythought school stay person show stop mass stay people state show policy distride stay people state coronaviru 

Tweet no. 003
Generating with seed:
ursdaythought scripture tramp go nancype
________________________________________
ursdaythought scripture tramp go nancypear coronaviru lockdown showner say perfume show ready continue lockdown coronaviru video contact tra

Tweet no. 004
Generating with seed:
daythought scripture tramp go nancypelos
________________________________________
daythought scripture tramp go nancyp