In [3]:
# utilities 
import re
import pickle
import pandas as pd
import numpy as np
from collections import Counter
import logging
import time

# plotting
import seaborn as sns
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Word2vec
import gensim

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

# show all
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [4]:
# Import Data
DATASET_ENCODING = "iso-8859-1"
dataset = pd.read_csv('MMA_tweets_data.csv'
                      , encoding=DATASET_ENCODING)

# Removing the unnecessary columns and Storing data in lists
text = dataset["Tweet"].tolist()
text

['The Notorious Tiger @beatsbydre! \nThank you Jimmy and Dre ð\x9f\x90\x85 ð\x9f\x8e§ https://t.co/f5El6alw4U',
 'I am honored to announce I have re-signed with @monsterenergy! \nThey have had my back through tick and thin and in particular @hansmolenkamp! \nA man who has helped build this brand toâ\x80¦ https://t.co/8r02g7ryDH',
 'The 6 God and The 12 animal! https://t.co/S1KfNlwam3',
 '2 more days @ Las Vegas, Nevada https://t.co/n6TX5FRHEt',
 'Play as me in #EAUFC3 and bring the belt back to Dagestan. InshAllah #EAathlete #EAUFC3 #AndStill https://t.co/ltXkzsdIzs',
 'Wednesday \n#ufc229 #Ð¿Ð¾Ð±ÐµÐ´Ð°Ñ\x82Ð¾Ð»Ñ\x8cÐºÐ¾Ð¾Ñ\x82Ð\x92Ñ\x81ÐµÐ²Ñ\x8bÑ\x88Ð½ÐµÐ³Ð¾ \nð\x9f\x93¸ frujuice_athletics https://t.co/6oi1e8Rcgt',
 'THE ð\x9f\x91\x91 IS BACK BABY! ð\x9f\x87®ð\x9f\x87ª #UFC229 #EAUFC3 https://t.co/XmWQkbHd2h https://t.co/wBxgs59rnb',
 'Tuesday \n#ufc229 @ Las Vegas, Nevada https://t.co/iTwlWupe7N',
 'The best tasting recovery supplement Iâ\x80\x99ve had. Fuel Your Greatness like the C

In [5]:
# Defining set containing all stopwords in english.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [6]:
def processing(textdata):
    processedText = []
    
    # Create Lemmatizer and Stemmer.
    wordLemm = WordNetLemmatizer()
    
    # Defining regex patterns.
    urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|( www\.)[^ ]*)"
    userPattern       = '@[^\s]+'
    alphaPattern      = "[^a-zA-Z0-9]"
    sequencePattern   = r"(.)\1\1+"
    seqReplacePattern = r"\1\1"
    
    for tweet in textdata:
        tweet = tweet.lower()
        
        # Replace all URls with 'URL'
        tweet = re.sub(urlPattern,' URL',tweet)
        # Replace all emojis.
        for emoji in emojis.keys():
            tweet = tweet.replace(emoji, "EMOJI" + emojis[emoji])        
        # Replace @USERNAME to 'USER'.
        tweet = re.sub(userPattern,' USER', tweet)        
        # Replace all non alphabets.
        tweet = re.sub(alphaPattern, " ", tweet)
        # Replace 3 or more consecutive letters by 2 letter.
        tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

        tweetwords = ''
        for word in tweet.split():
            # Checking if the word is a stopword.
            if word not in stop_words:
                if len(word)>1:
                    # Lemmatizing the word.
                    word = wordLemm.lemmatize(word)
                    tweetwords += (word+' ')
            
        processedText.append(tweetwords)
        
    return processedText

In [7]:
%%time

# Pre-Processing
processedtext = processing(text)
print(f'Text Preprocessing complete.')
print(f'Time Taken:{round(time.time()-t)}seconds')

AttributeError: 'float' object has no attribute 'lower'

In [8]:
# Word-Cloud 
plt.figure(figsize = (20, 20))
wc = WordCloud(max_words= 1000, width = 1600, height = 800
               , collocations=False).generate(" ".join(processedtext))
plt.imshow(wc)

<Figure size 1440x1440 with 0 Axes>

NameError: name 'processedtext' is not defined

<Figure size 1440x1440 with 0 Axes>

In [None]:
# # TF-IDF Vectoriser
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
vectoriser.fit(processedtext)
print(f'Vectoriser fitted')
print('No. of feature_words:', len(vectoriser.get_feature_names()))

In [None]:
# Trangorming the data set
processedtext = vectoriser.transform(processedtext)

In [None]:
def load_models():
    '''
    Replace '..path/' by the path of the saved models.
    '''
    
    # Load the vectoriser.
    file = open('vectoriser-ngram-(1, 2).pickle', 'rb')
    vectoriser = pickle.load(file)
    file.close()
    # Load the LR Model.
    file = open('Sentiment-LR.pickle', 'rb')
    LRmodel = pickle.load(file)
    file.close()
    
    return vectoriser, LRmodel

def predict(vectoriser, model, text):
    # Predict the sentiment
    textdata = vectoriser.transform(processing(text))
    sentiment = model.predict(textdata)
    
    # Make a list of text with sentiment.
    data = []
    for text, pred in zip(text, sentiment):
        data.append((text,pred))
        
    # Convert the list into a Pandas DataFrame.
    df = pd.DataFrame(data, columns = ['text','sentiment'])
    df = df.replace([0,1], ["Negative","Positive"])
    return df

In [None]:
vectoriser, LRmodel = load_models()

df_test = predict(vectoriser, LRmodel, processedtext)
print(df_test.head())

In [None]:
df_test_1 = df_test.loc[df_test['sentiment'] == 'Negative']

In [None]:
df_test_1.head()

In [None]:
df_test_1['text'][1]

In [None]:
df_test_1.to_csv('first_test.csv', index = False)

In [None]:
def load_models():
    '''
    Replace '..path/' by the path of the saved models.
    '''
    
    # Load the vectoriser.
    file = open('vectoriser-ngram-(1, 2).pickle', 'rb')
    vectoriser = pickle.load(file)
    file.close()
    # Load the LR Model.
    file = open('LSTM-word-embedding-model.pickle', 'rb')
    LSTM_em = pickle.load(file)
    file.close()
    
    return vectoriser, LRmodel

def predict(vectoriser, model, text):
    # Predict the sentiment
    vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
    vectoriser.fit(text)
    textdata = vectoriser.transform(processing(text))
    sentiment = model.predict(textdata)
    
    # Make a list of text with sentiment.
    data = []
    for text, pred in zip(text, sentiment):
        data.append((text,pred))
        
    # Convert the list into a Pandas DataFrame.
    df = pd.DataFrame(data, columns = ['text','sentiment'])
    df = df.replace([0,1], ["Negative","Positive"])
    return df

In [None]:
vectoriser, LSTM_em = load_models()



df_test_lstm = predict(vectoriser, LSTM_em, processedtext)
print(df_test_lstm.head())

In [None]:
df_test_lstm_1 = df_test.loc[df_test_lstm['sentiment'] == 'Negative']

In [None]:
df_test_1.iloc[0]

In [None]:
# compare
df_test_1.head()
df_test_lstm_1.head()

In [None]:
# df_test_lstm_1.to_csv('sec_test.csv', index = False)

In [None]:
# df_test_lstm_2 = df_test.loc[df_test_lstm['sentiment'] == 'Positive']
# df_test_lstm_2.head(20)

In [None]:
text_1 = ["I hate twitter",
        "May the Force be with you.",
        "Mr. Stark, I don't feel so good",
        '20 days', 
        'honored announce signed back tick thin particular man helped build brand']

vectoriser, LRmodel = load_models()



test = predict(vectoriser, LRmodel, text_1)
print(test.head())