In [1]:
#for datawrangling and manipulation

import pandas as pd
import numpy as np

import tweepy
import json
import matplotlib.pyplot as plt

#for NLP text processing and formatting

import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import os

import time

plt.style.use('seaborn')
# For word lemmitization
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

# for word Stemming
from nltk.stem.porter import PorterStemmer

# for Machine Learning process

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# for Machine Learning model evaluation

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix


# Global Parameters
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SILENTONE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def preprocess_tweet_text(tweet):
    """
    Function to process the the tweet text and tranform it into format usable by Machine learning models
    """
    
    # to convert all the characters of the tweet into lower case alphabets
    tweet.lower()
    
    # Remove urls from the tweets
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove user related references from the tweets:: '@' and '#' 
    tweet = re.sub(r'\@\w+|\#','', tweet)
    
    # Remove punctuations from the tweets
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords from the tweets
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]
    joined_text = " ".join(filtered_words)
    
    return joined_text

In [4]:
def get_feature_vector(train_fit):
    """
    Function to Convert a collection of raw documents to a matrix of TF-IDF features.
    """
    
    vector = TfidfVectorizer(sublinear_tf=True)      # Defining the vector
    vector.fit(train_fit)                            # fitting the data into the vector
    return vector                                    # returning the vector as function call

In [5]:
def int_to_string(sentiment):
    
    """
    Function to convert the integer score into corresponding sentiment
    """
    
    if sentiment == 0:
        return "Negative"
    elif sentiment == 2:
        return "Neutral"
    else:
        return "Positive"

# Importing the Dataset :

In [6]:
import pandas as pd

dataset = pd.read_csv("train.txt")
dataset

Unnamed: 0,tweet_id,sentiment,tweet_text
0,264183816548130816,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,negative,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...
...,...,...,...
21460,522949024132112384,neutral,"the day after newark ill be able to say """"i me..."
21461,522372593312350209,neutral,FEC hold farewell session for seven ministers ...
21462,522515200592052224,neutral,Luca Di Montezemolo (who's last day was Monday...
21463,523089087155437568,positive,Coffee is pretty much the answer to all questi...


### Preprocessing data before feeding it to ML models

In [7]:
processed_text = dataset['tweet_text'].apply(preprocess_tweet_text)
print("Processed text :: \n\n", processed_text)

Processed text :: 

 0          Gas house hit 339 Iu2019m going Chapel Hill Sat
1        Theo Walcott still shitu002c watch Rafa Johnny...
2        Iu2019m GSP fanu002c hate Nick Diaz canu2019t ...
3        Iranian general says Israelu2019s Iron Dome ca...
4        Tehranu002c Mon Amour Obama Tried Establish Ti...
                               ...                        
21460    day newark ill able say met demi lovato yester...
21461    FEC hold farewell session seven ministers Pres...
21462    Luca Di Montezemolo whos last day Monday Alons...
21463    Coffee pretty much answer questions today Frid...
21464    Niki Lauda confirmed Sky Alonso released conta...
Name: tweet_text, Length: 21465, dtype: object


# Stemming ::

 * It may be defined as the process to remove the inflectional forms of a word and bring them to a base form called the **stem**.
 * The chopped-off pieces are referred to as affixes
 * The two most common algorithms/methods employed for stemming include the ::
  
    * Porter Stemmer
    * Snowball Stemmer
    
We will be using `Porter Stemmer` in our process.

In [8]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(i) for i in processed_text]


# Lemmatization :: 

  * It is a process wherein the context is used to convert a word to its meaningful base form. 
  * It helps in grouping together words that have a common base form and so can be identified as a single item. 
  * The base form is referred to as the lemma of the word and is also sometimes known as the dictionary form.
  * The most commonly used lemmatizers are the 
    * WordNet Lemmatizer
    * Spacy Lemmatizer
    * TextBlob Lemmatizer

We will be using `WordNet Lemmatizer` in our process.

In [9]:
lemmatizer = WordNetLemmatizer()
lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]


# Vectorization ::

Processing natural language text and extract useful information from the given word or a sentence using machine learning and deep learning techniques requires the string/text needs to be converted into a set of real numbers (a vector) — **Word Embeddings**.

Word Embeddings or Word vectorization is a methodology in NLP to map words or phrases from vocabulary to a corresponding vector of real numbers which used to find word predictions, word similarities/semantics.

 The process of converting words into numbers are called `Vectorization`




In [10]:
tf_vector = get_feature_vector(np.array(dataset["tweet_text"]).ravel())

# Defining the Variables :

In [11]:
X = tf_vector.transform(np.array(dataset["tweet_text"]).ravel())     # Predictor Variable
y = np.array(dataset["sentiment"]).ravel()                           # Target varaible

### Splitting the data into training and testing data

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Using Naive Bayes Model :

In [13]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

### Predicting the values and the Accuracy Score

In [14]:
y_predict_nb = NB_model.predict(X_test)
print("Accuracy Score for Naive Bayes Model is :: ", accuracy_score(y_test, y_predict_nb))

Accuracy Score for Naive Bayes Model is ::  0.5918937805730259


# Classification Report :

In [15]:
print("Classification_Report :: \n\n", classification_report(y_test, y_predict_nb))

Classification_Report :: 

               precision    recall  f1-score   support

    negative       1.00      0.00      0.00       676
     neutral       0.60      0.60      0.60      1809
    positive       0.58      0.81      0.68      1808

    accuracy                           0.59      4293
   macro avg       0.73      0.47      0.43      4293
weighted avg       0.66      0.59      0.54      4293



# Using Logistic Regression Model :

In [16]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

### Predicting the Values :

In [17]:
y_predict_lr = LR_model.predict(X_test)
print("Accuracy Score for Logistic Regression Model is :: ",accuracy_score(y_test, y_predict_lr))

Accuracy Score for Logistic Regression Model is ::  0.6389471232238528


# Classification Report

In [18]:
from sklearn.metrics import classification_report

print("Classification_Report :: \n\n", classification_report(y_test, y_predict_lr))

Classification_Report :: 

               precision    recall  f1-score   support

    negative       0.63      0.21      0.32       676
     neutral       0.60      0.74      0.66      1809
    positive       0.68      0.70      0.69      1808

    accuracy                           0.64      4293
   macro avg       0.64      0.55      0.56      4293
weighted avg       0.64      0.64      0.62      4293



# Importing the Test Data :

In [19]:
test = pd.read_csv("test.txt")
test

Unnamed: 0,tweet_id,tweet_text
0,264238274963451904,"@jjuueellzz down in the Atlantic city, ventnor..."
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,262926411352903682,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,171874368908050432,"Excuse the connectivity of this live stream, f..."
...,...,...
5393,210378118865756160,It's a Wednesday girls night out as '90's band...
5394,245177521304399872,"night college course sorted, just have to enro..."
5395,259280987089932288,For the 1st time in 30 years. For your splendi...
5396,201113950211940352,NURSES DAY - 12 MAY 2012. Nursing: The heart b...


In [20]:
test.tweet_text = test["tweet_text"].apply(preprocess_tweet_text)
test_feature = tf_vector.transform(np.array(test['tweet_text']).ravel())


# Using Naive Bayes Model for Prediction ::

In [21]:

test_prediction_nb = NB_model.predict(test_feature)

#test_prediction_nb = NB_model.predict(['Hi you are '])

test_prediction_nb


array(['neutral', 'positive', 'neutral', ..., 'positive', 'neutral',
       'positive'], dtype='<U8')

In [22]:
# Creating a Dataframe consising tweets and sentiment in a submission format

submission_result_nb = pd.DataFrame({'tweet_id': test.tweet_id, 'sentiment':test_prediction_nb})
submission_result_nb

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,positive
2,258965201766998017,neutral
3,262926411352903682,positive
4,171874368908050432,neutral
...,...,...
5393,210378118865756160,neutral
5394,245177521304399872,positive
5395,259280987089932288,positive
5396,201113950211940352,neutral


In [23]:
# Total number os tweets grouped according sentiment

test_result = submission_result_nb['sentiment'].value_counts()
test_result

positive    3177
neutral     2220
negative       1
Name: sentiment, dtype: int64

# Using Logistic Regression Model for Prediction ::

In [24]:
test_prediction_lr = LR_model.predict(test_feature)
test_prediction_lr

array(['neutral', 'positive', 'neutral', ..., 'neutral', 'neutral',
       'positive'], dtype=object)

In [25]:
# Creating a Dataframe consising tweets and sentiment

submission_result_lr = pd.DataFrame({'tweet_id': test.tweet_id, 'sentiment':test_prediction_nb})
submission_result_lr

Unnamed: 0,tweet_id,sentiment
0,264238274963451904,neutral
1,218775148495515649,positive
2,258965201766998017,neutral
3,262926411352903682,positive
4,171874368908050432,neutral
...,...,...
5393,210378118865756160,neutral
5394,245177521304399872,positive
5395,259280987089932288,positive
5396,201113950211940352,neutral


In [26]:
# Total number os tweets grouped according sentiment

test_result2 = submission_result_lr['sentiment'].value_counts()
test_result2

positive    3177
neutral     2220
negative       1
Name: sentiment, dtype: int64

In [27]:
from chatterbot import ChatBot
from chatterbot.trainers import ListTrainer

In [28]:
from nltk.chat.util import Chat, reflections

In [29]:
def tweetsentiment(msg):
    test.tweet_text = preprocess_tweet_text(msg)
    test_feature = tf_vector.transform(np.array(msg).ravel())
    test_prediction_lr = LR_model.predict(test_feature)
    return (test_prediction_lr[0])

#tweetsentiment("Hi how are you")

In [None]:
import json
from difflib import get_close_matches
from tkinter import *

class Chatbot:
    def __init__(self, window):
        window.title('Twitter Sentiment Analysis')
        window.geometry('400x450')
        window.resizable(0,0)
        self.message_session = Text(window, bd=3, relief="flat", font=("Times", 10), undo=True, wrap="word")
        self.message_session.config(width=45, height=15,bg="#596", fg="white", state='disabled')
        self.overscroll = Scrollbar(window, command=self.message_session.yview)
        self.overscroll.config(width=20)
        self.message_session["yscrollcommand"] = self.overscroll.set
        self.message_position = 1.5
        self.send_button = Button(window, text='send', fg='white', bg='blue',width=9,font=('Times', 12), relief ='flat', command = self.reply_to_you)
        self.Message_Entry = Entry(window, width=40, font=('Times', 12))
        self.Message_Entry.bind('<Return>', self.reply_to_you)
        self.message_session.place(x=20, y=20)
        self.overscroll.place(x=370, y=50)
        self.send_button.place(x=0, y=360)
        self.Message_Entry.place(x=135, y=365)
        #self.Brain = json.load(open('knowledge.json'))

    def add_chat(self, message):
        self.message_position+=1.5
        print(self.message_position)
        self.Message_Entry.delete(0, 'end')
        self.message_session.config(state='normal')
        self.message_session.insert(self.message_position, message)
        self.message_session.see('end')
        self.message_session.config(state='disabled')
        
    def reply_to_you(self, event=None):
        message = self.Message_Entry.get().lower()
        message = 'TS Model: '+tweetsentiment(message) +'\n'
        self.add_chat(message)
        self.add_chat(reply)

root = Tk()
main_menu = Menu(root)

# Create the submenu 
file_menu = Menu(root)

# Add commands to submenu
file_menu.add_command(label="New..")
file_menu.add_command(label="Save As..")
file_menu.add_command(label="Exit")
main_menu.add_cascade(label="File", menu=file_menu)
#Add the rest of the menu options to the main menu
main_menu.add_command(label="Edit")
main_menu.add_command(label="Quit")
root.config(menu=main_menu)
Chatbot(root)
root.mainloop()


3.0


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\SILENTONE\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "<ipython-input-32-ef343b939493>", line 38, in reply_to_you
    self.add_chat(reply)
NameError: name 'reply' is not defined


4.5


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\SILENTONE\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "<ipython-input-32-ef343b939493>", line 38, in reply_to_you
    self.add_chat(reply)
NameError: name 'reply' is not defined


6.0


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\SILENTONE\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "<ipython-input-32-ef343b939493>", line 38, in reply_to_you
    self.add_chat(reply)
NameError: name 'reply' is not defined


7.5


Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\SILENTONE\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
    return self.func(*args)
  File "<ipython-input-32-ef343b939493>", line 38, in reply_to_you
    self.add_chat(reply)
NameError: name 'reply' is not defined
