In [None]:
# import important libraries
import pandas as pd
import numpy as np
import time
import configparser
import datetime
import glob
import os

# import mysql libraries
import pymysql
from sqlalchemy import create_engine
import mysql.connector

# import important NLP tensorflow libraries
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# import the transformers library to label the dataset
from transformers import pipeline

# library to convert emoji to text
import emoji

# tokenization library
import nltk
from nltk.tokenize import TweetTokenizer

# libraries to remove stopwords and punctuation
from nltk.corpus import stopwords
import string

# lemmatization library
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Download the NLTK resources this notebook relies on (runs on every kernel start):
#   - 'stopwords': read by stopwords() through stopwords.words('english')
#   - 'wordnet': backs WordNetLemmatizer and the wordnet.* constants used in lemmatization()
#   - 'averaged_perceptron_tagger': presumably the model behind nltk.pos_tag() -- confirm for your NLTK version
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [None]:
def mysql_conf():
    """Read the MySQL connection settings from ``config.ini``.

    The file is looked up relative to the current working directory and must
    contain a ``[mysql]`` section with the keys ``username``, ``password``,
    ``hostname`` and ``database``.

    Returns:
        tuple: ``(username, password, hostname, database)`` as strings.

    Raises:
        KeyError: if the ``[mysql]`` section or one of its keys is missing
            (this is also what happens when ``config.ini`` cannot be found,
            because ``ConfigParser.read`` skips unreadable files silently).
    """
    parser = configparser.ConfigParser()
    parser.read('config.ini')
    mysql_section = parser['mysql']
    wanted_keys = ('username', 'password', 'hostname', 'database')
    return tuple(mysql_section[key] for key in wanted_keys)

def create_database():
    """Create the MySQL database named in ``config.ini`` if it does not exist.

    Connects to the server without selecting a database, issues
    ``CREATE DATABASE IF NOT EXISTS`` and then releases both the cursor and
    the connection, even when the statement fails.

    The database name comes from ``mysql_conf()`` so that it matches the
    database that ``create_table``, ``read_from_mysql`` and ``write_to_mysql``
    connect to (previously the name ``twitter`` was hard-coded here).
    """
    username,password,hostname,database = mysql_conf()
    conn = mysql.connector.connect(host = hostname,user = username,password = password)
    try:
        cursor = conn.cursor()
        try:
            # Identifiers cannot be sent as bound parameters, so quote the name
            # with backticks and double any backtick it contains.
            quoted_name = "`{}`".format(database.replace("`", "``"))
            cursor.execute("CREATE DATABASE IF NOT EXISTS {}".format(quoted_name))
        finally:
            cursor.close()
    finally:
        # The original code closed only the cursor and leaked the connection.
        conn.close()


def create_table():
    """Create the ``sentiment`` table in the configured database if missing.

    Columns: ``tweet_id`` (primary key, foreign key to ``datamined.tweet_id``),
    ``text``, ``tokenize`` and ``sentiment``. Because of the foreign key the
    ``datamined`` table has to exist before this is called.

    Cursor and connection are always released, also when the DDL fails.
    """
    username,password,hostname,database = mysql_conf()
    conn = mysql.connector.connect(host = hostname,user = username,password = password, database = database)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(
                """CREATE TABLE IF NOT EXISTS sentiment(
                       tweet_id double NOT NULL,
                       text text(65535) NOT NULL,
                       tokenize text(65535) NOT NULL,
                       sentiment varchar(25) NOT NULL,
                       PRIMARY KEY (tweet_id),
                       FOREIGN KEY (tweet_id) REFERENCES datamined(tweet_id))"""
            )
        finally:
            cursor.close()
    finally:
        # The original code closed only the cursor and leaked the connection.
        conn.close()
    
def read_from_mysql():
    """Load all tweets that have not been processed yet.

    Returns:
        pandas.DataFrame: columns ``tweet_id`` and ``text`` of every row in
        ``datamined`` whose ``processed`` flag is false. Empty when there is
        nothing to do.
    """
    from urllib.parse import quote_plus

    username,password,hostname,database = mysql_conf()
    # User name and password are URL-escaped: characters such as '@', '/' or ':'
    # in a password would otherwise be parsed as part of the URL structure.
    engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        host= hostname, db= database, user= quote_plus(username), pw= quote_plus(password)))
    try:
        df = pd.read_sql('SELECT tweet_id,text FROM datamined where processed = False', con=engine)
    finally:
        # A new engine is built on every call; dispose it so its connection
        # pool does not pile up across polling iterations.
        engine.dispose()

    return df

def write_to_mysql(df):
    """Append the labelled tweets to ``sentiment`` and flag them as processed.

    Args:
        df (pandas.DataFrame): frame holding at least ``tweet_id`` plus the
            columns of the ``sentiment`` table. A helper column named
            ``index`` (left behind by ``reset_index()``) is removed before
            writing if it is present.

    Side effects:
        * rows are appended to the ``sentiment`` table,
        * ``datamined.processed`` is set to true for every ``tweet_id`` in
          ``df`` and the change is committed.

    NOTE(review): the insert and the flag update run on two separate
    connections, so they are not atomic. A failure between the two steps
    leaves rows in ``sentiment`` that are still unprocessed in ``datamined``.
    """
    from urllib.parse import quote_plus

    username,password,hostname,database = mysql_conf()
    # errors='ignore': do not fail when the frame never had an 'index' column.
    df = df.drop(columns=['index'], errors='ignore')
    # Credentials are URL-escaped so special characters cannot break the URL.
    engine = create_engine("mysql+pymysql://{user}:{pw}@{host}/{db}".format(
        host= hostname, db= database, user= quote_plus(username), pw= quote_plus(password)))
    try:
        df.to_sql('sentiment', con = engine, if_exists = 'append',index = False, chunksize = 1000)
    finally:
        engine.dispose()

    conn = mysql.connector.connect(host = hostname,user = username,password = password, database = database)
    try:
        cursor = conn.cursor()
        try:
            query = """UPDATE datamined SET processed = %s WHERE tweet_id =  %s"""
            # .tolist() yields plain Python numbers; numpy scalars are not
            # guaranteed to be accepted by the connector's parameter conversion.
            for tweet_id in df["tweet_id"].tolist():
                cursor.execute(query, (True, tweet_id))
            conn.commit()
        finally:
            cursor.close()
    finally:
        # The original code closed only the cursor and leaked the connection.
        conn.close()

    print("................. Resetting Dataframe")

In [None]:
# Clean the raw tweet text
def cleaning_df(preprocessed_df):
    """Normalise the ``text`` column of a tweet frame.

    Applied in this order:
      1. drop the ``#`` sign (the hashtag word itself is kept),
      2. remove http/https links together with directly following line breaks,
      3. turn remaining newlines into spaces,
      4. remove ``RT @user`` prefixes and any other ``@user`` mention,
      5. remove ``$``, the typographic apostrophe and the ASCII apostrophe,
      6. lower-case everything.

    Literal characters are replaced with ``regex=False``. With ``regex=True``
    the pattern ``"$"`` is the end-of-string anchor, so dollar signs were
    never actually removed before.

    Args:
        preprocessed_df (pandas.DataFrame): frame with a string column ``text``.

    Returns:
        pandas.DataFrame: the same frame object, with ``text`` overwritten.
    """
    text = preprocessed_df["text"]

    # remove the hash sign only; the tag word may carry information
    text = text.str.replace("#", "", regex=False)

    # remove website links (and line breaks glued to their end)
    text = text.str.replace(r"https?://\S+[\r\n]*", "", regex=True)

    # remaining newlines become spaces
    text = text.str.replace("\n", " ", regex=False)

    # remove retweet markers first, then any other user tag
    text = text.str.replace(r"RT @\S+", "", regex=True)
    text = text.str.replace(r"@\S+", "", regex=True)

    # remove $ and both apostrophe variants as plain characters
    for literal in ("$", "’", "'"):
        text = text.str.replace(literal, "", regex=False)

    # convert to lower case and write back into the caller's frame
    preprocessed_df["text"] = text.str.lower()
    return preprocessed_df

# Replace emojis by their textual names
def convert_emoji(preprocessed_emoji_df):
    """Turn emojis in the ``text`` column into plain words.

    Every text is passed through ``emoji.demojize``; afterwards each ``::``,
    each remaining ``:`` and each ``_`` is replaced by a single space. The
    replacement applies to the whole text, not only to the emoji names.

    Args:
        preprocessed_emoji_df (pandas.DataFrame): frame with a ``text`` column.

    Returns:
        pandas.DataFrame: the same frame object, with ``text`` overwritten.
    """
    demojized = preprocessed_emoji_df["text"].apply(emoji.demojize)
    # Order matters: a double colon collapses to ONE space before the single
    # colons are handled.
    for separator in ("::", ":", "_"):
        demojized = demojized.str.replace(separator, " ", regex=True)
    preprocessed_emoji_df["text"] = demojized
    return preprocessed_emoji_df

# Split each tweet into tokens
def tokenize(preprocessed_token_df):
    """Add a ``tokenize`` column holding the token list of every ``text``.

    Uses NLTK's ``TweetTokenizer`` configured with ``preserve_case=False``,
    ``strip_handles=True`` and ``reduce_len=True``.

    Args:
        preprocessed_token_df (pandas.DataFrame): frame with a ``text`` column.

    Returns:
        pandas.DataFrame: the same frame object with the new/overwritten
        ``tokenize`` column; ``text`` is left untouched.
    """
    tokenizer_options = {
        "preserve_case": False,
        "strip_handles": True,
        "reduce_len": True,
    }
    tweet_tokenizer = TweetTokenizer(**tokenizer_options)
    text_column = preprocessed_token_df["text"]
    preprocessed_token_df['tokenize'] = text_column.apply(tweet_tokenizer.tokenize)
    return preprocessed_token_df

# Remove stopwords and punctuation tokens
def stopwords(preprocessed_stop_df):
    """Filter English stopwords and punctuation out of the ``tokenize`` lists.

    A token is dropped when it is an NLTK English stopword or when it occurs
    as a substring of ``string.punctuation`` (so single punctuation characters
    are removed, while a token such as ``...`` is kept).

    Args:
        preprocessed_stop_df (pandas.DataFrame): frame whose ``tokenize``
            column holds lists of string tokens.

    Returns:
        pandas.DataFrame: the same frame object, with ``tokenize`` overwritten.
    """
    # Local import on purpose: defining this function rebinds the module-level
    # name `stopwords`, hiding the corpus imported at the top of the notebook.
    from nltk.corpus import stopwords as stopwords_corpus

    # A set gives O(1) membership; the corpus list was scanned for every token before.
    english_stopwords = frozenset(stopwords_corpus.words('english'))
    punctuation = string.punctuation

    def _filter_tokens(tokens):
        # Keep tokens that are neither stopwords nor punctuation.
        return [token for token in tokens
                if token not in english_stopwords and token not in punctuation]

    preprocessed_stop_df['tokenize'] = preprocessed_stop_df['tokenize'].apply(_filter_tokens)
    return preprocessed_stop_df

# Lemmatize the tokens and join them back into one string per tweet
def lemmatization(preprocessed_lemmatization_df):
    """Lemmatize every token list and drop tweets that end up empty.

    Each list in ``tokenize`` is POS-tagged with ``nltk.pos_tag``. Tokens whose
    tag maps to a WordNet part of speech (see ``get_wordnet_pos``) are
    lemmatized, all others are kept as they are. The tokens are then joined
    with single spaces, so ``tokenize`` holds a string afterwards.

    Rows whose joined string is empty are removed and the index is reset.
    ``reset_index()`` is called without ``drop=True``, so the previous index
    is kept in a column named ``index``.

    Args:
        preprocessed_lemmatization_df (pandas.DataFrame): frame whose
            ``tokenize`` column holds lists of string tokens.

    Returns:
        pandas.DataFrame: a new frame with a fresh index, an ``index``
        column and the lemmatized ``tokenize`` strings.
    """
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_tokens(tokens):
        # Tag once, look the WordNet POS up once per token, then join.
        lemmas = []
        for word, treebank_tag in nltk.pos_tag(tokens):
            wordnet_pos = get_wordnet_pos(treebank_tag)
            lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos) if wordnet_pos else word)
        return " ".join(lemmas)

    preprocessed_lemmatization_df['tokenize'] = preprocessed_lemmatization_df["tokenize"].apply(_lemmatize_tokens)

    # One vectorised filter instead of dropping rows one at a time. The old
    # condition `== '' or None` only ever tested for the empty string.
    has_tokens = preprocessed_lemmatization_df["tokenize"] != ""
    preprocessed_lemmatization_df = preprocessed_lemmatization_df[has_tokens]

    # reset the dataframe index (old labels are preserved in the 'index' column)
    preprocessed_lemmatization_df = preprocessed_lemmatization_df.reset_index()
    return preprocessed_lemmatization_df

def get_wordnet_pos(treebank_tag):
    """Map a Treebank POS tag to the matching WordNet POS constant.

    Tags starting with ``J``, ``V``, ``N`` and ``R`` map to ``wordnet.ADJ``,
    ``wordnet.VERB``, ``wordnet.NOUN`` and ``wordnet.ADV`` respectively.

    Args:
        treebank_tag (str): POS tag as produced by ``nltk.pos_tag``.

    Returns:
        The WordNet constant, or ``None`` when the tag has no counterpart.
    """
    prefix_to_attribute = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, attribute in prefix_to_attribute:
        if treebank_tag.startswith(prefix):
            # Resolved lazily so `wordnet` is only touched for a matching tag.
            return getattr(wordnet, attribute)
    return None

# Sentiment analysis with a HuggingFace zero-shot classifier
def huggingface(df,candidate_labels):
    """Label every row of ``df`` as Positive, Negative or Neutral.

    The ``tokenize`` text of each row is scored against ``candidate_labels``
    with the ``facebook/bart-large-mnli`` zero-shot pipeline
    (``multi_label=True``, so each label gets an independent score). A row is
    "Positive" when that score is strictly greater than both others,
    "Negative" when the Negative score is strictly greater than both others,
    and "Neutral" in every other case (including ties).

    Args:
        df (pandas.DataFrame): frame with a ``tokenize`` column of strings.
        candidate_labels (list[str]): labels handed to the classifier. The
            decision rule reads the scores of 'Positive', 'Negative' and
            'Neutral', so all three have to be included.

    Returns:
        pandas.DataFrame: the same frame object with a ``sentiment`` column.
        For an empty frame the column is added and the model is not loaded.
    """
    df["sentiment"] = np.nan
    if df.empty:
        # Nothing to classify: skip the expensive model load.
        return df

    # device=0 for GPU usage
    classifier = pipeline("zero-shot-classification",model="facebook/bart-large-mnli", device=-1)

    total = len(df)
    sentiments = []
    for i, input_text in zip(df.index, df['tokenize']):
        # multi_label=True will return confidence score for each label independently
        model_dict = classifier(input_text, candidate_labels, multi_label=True)

        # label -> score lookup
        result_dict = dict(zip(model_dict.get('labels'), model_dict.get('scores')))
        positive = result_dict.get('Positive')
        negative = result_dict.get('Negative')
        neutral = result_dict.get('Neutral')

        if positive > negative and positive > neutral:
            sentiment = "Positive"
        elif negative > positive and negative > neutral:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"
        sentiments.append(sentiment)
        print("line number:",i,"out of",total,"lines",round(i/total*100,2),"% completed",end="\r")

    # Single column assignment instead of chained `df['sentiment'][i] = ...`,
    # which writes to a temporary object under pandas copy-on-write and tried
    # to put strings into a float (NaN) column.
    df["sentiment"] = sentiments
    return df

In [None]:
def sentiment_analysis():
    """Run the polling loop: fetch new tweets, label them, store the result.

    Creates the database and the ``sentiment`` table once, then loops forever:
    unprocessed tweets are read, cleaned, tokenized, lemmatized, classified
    and written back. When there is nothing to write the loop sleeps for 30
    seconds, otherwise for 5 seconds. The function never returns; interrupt
    the kernel to stop it.

    NOTE(review): tweets whose token string ends up empty are dropped by
    ``lemmatization`` and therefore never reach ``write_to_mysql``. Their
    ``processed`` flag stays false, so they are fetched again on every
    iteration.
    """
    # Output labels
    candidate_labels = ["Positive","Negative","Neutral"]

    # Create database and table
    create_database()
    create_table()

    preprocessing_steps = (cleaning_df, convert_emoji, tokenize, stopwords, lemmatization)

    while True:
        df = read_from_mysql()
        if not df.empty:
            for step in preprocessing_steps:
                df = step(df)

        # Checked before classification so that an idle poll does not run the
        # classifier step (which loads the model) on an empty frame.
        if df.empty:
            print("",end="\r")
            print("........No new tweets, sleeping for 30 seconds!")
            time.sleep(30)
            continue

        df = huggingface(df,candidate_labels)
        write_to_mysql(df)
        print("........sleeping for 5 seconds!")
        time.sleep(5)

In [None]:
# Start the polling loop. sentiment_analysis() runs `while True` without a break,
# so this cell keeps executing until the kernel is interrupted.
sentiment_analysis()