# Import libraries

# In [14]:
import pandas as pd
import numpy as np
import preprocessor as p
import emojis
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt 
import seaborn as sns

# In [25]:
# Informal spellings and contractions mapped to the standard wording that
# replaces them. Keys are matched against whole whitespace-separated tokens.
slang = {
    "luv": "love",
    "wud": "would",
    "lyk": "like",
    "wateva": "whatever",
    "ttyl": "talk to you later",
    "rip": "rest in peace",
    "kul": "cool",
    "fyn": "fine",
    "omg": "oh my god",
    "fam": "family",
    "bruh": "brother",
    "bro": "brother",
    "cud": "could",
    "fud": "food",
    "btw": "by the way",
    "can't": "cannot",
    "cant": "cannot",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "couldn't": "could not",
    "couldnt": "could not",
}

# Text emoticons mapped to the word that replaces them. Keys are matched
# against whole whitespace-separated tokens, so a key must never contain
# whitespace itself (the former ":(<tab>" and ";( " entries could not match
# anything and are stored here as ":(" and ";(").
SMILEYS = {
    # sad
    ":-(": "sad",
    ":(": "sad",
    ":-c": "sad",
    ":c": "sad",
    ":-<": "sad",
    ":<": "sad",
    ":-[": "sad",
    ":[": "sad",
    ":-||": "sad",
    ">:[": "sad",
    ":{": "sad",
    ":@": "sad",
    ";(": "sad",
    # happy
    ":-)": "happy",
    ":-]": "happy",
    ":]": "happy",
    ":-3": "happy",
    ":3": "happy",
    ":->": "happy",
    ":>": "happy",
    "8-)": "happy",
    "8)": "happy",
    ":-}": "happy",
    ":}": "happy",
    ":o)": "happy",
    # Previously written as ":c", which silently overrode the "sad" entry
    # for the same key; the happy emoticon is ":c)".
    ":c)": "happy",
    ":^)": "happy",
    "=]": "happy",
    "=)": "happy",
    # laughing
    ":-D": "laughing",
    "8D": "laughing",
    "x-D": "laughing",
    "xD": "laughing",
    "X-D": "laughing",
    "XD": "laughing",
    "=D": "laughing",
    "=3": "laughing",
    "B^D": "laughing",
    "c:": "laughing",
    # being sick
    ":-###..": "being sick",
    ":###..": "being sick",
    # disbelief
    "',:-|": "disbelief",
    "',:-l": "disbelief",
    # evil
    ">:-)": "Evil",
    "}:-)": "Evil",
    "}:)": "Evil",
    "3:-)": "Evil",
    "3:)": "Evil",
    ">;)": "Evil",
    ";3": "Evil",
    # horror
    "D-':": "horror",
}

# Many of the original keys were spelled with a non-breaking hyphen (U+2011)
# rather than the ASCII '-' that people actually type, so ":-(" never matched.
# The literal above uses ASCII hyphens; the U+2011 spelling of every
# hyphenated emoticon is registered as well so existing lookups keep working.
SMILEYS.update({
    emoticon.replace("-", "\u2011"): meaning
    for emoticon, meaning in SMILEYS.items()
    if "-" in emoticon
})


def convert_emojis(text):
    """Replace emojis in *text* with plain words.

    The text is passed through ``emojis.decode`` (presumably yielding
    ``:alias:`` style names -- confirm against the library), every colon is
    then replaced by a space and runs of whitespace are collapsed to a
    single space. Note that colons which are not part of an emoji alias are
    dropped as well.
    """
    decoded = emojis.decode(text)
    tokens = decoded.replace(":", " ").split()
    return ' '.join(tokens)

def convert_emoticons(text):
    """Swap every token of *text* found in ``SMILEYS`` for its description.

    Tokens are split on whitespace and compared exactly (case-sensitive);
    the result is re-joined with single spaces, so surrounding and repeated
    whitespace is normalised.
    """
    return " ".join(SMILEYS.get(token, token) for token in text.split())

def correction(text):
    """Expand slang and contractions in *text* using the ``slang`` table.

    Tokens are split on whitespace. A token is looked up as written first
    and, failing that, in lower case: the table only holds lower-case keys,
    so without the fallback "OMG" or "Can't" would never be expanded.
    Tokens with no entry are kept unchanged. The result is joined with
    single spaces.
    """
    corrected = []
    for word in text.split():
        replacement = slang.get(word)
        if replacement is None:
            # Case-insensitive fallback; keep the original token on a miss.
            replacement = slang.get(word.lower(), word)
        corrected.append(replacement)
    return " ".join(corrected)

# In [26]:
class ETL:
    """Load, explore, clean and persist a labelled tweet dataset.

    The CSV is read eagerly in ``__init__``. The methods read two columns
    from it: ``label`` (``1`` is reported as a hate tweet, ``0`` as not
    hate) and ``tweet`` (the raw text). ``data_transform`` runs the whole
    pipeline: summary, analysis, cleaning, word clouds, pickle export.
    """

    # Punctuation that is turned into spaces before slang correction. A raw
    # string keeps the backslashes for ``re`` instead of having Python parse
    # them as (invalid) string escape sequences.
    _PUNCTUATION = re.compile(r"[#\.\,\!\?\:\;\-\=]")

    def __init__(self, twitter_data, Filepath):
        """Read the dataset.

        Args:
            twitter_data: Location of the CSV file, passed to
                ``pandas.read_csv``.
            Filepath: Destination the cleaned data is pickled to by
                ``save_data``.
        """
        # Kept only so the source can be echoed in progress messages.
        self.twitter_data = twitter_data

        self.data = pd.read_csv(twitter_data)

        self.Filepath = Filepath

    def load_data(self) -> None:
        """Print the number of rows and the column names of the dataset."""
        print('The number of tweets:\n{}\n\n'.format(self.data.shape[0]))
        print('The features or columns in our dataset are {}'.format(list(self.data.columns)))

    def analysis(self) -> None:
        """Save the label bar chart and print hate / non-hate counts and shares."""
        self.plot_barchart()

        total = self.data.shape[0]
        hate_count = int((self.data['label'] == 1).sum())
        normal_count = int((self.data['label'] == 0).sum())

        def percentage(count):
            # An empty dataset has no meaningful share; report 0 rather than
            # failing with ZeroDivisionError.
            return (count / total) * 100 if total else 0.0

        print('The number of hate tweets:\n{}\n\n'.format(hate_count))
        print('The percentage of hate tweets:\n{}\n\n'.format(percentage(hate_count)))
        print('The number of tweets not classified as hate tweets:\n{}\n\n'.format(normal_count))
        print('The percentage of tweets without hate language:\n{}\n\n'.format(percentage(normal_count)))

    @staticmethod
    def _strip_special_chars(text: str) -> str:
        """Lower-case *text* and keep only alphanumeric characters per word.

        Words that end up empty are dropped and the remaining ones are joined
        with single spaces (no leading or trailing whitespace).
        """
        stripped = (
            "".join(char for char in token if char.isalnum()).lower()
            for token in text.split()
        )
        return " ".join(word for word in stripped if word)

    def _clean_tweet(self, tweet: str) -> str:
        """Return the normalised form of a single raw tweet."""
        # The two ``p.clean`` passes use different option sets, so the
        # options are set immediately before each pass.
        p.set_options(p.OPT.URL)
        tweet = p.clean(tweet)

        # Emoticons must be translated BEFORE emojis: convert_emojis replaces
        # every ':' with a space, which would destroy ":(" and friends before
        # they could be looked up.
        tweet = convert_emoticons(tweet)
        tweet = convert_emojis(tweet)

        # Punctuation becomes whitespace, then whitespace is collapsed.
        tweet = ' '.join(self._PUNCTUATION.sub(" ", tweet).split())

        tweet = correction(tweet)

        # Drop whatever mentions, hashtags, reserved words, emojis, smileys
        # and numbers are still left in the text.
        p.set_options(p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED,
                      p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER)
        tweet = p.clean(tweet)

        return self._strip_special_chars(tweet)

    def data_cleaning(self) -> None:
        """Drop unusable rows and normalise the text of every tweet.

        Rows with a null ``label`` and rows whose ``tweet`` duplicates an
        earlier one are removed. Each remaining tweet then has URLs removed,
        emoticons and emojis turned into words, punctuation stripped, slang
        expanded, mentions/hashtags/numbers removed, and is finally reduced
        to lower-case alphanumeric words separated by single spaces.
        """
        self.data = self.data[~self.data['label'].isnull()]
        print('The number of data point remaining after removing all null labels:\n{}\n\n'.format(self.data.shape[0]))

        self.data = self.data[~self.data['tweet'].duplicated()]
        print('The number of data point remaning after removing all duplicate tweets:\n{}\n\n'.format(self.data.shape[0]))

        # Build the cleaned column in one pass and attach it with ``assign``.
        # Writing through ``self.data['tweet'].loc[i] = ...`` is chained
        # assignment on a filtered frame: it raises SettingWithCopyWarning
        # and does not update the frame at all under copy-on-write.
        cleaned = self.data['tweet'].map(self._clean_tweet)
        self.data = self.data.assign(tweet=cleaned)

        print('The tweet text data is processed:\n{}'.format(self.data['tweet']))

    def save_data(self) -> None:
        """Pickle the current DataFrame to ``self.Filepath``."""
        self.data.to_pickle(self.Filepath)

    def data_transform(self) -> None:
        """Run the full pipeline: summary, analysis, cleaning, plots, save."""
        print('LOADING DATA...{}\n\n '.format(self.twitter_data))
        self.load_data()

        print('DATA ANALYSIS...\n\n')
        self.analysis()

        print('CLEANING DATA...\n\n')
        self.data_cleaning()

        # Word clouds are drawn from the cleaned text, hence after cleaning.
        self.plot_hate_normal_tweets()

        print('SAVING DATA IN PICKLE FILE PREPROCESSED {}...\n\n'.format(self.Filepath))
        self.save_data()

        print('Cleaned data saved to pickle file preprocessed in Pickle folder')

    def plot_hate_normal_tweets(self) -> None:
        """Show side-by-side word clouds of non-hate and hate tweets."""
        fig, axs = plt.subplots(1, 2, figsize=(16, 8))

        tweets = self.data['tweet']
        text_normal = " ".join(tweets[self.data.label == 0])
        text_hate = " ".join(tweets[self.data.label == 1])

        cloud_normal = WordCloud(collocations=False, background_color='white').generate(text_normal)
        cloud_hate = WordCloud(collocations=False, background_color='black').generate(text_hate)

        panels = (
            (axs[0], cloud_normal, 'Non-Hate Comments'),
            (axs[1], cloud_hate, 'Hate Comments'),
        )
        for axis, cloud, title in panels:
            axis.imshow(cloud, interpolation='bilinear')
            axis.axis('off')
            axis.set_title(title)

        plt.show()

    def plot_barchart(self) -> None:
        """Draw the per-label tweet counts and save them as 'Count Plot.png'."""
        axes = sns.countplot(x='label', data=self.data)
        plt.title('Label Counts')
        # Tick 0 is label 0 (not hate) and tick 1 is label 1 (hate), matching
        # the meaning of the labels used in ``analysis``; the captions were
        # previously swapped.
        plt.xticks([0, 1], ['Normal', 'Hate'])
        axes.get_figure().savefig('Count Plot.png')
        