Utility functions for the rest of the script

In [1]:
"""
Created on Mon Jul 22 22:44:42 2019

@author: ammar
"""
import pandas as pd
import os as os
import nltk
import re
import matplotlib.pyplot as plt

os.chdir("C:\\Users\\ammar\\Desktop\\Text-Analytics-Msc\\dl_lab")

def is_ascii(s):
    return all(ord(c) < 128 for c in s)

def clean_ascii_sentence(phrase):
    phrase = str(phrase).lower()
    phrase = phrase.replace('<.*?>', '') #remove tags
    phrase = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",phrase).split())
    return phrase

def clean_unicode_sentence(phrase):
    phrase = str(phrase).lower()
    phrase = phrase.replace('<.*?>', '')
    return phrase
    
def seperate_and_clean_data(data):
    ascii_data = pd.DataFrame(columns = ['PhraseNo', 'Phrase' , 'IsHateSpeech' , "Tokens"])
    unicode_data = pd.DataFrame(columns = ['PhraseNo', 'Phrase' , 'IsHateSpeech' , "Tokens"]);

    ascii_Phrase = []    
    ascii_PhraseNo = []
    ascii_IsHateSpeech =  []
    ascii_Tokens = [];
    unicode_Phrase = []    
    unicode_PhraseNo = []
    unicode_IsHateSpeech =  []
    unicode_Tokens = []
    
    for row in data.iterrows():
        ascii_sentence = [];
        unicode_sentence = [];
        default_wt = nltk.word_tokenize
        tokens = default_wt(row[1].Phrase)
        for word in tokens:
            if(is_ascii(word)):
                    word = clean_ascii_sentence(word);
                    if( word != ''):
                        ascii_sentence.append(word)
            elif(word != ''):
                word = clean_unicode_sentence(word);
                unicode_sentence.append(word)
        if(len(ascii_sentence) > 0 ):
            ascii_Tokens.append(ascii_sentence)
            if(row[1].IsHateSpeech=='YES'):
                ascii_IsHateSpeech.append(1)
            else:
                ascii_IsHateSpeech.append(0)
            ascii_PhraseNo.append(row[1].PhraseNo)
            ascii_Phrase.append(tokens)
        if(len(unicode_sentence) > 0 ):
            unicode_Tokens.append(unicode_sentence)
            if(row[1].IsHateSpeech=='YES'):
                unicode_IsHateSpeech.append(1)
            else:
                unicode_IsHateSpeech.append(0)
            unicode_PhraseNo.append(row[1].PhraseNo)
            unicode_Phrase.append(tokens)
        
    ascii_data["Phrase"]=ascii_Phrase
    ascii_data["PhraseNo"]=ascii_PhraseNo
    ascii_data["IsHateSpeech"]=ascii_IsHateSpeech
    ascii_data["Tokens"]=ascii_Tokens
    
    unicode_data["Phrase"]=unicode_Phrase
    unicode_data["PhraseNo"]=unicode_PhraseNo
    unicode_data["IsHateSpeech"]=unicode_IsHateSpeech
    unicode_data["Tokens"]=unicode_Tokens
    
    return ascii_data, unicode_data;


def tokens_to_sentences(tokens):
    sentences = []
    for sent in tokens:
        unique_words = set(sent)
        sentences.append(" ".join(unique_words))
    return sentences


def load_stop_words():
    stop_data = pd.read_csv('stop_words.csv', encoding = 'utf-8') #,encoding = 'ISO-8859-1'
    stopwords = [];
    for index,row in stop_data.iterrows():
        stopwords.append(row[0])
    return stopwords;
        
#create a stemmer
def load_stem_dictionary():
    stem_data = pd.read_csv('sinhalese_stems.csv') #,encoding = 'ISO-8859-1'
    stems = pd.DataFrame(columns = ['Stem', 'Word'])
    dictionary = {};
    for index, row in stem_data.iterrows():
         s = re.split(r'\t+', row[0]);
         dictionary[s[0]]=s[1];
    return dictionary;

def load_stem_df():
    stem_data = pd.read_csv('sinhalese_stems.csv')
    stems = pd.DataFrame(columns = ['Stem', 'Word'])
    
    word = [];
    stem = [];
    for index, row in stem_data.iterrows():
         s = re.split(r'\t+', row[0]);
         stem.append(s[1])
         word.append(s[0])
    stems.Stem = stem;
    stems.Word = word;
    return stems;
    

def word_feats(words, stopwords):
    return dict([(' '.join(word.split()), True) for word in words if ' '.join(word.split()) not in stopwords])


def non_stop_words(sentence_tokens):
    non_stop_tokens = [];
    for sentence in sentence_tokens:
        tokens = []
        token_dictionary = (word_feats(sentence,stopwords))
        for i in token_dictionary.keys():
            tokens.append(i)
        non_stop_tokens.append(tokens)
    return non_stop_tokens;


def get_stem(word):
    try:
        stem = stem_df[stem_df.Word == ' '.join(word.split())].Stem;
        if(len(stem) > 0):
            return stem.max()
        else:
            return word
        return 
    except Exception:
        return word


def stem_all(sentences):
    stem_sentences = [];
    cnt = 0;
    for sentence in sentences:
 #       print(sentence)
        cnt += 1;
        stems = []
        for word in sentence:
            i = get_stem(word)
            stems.append(i);
        stem_sentences.append(stems)
    return stem_sentences;



def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()


Cleaning data

In [2]:
data=pd.read_csv('labelled_data.csv') #,encoding = 'ISO-8859-1'
ascii_dataset , unicode_dataset= seperate_and_clean_data(data)

stem_dictionary = load_stem_dictionary()
stopwords = load_stop_words()
stem_df = load_stem_df()

#stop word removal, stemming
#dataset.Tokens = extract_tokens(dataset.Tokens)
unicode_dataset.NonStop = non_stop_words(unicode_dataset.Tokens)
unicode_dataset.Stems = stem_all(unicode_dataset.NonStop)


df = pd.DataFrame(columns= ["sentence", "label"]);
df['sentence'] = tokens_to_sentences(unicode_dataset.Stems);
df['label'] = unicode_dataset.IsHateSpeech


  # Remove the CWD from sys.path while we load stuff.
  # This is added back by InteractiveShellApp.init_path()


Splitting test and training sets

In [3]:
sentences_train, sentences_test, y_train, y_test = train_test_split(df['sentence'],df['label'] , test_size=0.25, random_state = 1000)

NameError: name 'train_test_split' is not defined

Vectoring training and test sentences

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)


Creating and training nueral network

In [None]:
from keras.models import Sequential
from keras import layers

input_dim  = X_train.shape[1]

model = Sequential()
model.add(layers.Dense( 10, input_dim=input_dim , activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy" , 
              optimizer = "adam",
              metrics = ['accuracy']
              )
model.summary()
history  = model.fit(X_train, y_train, 
                     epochs = 100,  
                     verbose = False, 
                     validation_data= (X_test, y_test),
                     batch_size = 10
                     )


Evaluation

In [None]:
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))
plt.style.use('ggplot')

plot_history(history)
