## Imports

In [None]:
import pandas as pd
import numpy as np
import string
import random
from termcolor import colored
from collections import defaultdict 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import skipthoughts
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
import spacy
import unidecode
from sklearn import preprocessing
import ktrain
from ktrain import text

## Tokenizer

In [None]:
# spaCy pipeline used by `tokenizer` for lemmas and punctuation/whitespace/stop-word flags.
tok_nlp = spacy.load('en_core_web_sm')

def tokenizer(sentence):
    """Normalise a sentence into a space-joined string of lemmas.

    The text is lower-cased and transliterated with ``unidecode``, parsed
    with ``tok_nlp``, and every token flagged as a stop word, punctuation or
    whitespace is dropped. The remaining tokens contribute their stripped
    lemma.

    NOTE: a second ``tokenizer`` is defined in the next notebook cell and
    replaces this one as soon as that cell is executed.

    Args:
        sentence: Raw input string.

    Returns:
        The kept lemmas joined by single spaces.
    """
    sentence = unidecode.unidecode(sentence.lower())
    lemmas = [
        token.lemma_.strip()
        for token in tok_nlp(sentence)
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return ' '.join(lemmas)

In [None]:
def tokenizer(sentence):
    """Lower-case, transliterate and lemmatise a sentence.

    Punctuation and whitespace tokens are skipped. Every other token
    contributes its lemma, except tokens whose lemma is the literal
    ``'-PRON-'`` (presumably spaCy's pronoun placeholder), which contribute
    their surface text instead.

    Args:
        sentence: Raw input string.

    Returns:
        The kept pieces joined by single spaces.
    """
    normalised = unidecode.unidecode(sentence.lower())
    pieces = []
    for tok in tok_nlp(normalised):
        if tok.is_punct or tok.is_space:
            continue
        # Keep the written form when the lemma is only a placeholder.
        piece = tok.text if tok.lemma_ == '-PRON-' else tok.lemma_
        pieces.append(piece.strip())

    return ' '.join(pieces)

In [None]:
# Quick manual check of the tokenizer on a sample phrase.
tokenizer("how are you doing apple?")

## Helper Functions

In [None]:
def get_accuracy(preds, labels):
    """Print the accuracy, the hit count and a per-class report.

    ``accuracy_score(..., normalize=False)`` returns the *number* of correct
    predictions, so it is reported here as ``correct/total`` next to the
    actual accuracy ratio instead of being labelled "Accuracy" on its own.

    Args:
        preds: Predicted labels.
        labels: True labels, same length as ``preds``.
    """
    correct = metrics.accuracy_score(y_true=labels, y_pred=preds, normalize=False)
    total = len(labels)
    print("Accuracy:", correct / total, "(%d/%d correct)" % (correct, total))
    print(metrics.classification_report(y_true=labels, y_pred=preds))
    
def save_model_results(preds):
    """Dump the predictions into ``temp.xlsx`` in the working directory."""
    results_frame = pd.DataFrame(preds)
    results_frame.to_excel("temp.xlsx")

## Reading Train and Test Data

**Random_State = 45**

In [None]:
# Load the phrase sheet, drop its first row and keep columns 1 and 2 as (Text, Label).
df = pd.read_excel('Training Phrases.xlsx', sheet_name='Testing of Models', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]
# Texts are stored already tokenized, so later cells must not tokenize them again.
df['Text'] = df['Text'].apply(tokenizer)

# Fixed-seed 80/20 split; the test portion is re-sorted by its original row index.
x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# Manually written e-mail test cases: first row dropped, e-mails tokenized like the phrases.
test_emails_df = pd.read_excel('Training Phrases.xlsx', sheet_name='Manual Test cases', nrows=23)
test_emails_df = test_emails_df.drop(test_emails_df.index[0])
test_emails_df.columns = ["Label", "Email"]
test_emails_df['Email'] = test_emails_df['Email'].apply(tokenizer)
# Lower-case each label and replace spaces with underscores.
test_emails_df['Label'] = test_emails_df['Label'].apply(lambda label: '_'.join(label.lower().split(' ')))

In [None]:
# Distinct labels; `classes` is reused by later cells (spaCy textcat, ktrain, label encoder).
classes = df.Label.unique()
print("Total classes: ", len(classes))
# Number of phrases per label.
df.Label.value_counts()

## Reformatting Data to Reduce Classes

In [None]:
# Fine-grained labels that belong to the "job_application" group even though
# they do not contain the substring "job".
_JOB_APPLICATION_LABELS = {
    "cv_past_experience",
    "application_status",
    "howto_apply",
    "feedback",
    "multiple_role",
    "reinstate_application",
}


def _coarse_label(value):
    """Map one fine-grained label onto one of the four coarse classes."""
    if "assessment" in value:
        return "assessment"
    if "interview" in value:
        return "interview"
    if "job" in value or value in _JOB_APPLICATION_LABELS:
        return "job_application"
    return "job_details"


# Build the whole column at once: DataFrame.append was removed in pandas 2.0
# and copied the frame on every call.
new_df = pd.DataFrame({'Label': [_coarse_label(value) for value in df['Label']]})

new_df.to_excel("4 Classes.xlsx")

### Assessment Train Data

In [None]:
# Keep the assessment sub-labels and collapse every other label to "other".
# The column is built in one pass because DataFrame.append was removed in
# pandas 2.0 and copied the frame on every call.
new_df = pd.DataFrame({
    'Label': [value if "assessment" in value else "other" for value in df['Label']]
})

# Pair the tokenized texts with the new labels by position and save them.
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Assessment Train.xlsx")

### Interview Train Data

In [None]:
# Keep the interview sub-labels and collapse every other label to "other".
# The column is built in one pass because DataFrame.append was removed in
# pandas 2.0 and copied the frame on every call.
new_df = pd.DataFrame({
    'Label': [value if "interview" in value else "other" for value in df['Label']]
})

# Pair the tokenized texts with the new labels by position and save them.
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Interview Train.xlsx")

### Job Details Train Data

In [None]:
# Labels kept verbatim in this sheet (in addition to any label containing "job").
kept_labels = {
    "cv_past_experience",
    "application_status",
    "howto_apply",
    "feedback",
    "multiple_role",
    "reinstate_application",
}

# NOTE(review): these are the labels the 4-class cell maps to
# "job_application", yet the sheet is saved as "Job Detail Train.xlsx" -
# confirm the file name is intended.
# The column is built in one pass because DataFrame.append was removed in
# pandas 2.0 and copied the frame on every call.
new_df = pd.DataFrame({
    'Label': [
        value if ("job" in value or value in kept_labels) else "other"
        for value in df['Label']
    ]
})

# Pair the tokenized texts with the new labels by position and save them.
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Job Detail Train.xlsx")

### Job Application Train Data

In [None]:
# Labels kept verbatim in this sheet; everything else becomes "other".
kept_labels = {
    "salary",
    "work_experience",
    "age_limit",
    "special_needs_at_work",
    "disability",
}

# NOTE(review): these labels fall into the "job_details" bucket of the
# 4-class cell, yet the sheet is saved as "Job Application Train.xlsx" -
# confirm the file name is intended.
# The column is built in one pass because DataFrame.append was removed in
# pandas 2.0 and copied the frame on every call.
new_df = pd.DataFrame({
    'Label': [value if value in kept_labels else "other" for value in df['Label']]
})

# Pair the tokenized texts with the new labels by position and save them.
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Job Application Train.xlsx")

## To-Do

1. Remove salutations from emails
1. Random Forests
1. Ensure word embeddings are correct
1. Test and Train accuracies should be similar to avoid overfitting
1. Reduce the test set

## Sklearn

### Saving Model Output In Excel File

In [None]:
# Generating mapping column
a = pd.DataFrame(preds)
b = pd.DataFrame(df['Label'])
a.columns = ['Label']
b = b.reset_index(drop=True)
a = a.reset_index(drop=True)

(a==b).to_excel("temp.xlsx")

In [None]:
# Requires `robert_predictions_labels` from the "Loading Predictor" cells further down.
pd.DataFrame(robert_predictions_labels).to_excel("preds.xlsx")

### Training and Testing

#### Multi Stage Models

In [None]:
# Stage one of the multi-stage setup: a classifier over the four coarse classes.
df = pd.read_excel('Training Phrases - 4 Classes.xlsx', sheet_name='Testing of Models', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]
df['Text'] = df['Text'].apply(tokenizer)

# Same seed and test size as the other splits; test rows re-sorted by row index.
x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# TF-IDF 1-3-gram features concatenated with spaCy document vectors, fed to an SVC.
# `SpacyVectorTransformer` and `nlp` come from the "spaCy Vectorizer" cell, which must be run first.
outer_pipe = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

outer_pipe.fit(x_train, y_train)
predicted = outer_pipe.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

In [None]:
# Second-stage model to consult for each coarse class predicted by `outer_pipe`.
# NOTE(review): `job_app_model` is trained on "Job Application Train.xlsx",
# whose kept labels (salary, age_limit, ...) are the ones the 4-class cell maps
# to "job_details", and vice versa for `job_detail_model` - confirm that this
# routing sends each phrase to the model that knows its fine-grained labels.
stage_two_models = {
    'job_application': job_app_model,
    'interview': interview_model,
    'job_details': job_detail_model,
    'assessment': assessment_model,
}

total_correct = 0

for v, actual in zip(x_test, y_test):
    # predict() returns an array; take its single element so the lookup works
    # on a plain label instead of relying on one-element array truthiness.
    main_class = outer_pipe.predict([v])[0]
    sub_model = stage_two_models.get(main_class)
    # An unknown coarse class leaves the prediction as None (counted as wrong).
    pred = sub_model.predict([v])[0] if sub_model is not None else None

    if actual == pred:
        total_correct += 1
    else:
        print ("\nConfused:", v, "\nActual:", actual, "\nPredicted:", pred)

print("\nACCURACY: ", total_correct / x_test.shape[0])

In [None]:
# Stage-two model for the "assessment" group, trained on the sheet written by the
# "Assessment Train Data" cell (texts in that sheet are already tokenized).
# NOTE(review): the first row is dropped here as in the raw-sheet cells; confirm it is not a real sample in this file.
df = pd.read_excel('Assessment Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# Same feature union + SVC configuration as the other multi-stage models.
assessment_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

assessment_model.fit(x_train, y_train)
predicted = assessment_model.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

In [None]:
# Stage-two model for the "interview" group, trained on the sheet written by the
# "Interview Train Data" cell (texts in that sheet are already tokenized).
# NOTE(review): the first row is dropped here as in the raw-sheet cells; confirm it is not a real sample in this file.
df = pd.read_excel('Interview Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# Same feature union + SVC configuration as the other multi-stage models.
interview_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

interview_model.fit(x_train, y_train)
predicted = interview_model.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

In [None]:
# Stage-two model trained on the sheet written by the "Job Application Train Data" cell
# (texts in that sheet are already tokenized).
# NOTE(review): the first row is dropped here as in the raw-sheet cells; confirm it is not a real sample in this file.
df = pd.read_excel('Job Application Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# Same feature union + SVC configuration as the other multi-stage models.
job_app_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

job_app_model.fit(x_train, y_train)
predicted = job_app_model.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

In [None]:
# Stage-two model trained on the sheet written by the "Job Details Train Data" cell
# (texts in that sheet are already tokenized).
# NOTE(review): the first row is dropped here as in the raw-sheet cells; confirm it is not a real sample in this file.
df = pd.read_excel('Job Detail Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

# Same feature union + SVC configuration as the other multi-stage models.
job_detail_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

job_detail_model.fit(x_train, y_train)
predicted = job_detail_model.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

#### Single Model

In [None]:
# NOTE: this standalone estimator is not part of `pipe`; the pipeline below
# builds its own SVC, which additionally uses class_weight='balanced'.
classifier = SVC(C=150, gamma=0.02, probability=True)

# Single-stage model: TF-IDF 1-3-grams plus spaCy document vectors into an SVC.
pipe = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ('tfid',  TfidfVectorizer(ngram_range=(1, 3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True, class_weight='balanced'))
    ])

pipe.fit(x_train, y_train)
# Class probabilities, kept under the name other cells may read.
predicted = pipe.predict_proba(x_test)
# Predict the labels once and reuse them; every predict() call re-runs the
# whole feature pipeline, including the spaCy vectorisation.
predicted_labels = pipe.predict(x_test)

# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted_labels, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted_labels))

#### Testing on Emails

1. SVC = 6/21
1. roberta-untok = 11/21
1. roberta-large = 9/21
1. roberta-v4 = 13/21

In [None]:
# Evaluate `predictor` (created in the KTrain section) on the manual e-mail test cases.
email_preds = predictor.predict(test_emails_df['Email'].values)
# normalize=False makes accuracy_score report the number of correct predictions, not a ratio.
print("Accuracy:", metrics.accuracy_score(y_true=test_emails_df['Label'].values, y_pred=email_preds, normalize=False))
print(metrics.classification_report(y_true=test_emails_df['Label'].values, y_pred=email_preds))

### Extensive Model Comparison

In [None]:
total_a = 0
total_b = 0

# The sheet and its tokenization are identical for every iteration, so load
# them once; only the split seed changes inside the loop.
df = pd.read_excel('Training Phrases.xlsx', sheet_name='Testing of Models', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]
df['Text'] = df['Text'].apply(tokenizer)

total_iters = 10
for i in range(total_iters):
    print("\nIteration: ", i)
    x_train, x_test, y_train, y_test = train_test_split(df["Text"], df["Label"], random_state=i, test_size=0.2)

    # Model A: the already-built `ensemble` from the "Creating Ensemble" cell.
    # NOTE(review): it is not refit per split, so these test rows may overlap
    # with the data it was trained on - confirm before comparing with B.
    predicted = ensemble.predict(x_test.values)
    accuracy_a = metrics.accuracy_score(y_test, predicted)
    total_a += accuracy_a
    print("Accuracy A:", accuracy_a)

    # Model B: TF-IDF + spaCy vectors into an SVC, refit on this split.
    classifier = SVC(C=150, gamma=0.02, probability=True)
    pipe = Pipeline(
        steps=[
            ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
            )),
            ("classifier", classifier),
        ]
    )
    pipe.fit(x_train, y_train)
    predicted = pipe.predict(x_test)

    accuracy_b = metrics.accuracy_score(y_test, predicted)
    total_b += accuracy_b
    print("Accuracy B:", accuracy_b)

print("\nAverage Accuracy A:", total_a/total_iters)
print("Average Accuracy B:", total_b/total_iters)

### spaCy Vectorizer

In [None]:
import spacy 
from sklearn.base import BaseEstimator, TransformerMixin
# spaCy pipeline whose document vectors feed SpacyVectorTransformer(nlp) in the sklearn pipelines.
nlp = spacy.load("en_core_web_lg")  

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that maps each text to its spaCy document vector.

    Args:
        nlp: A loaded spaCy pipeline; it is called once per text in
            ``transform``.
    """

    def __init__(self, nlp):
        self.nlp = nlp

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn.

        ``y`` is optional so that a plain ``fit(X)`` works, following the
        scikit-learn transformer convention.
        """
        return self

    def transform(self, X):
        """Return a list holding ``self.nlp(text).vector`` for every text in ``X``."""
        return [self.nlp(text).vector for text in X]

### Embeddings Vectorizer

In [None]:
class GloveTransformer(TransformerMixin):
    """Transformer that maps each document to its number of VERB tokens.

    Despite the class name, the visible code uses no GloVe embeddings: each
    document is parsed with ``tok_nlp`` and the tokens tagged ``VERB`` are
    counted.
    """

    def fit(self, X, y=None):
        # Nothing to learn.
        return self

    def transform(self, X):
        verb_counts = []
        for doc in X:
            verb_counts.append(sum(1 for tok in tok_nlp(doc) if tok.pos_ == u'VERB'))
        return verb_counts

    def fit_transform(self, X, y=None):
        fitted = self.fit(X)
        return fitted.transform(X)


glove_transformer = GloveTransformer()

## spaCy 

In [None]:
def create_cat_dict(label):
    """Build a one-hot style category dict for ``label``.

    Every entry of the global ``classes`` becomes a key; its value is the
    result of comparing that class with ``label``.
    """
    return {class_: (class_ == label) for class_ in classes}


def get_preds(texts, labels, output=False):
    """Run the global ``nlp`` text classifier over ``texts`` and score it.

    Args:
        texts: Iterable of phrases.
        labels: Iterable of category dicts, parallel to ``texts``; the first
            key whose value equals ``True`` is taken as the gold label.
        output: When true, print every misclassified phrase.

    Returns:
        ``(total_correct, wrong_preds, preds)``: the number of hits, a
        ``defaultdict`` counting misses per gold label, and the list of
        predicted labels in input order.
    """
    hits = 0
    misses = defaultdict(int)
    predictions = []

    for phrase, cat_flags in zip(texts, labels):
        # Equality with True (not mere truthiness) picks the gold category.
        gold = [name for name, flag in cat_flags.items() if flag == True][0]
        scores = nlp(phrase).cats
        guess = max(scores, key=scores.get)
        predictions.append(guess)

        if gold == guess:
            hits += 1
            continue

        misses[gold] += 1
        if output:
            print("\n", "-"*15, "\nPhrase:", phrase)
            print("Label:", gold)
            print("Prediction:", guess)

    return (hits, misses, predictions)


def load_data(split=0.80, tok=False):
    """Return every phrase of the global ``df`` with its category dict.

    Args:
        split: Unused. Retained so existing calls keep working; no train/dev
            split is performed and all rows are returned.
        tok: When true, each text is passed through ``tokenizer`` first.

    Returns:
        ``(texts, cats)``: a tuple of texts and a list of the dicts produced
        by ``create_cat_dict`` for the matching labels, in row order.

    Side effects:
        Stores the ``(text, label)`` pairs in ``df['tuples']``.
    """
    if tok:
        df['tuples'] = df.apply(lambda row: (tokenizer(row['Text']), row['Label']), axis=1)
    else:
        df['tuples'] = df.apply(lambda row: (row['Text'], row['Label']), axis=1)

    texts, labels = zip(*df['tuples'].tolist())
    cats = [create_cat_dict(y) for y in labels]
    return (texts, cats)

In [None]:
# (train_texts, train_cats), (dev_texts, dev_cats) = load_data(tok=True)
# Tokenized texts and category dicts for every row of `df`.
(texts, cats) = load_data(tok=True)

In [None]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

from spacy.util import minibatch, compounding


# Parameters
# n_iter: passes over the training data; drop: dropout rate passed to nlp.update.
n_iter = 40
drop = 0.15
architecture = "simple_cnn"


# for i in range(0,5):
# NOTE(review): `load_data_v2` is not defined in the visible notebook (only `load_data` is) - confirm where it comes from.
(train_texts, train_cats), (dev_texts, dev_cats) = load_data_v2()

# NOTE: this rebinds the global `nlp` that the sklearn cells pass to SpacyVectorTransformer(nlp).
nlp = spacy.blank("en")  
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": architecture}
)
nlp.add_pipe(textcat, last=True)

# One output category per label found in the training sheet.
for label in classes:
    textcat.add_label(label)

train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

# Disable every pipe not listed in `pipe_exceptions` while training.
pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  
    optimizer = nlp.begin_training()
    print("Training the model...")
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        # Reshuffle before each pass over the data.
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            # NOTE: `texts` here overwrites the global `texts` returned by load_data.
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)

# Model Results

# Score the trained textcat model on both portions; get_preds returns (hits, misses per label, predictions).
train_total_correct, train_inaccuracies, train_preds = get_preds(train_texts, train_cats)
test_total_correct, test_inaccuracies, test_preds = get_preds(dev_texts, dev_cats)

# Combined figures over train + dev rows.
total_correct = test_total_correct + train_total_correct
total_rows = len(dev_texts) + len(train_texts)

print("\n[TEST SET RESULTS]\n",
      "   \nTotal Correct:", test_total_correct, 
      "   \nTotal Wrong:", len(dev_texts)-test_total_correct, 
      "   \nTEST-SET ACCURACY: ", test_total_correct/len(dev_texts),
      "   \nIncorrect Predictions:\n ", test_inaccuracies
     )

print("\n[TRAIN SET RESULTS]\n",
      "   \nTotal Correct:", train_total_correct, 
      "   \nTotal Wrong:", len(train_texts)-train_total_correct, 
      "   \nTRAIN-SET ACCURACY: ", train_total_correct/len(train_texts),
      "   \nIncorrect Predictions:\n ", train_inaccuracies
     )

print("\n[OVERALL RESULTS]\n",
      "   \nTotal Correct:", total_correct, 
      "   \nTotal Wrong:", total_rows-total_correct,
      "   \nOVERALL ACCURACY: ", total_correct/total_rows)


## Data Augmentation

In [None]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
# Two word-substitution augmenters backed by local fastText and word2vec embedding files.
fast_aug = naw.WordEmbsAug(
    model_type='fasttext', model_path='./models/wiki-news-300d-1M.vec',
    action="substitute")

w2c_aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='./models/GoogleNews-vectors-negative300.bin',
    action="substitute")

# NOTE: this string rebinds the name `text`, which the imports at the top of the notebook bind to `ktrain.text`.
text = "What is the process of applying for this job?"
print("Original:", text)
print("Augmented (fasttext):", fast_aug.augment(text) )
print("Augmented (w2v):", w2c_aug.augment(text) )

In [None]:
# Contextual substitution with distilbert; reuses the sample `text` defined in the previous cell.
aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

## Ludwig

In [None]:
from ludwig.api import LudwigModel
import logging

**model_definition = {'input_features': [{'name': 'Text', 'type': 'text'}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}]}**
                    
 56%
 
 **model_definition = {'input_features': [{'name': 'Text', 'type': 'text', 'encoder': 'rnn'}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}],
                    'training': {'epochs': 25}}**
                    
 30%
 
 **model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}],
                    'training': {'epochs': 25}}**
                    
48%

**model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}]}**
                    
51%

In [None]:
data = pd.read_csv("Train.csv")
data = data.dropna()
# Tokenize the frame that is actually used for training below. (The previous
# code tokenized the unrelated global `df`, leaving `data` raw.)
data['Text'] = data['Text'].apply(tokenizer)

# Class balance; only displayed when this is the last expression of a cell.
data.Label.value_counts()
training_dataframe, validation_dataframe = train_test_split(data,
                                                      test_size=0.2,
                                                      random_state=42
                                                      )
# Give the validation rows a 0..n-1 index so the evaluation cell can look
# rows up by position number.
validation_dataframe.reset_index(inplace=True)

model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "level": "word", "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}], 'training': {'epochs':50}}

model = LudwigModel(model_definition)

training_stats = model.train(training_dataframe, logging_level=logging.INFO)

In [None]:
predictions_dataframe = model.predict(validation_dataframe)

# Count the rows whose predicted label equals the gold label. Rows are looked
# up by their number in both frames.
total_correct = sum(
    1
    for i in range(0, validation_dataframe.shape[0])
    if predictions_dataframe.Label_predictions[i] == validation_dataframe.Label[i]
)

print("\n\nACCURACY: ", total_correct/validation_dataframe.shape[0])

## FastText

In [None]:
import fasttext

In [None]:
# Load the CSV, drop incomplete rows and make a fixed-seed 80/20 split for fastText.
data = pd.read_csv("Train.csv")
data = data.dropna()
training_df, validation_df = train_test_split(data, test_size=0.20, random_state=45)

def create_file(df, filename):
    """Write ``df`` as a fastText supervised-learning file at ``<filename>.txt``.

    Each row becomes one record ``__label__<label> <text>``, where underscores
    in the label are replaced by hyphens and the text has its line breaks
    removed before being passed through ``tokenizer``. Every record is
    preceded by a newline, so the file starts with an empty line.

    Args:
        df: Frame with ``Label`` and ``Text`` string columns.
        filename: Output path without the ``.txt`` extension.
    """
    # The context manager closes the file even if a row fails to convert;
    # UTF-8 is written explicitly instead of the platform default encoding.
    with open(filename + '.txt', "w", encoding="utf-8") as out:
        for _, row in df.iterrows():
            label = row['Label'].replace("_", "-")
            text = row['Text'].replace('\r', '').replace('\n', '')
            text = tokenizer(text)

            out.write('\n__label__' + label + ' ' + text)
    
# Produces fasttext-train.txt and fasttext-val.txt in the working directory.
create_file(training_df, 'fasttext-train')
create_file(validation_df, 'fasttext-val')

In [None]:
# Train on the file written above; the validation file is handed to fastText's autotune option.
model = fasttext.train_supervised(input="fasttext-train.txt", autotuneValidationFile='fasttext-val.txt')                            

In [None]:
# Predicted labels come back as "__label__<name>"; this many characters are cut off.
LABEL_PREFIX_LEN = len('__label__')

total_correct = 0
preds = []
for index, row in data.iterrows():
    # Prepare the text exactly as create_file() did for the training data:
    # strip line breaks, then tokenize. (Previously the tokenized text was
    # overwritten by the raw text before prediction.)
    text = tokenizer(row['Text'].replace('\r', '').replace('\n', ''))
    label = row['Label'].replace("_", "-")
    pred = model.predict(text)[0][0][LABEL_PREFIX_LEN:]

    if label == pred:
        total_correct += 1

    # Store predictions with the original underscore spelling of the labels.
    pred = pred.replace('-', '_')
    preds.append(pred)

# NOTE(review): this scores all of `data`, i.e. the training rows as well as
# the validation rows - confirm that is the intended figure.
print("\n\nACCURACY: ", total_correct/data.shape[0])

## Model Parameter Optimization

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Reload the CSV data for the parameter search, drop incomplete rows and tokenize the texts.
df = pd.read_csv("Train.csv")
df = df.dropna()
df['Text'] = df['Text'].apply(tokenizer)

In [None]:
def Vectorize(vec, X_train, X_test):
    """Fit ``vec`` on the training texts and transform both text sets.

    Args:
        vec: Vectorizer exposing ``fit_transform`` and ``transform``.
        X_train: Training documents; the vectorizer is fitted on these.
        X_test: Test documents; transformed with the fitted vectorizer only.

    Returns:
        ``(train_matrix, test_matrix)`` as produced by the vectorizer.
    """
    train_matrix = vec.fit_transform(X_train)
    test_matrix = vec.transform(X_test)

    print('Vectorization complete.\n')
    return train_matrix, test_matrix


def ML_modeling(models, params, X_train, X_test, y_train, y_test):
    """Tune each estimator with a randomized search and print its test scores.

    Args:
        models: Mapping of name -> estimator.
        params: Mapping of name -> parameter distributions; must contain an
            entry for every key of ``models``.
        X_train, y_train: Data used for the 5-fold search and the final refit.
        X_test, y_test: Data used for the printed macro-averaged scores.

    Raises:
        ValueError: If some key of ``models`` has no entry in ``params``.
    """
    missing = set(models.keys()) - set(params.keys())
    if missing:
        raise ValueError('Some estimators are missing parameters')

    for name, estimator in models.items():
        search = RandomizedSearchCV(estimator, params[name], cv=5, error_score=0, refit=True)
        search.fit(X_train, y_train)
        y_pred = search.predict(X_test)

        # Print scores for the classifier
        print(name, ':', search.best_params_)
        scores = (
            precision_score(y_test, y_pred, average='macro'),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
        )
        print("Precision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % scores)
    
# RandomForestClassifier is not imported anywhere else in the notebook, so
# bring it into scope here; without this the cell fails with a NameError.
from sklearn.ensemble import RandomForestClassifier

models = {
    'Model': RandomForestClassifier()
}

# NOTE(review): 'auto' for max_features may be rejected by newer scikit-learn
# releases; with error_score=0 in ML_modeling such candidates would score 0
# instead of failing - confirm against the installed version.
params = {
    'Model': { 
           'n_estimators': [200, 300, 400, 500],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : [4,5,6,7,8],
            'criterion' :['gini', 'entropy']
       },
}

# Encode label categories to numbers
enc = LabelEncoder()
df['Label'] = enc.fit_transform(df['Label'])
labels = list(enc.classes_)

# Train-test split and vectorize
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.2)
X_train_vec, X_test_vec = Vectorize(TfidfVectorizer(ngram_range=(1,3)), X_train, X_test)

ML_modeling(models, params, X_train_vec, X_test_vec, y_train, y_test)

In [None]:
# NOTE(review): these parameter names address steps called "bow"/"tfidf" and
# random-forest options (bootstrap, n_estimators, ...), while the `pipe`
# defined in this notebook has a "tfid" step and an SVC classifier - confirm
# which pipeline this search is meant for.
params = {
    "combined_features__bow__tfidf__use_idf": [True, False],
    "combined_features__bow__tfidf__ngram_range": [(1, 1), (1, 2)],
    "classifier__bootstrap": [True, False],
    "classifier__class_weight": ["balanced", None],
    "classifier__n_estimators": [100, 300, 500, 800, 1200],
    "classifier__max_depth": [5, 8, 15, 25, 30],
    "classifier__min_samples_split": [2, 5, 10, 15, 100],
    "classifier__min_samples_leaf": [1, 2, 5, 10]
}
search = RandomizedSearchCV(pipe, params)
search.fit(x_train, y_train)
y_pred = search.predict(x_test)
# `classification_report` is only available through the imported `metrics`
# module; print it so the report is rendered with real line breaks.
print(metrics.classification_report(y_test, y_pred))

## Keras

In [None]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Path of the pre-trained word-vector text file parsed into `embeddings_index`.
EMBEDDING_FILE = 'models/glove.6B.300d.txt'

In [None]:
# max_features: vocabulary size cap for the Keras tokenizer and the Embedding layer.
max_features = 700
# maxlen: length every token sequence is padded/truncated to.
maxlen = 70
# embed_size: width of each embedding vector.
embed_size = 300
# threshold: probability cut-off F1Evaluation applies to the model output.
threshold = 0.35

In [None]:
# `text` must be keras.preprocessing.text here (the Keras import cell has to run right before this one).
keras_tokenizer = text.Tokenizer(num_words=max_features)
# The vocabulary is built from the train and test texts together.
keras_tokenizer.fit_on_texts(list(x_train) + list(x_test))
# NOTE: x_train / x_test are overwritten with padded integer sequences from here on.
x_train = keras_tokenizer.texts_to_sequences(x_train)
x_test = keras_tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

In [None]:
def get_coefs(word, *arr):
    """Split one parsed embedding line into ``(word, float32 vector)``.

    Args:
        word: The token at the start of the line.
        *arr: The remaining fields, converted to a ``float32`` array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector

# Parse the embedding file into {word: vector}. The context manager closes the
# file handle once the dict is built (it used to be left open).
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in embedding_file)

In [None]:
word_index = keras_tokenizer.word_index
# Keras word indices start at 1, so the highest index equals len(word_index);
# one extra row accounts for index 0.
nb_words = min(max_features, len(word_index) + 1)
# Always allocate max_features rows: the model builds
# Embedding(max_features, embed_size, weights=[embedding_matrix]), and sizing
# the matrix by vocabulary length raised an IndexError on the last word
# whenever the vocabulary was smaller than max_features.
embedding_matrix = np.zeros((max_features, embed_size))

for word, i in word_index.items():
    # Words beyond the vocabulary cap get no row.
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the embedding file keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
class F1Evaluation(Callback):
    """Keras callback that prints the F1 score on held-out data after epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; it is unpacked in the
            constructor, so the empty default raises ``ValueError``.
        interval: Evaluate on every epoch whose zero-based number is a
            multiple of this value.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; super(Callback, self)
        # skipped Callback.__init__ and only ran its parent's initialiser.
        super(F1Evaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Print the F1 score of the thresholded predictions for this epoch."""
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            # Binarise the outputs with the notebook-level `threshold`.
            y_pred = (y_pred > threshold).astype(int)
            score = f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
# Heights (in tokens) of the convolution kernels used by get_model, one branch each.
filter_sizes = [1,2,3,5]
# Number of filters per convolution branch.
num_filters = 42

def get_model():
    """Build and compile the multi-branch text CNN.

    The padded token sequence is embedded with ``embedding_matrix`` as initial
    weights and reshaped to a single-channel 2D map. Four parallel Conv2D
    branches, one per entry of the first four ``filter_sizes``, are each
    max-pooled over the full remaining sequence length, concatenated,
    flattened, passed through dropout and fed to a single sigmoid unit.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    inp = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    embedded = Reshape((maxlen, embed_size, 1))(embedded)

    branch_sizes = filter_sizes[:4]

    # All convolution layers are created first, then all pooling layers.
    conv_outputs = [
        Conv2D(num_filters, kernel_size=(size, embed_size),
               kernel_initializer='he_normal', activation='tanh')(embedded)
        for size in branch_sizes
    ]
    pooled_outputs = [
        MaxPool2D(pool_size=(maxlen - size + 1, 1))(conv)
        for size, conv in zip(branch_sizes, conv_outputs)
    ]

    merged = Concatenate(axis=1)(pooled_outputs)
    merged = Flatten()(merged)
    merged = Dropout(0.1)(merged)

    outp = Dense(1, activation="sigmoid")(merged)

    cnn = Model(inputs=inp, outputs=outp)
    cnn.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

    return cnn

# NOTE: rebinds the notebook-level name `model`, which other sections (Ludwig, fastText, ktrain) also assign.
model = get_model()

In [None]:
batch_size = 256
epochs = 2

# Callback printing the F1 score on the test split after every epoch.
F1_Score = F1Evaluation(validation_data=(x_test, y_test), interval=1)

# NOTE(review): the model ends in one sigmoid unit with binary cross-entropy - confirm y_train / y_test are binary here.
hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(x_test, y_test),
                 callbacks=[F1_Score], verbose=2)

## KTrain

### Training Predictor

In [None]:
# Re-importing here also restores `text` to the ktrain module in case another
# cell has rebound the name `text` to a sample string.
import ktrain
from ktrain import text

# Fine-tune a pretrained transformer on the train split, validating on the
# test split. `classes` is defined in another cell.
MODEL_NAME = 'roberta-large' 
t = text.Transformer(MODEL_NAME, classes=classes)
trn = t.preprocess_train(x_train.values, y_train.values)
val = t.preprocess_test(x_test.values, y_test.values)
# NOTE: this rebinds the global `model`, which the CNN cell also assigns.
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)
# NOTE(review): arguments are presumably (learning rate, epochs) per the
# ktrain API — confirm.
learner.fit_onecycle(5e-5, 6)

In [None]:
# Evaluate the fine-tuned model on the validation data given to the learner.
learner.validate()

In [None]:
# Bundle the trained model with its preprocessor `t` so that raw texts can be
# passed straight to predict()/predict_proba().
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
# Label the held-out split with the transformer predictor and report the
# number of correct predictions plus the per-class report via the shared
# helper (same output as printing both metrics inline).
predictions = predictor.predict(x_test.values)
get_accuracy(predictions, y_test.values)

In [None]:
# Same evaluation on the separate e-mail test set: predict from the 'Email'
# column and score against the 'Label' column with the shared helper.
email_preds = predictor.predict(test_emails_df['Email'].values)
get_accuracy(email_preds, test_emails_df['Label'].values)

In [None]:
# Score the predictor on every row of `df` ('Text' vs. 'Label').
# NOTE: this overwrites the `predictions` computed for the test split.
predictions = predictor.predict(df['Text'].values)
get_accuracy(predictions, df['Label'].values)

In [None]:
# Persist the predictor to this path.
# NOTE(review): the path looks like a mounted Google Drive folder — confirm
# it exists in the environment where this cell is run.
predictor.save('/content/gdrive/My Drive/roberta-v4')

### Loading Predictor

In [None]:
# Restore a previously saved predictor.
# NOTE(review): this is a relative path, whereas the save cell writes to
# '/content/gdrive/My Drive/roberta-v4' — confirm the working directory or
# that the folder has been copied next to the notebook.
predictor = ktrain.load_predictor('roberta-v4')

In [None]:
# Class probabilities and predicted labels for the held-out texts. Each call
# makes its own prediction pass over x_test.
roberta_predictions = predictor.predict_proba(x_test.values)
robert_predictions_labels = predictor.predict(x_test.values)

In [None]:
# Report the correct-prediction count and per-class metrics for the loaded
# RoBERTa predictor on the test split.
get_accuracy(robert_predictions_labels, y_test.values)

In [None]:
# Encoder used by the cells below to turn class indices into label names and
# label names into integer targets.
# NOTE(review): those cells feed np.argmax of predict_proba output into
# le.inverse_transform, which assumes the encoder's class order matches the
# column order of each model's probabilities — confirm.
le = preprocessing.LabelEncoder()
le.fit(classes)

### Creating Ensemble 

In [None]:
from mlxtend.classifier import EnsembleVoteClassifier
# Equal-weight soft-voting ensemble of the SVM pipeline (`pipe`) and the
# transformer predictor.
# NOTE(review): refit=False is presumably meant to reuse the already-trained
# members instead of fitting them again — confirm against the installed
# mlxtend version.
# NOTE(review): `predictor` is a ktrain predictor rather than a scikit-learn
# estimator — confirm the ensemble can call it with the same inputs as `pipe`.
ensemble = EnsembleVoteClassifier(clfs=[pipe, predictor], weights=[1, 1], voting='soft', refit=False)
ensemble.fit(x_train, y_train)
# Predictions are requested on the raw values of the test split.
ensmbl_preds = ensemble.predict(x_test.values)

In [None]:
# Report the correct-prediction count and per-class metrics for the ensemble
# on the test split.
get_accuracy(ensmbl_preds, y_test.values)

In [None]:
# Inspect a single held-out sample: print its text, then the top class and
# its probability according to RoBERTa and to the SVM.
index = 9
# NOTE(review): the offset of 3 presumably maps an external row number onto a
# 0-based position in x_test — confirm.
actual_index = index - 3
# NOTE: this rebinds the global name `text`, shadowing the `ktrain.text`
# module until it is imported again.
text = x_test.iloc[actual_index]
print("Text: ", text)

# The class index must come from the full probability row. Taking np.argmax
# of the already-reduced maximum (a scalar) always yields 0 and therefore
# always reports the first class.
roberta_probs = roberta_predictions[actual_index]
roberta_pred_prob = max(roberta_probs)
roberta_pred = le.inverse_transform([np.argmax(roberta_probs)])[0]

print("\nRoberta prediction:", roberta_pred, "\nProbability:", roberta_pred_prob)


# `predicted` is defined in another cell; presumably the SVM pipeline's class
# probabilities for x_test — confirm.
svm_probs = predicted[actual_index]
svm_pred_prob = max(svm_probs)
svm_pred = le.inverse_transform([np.argmax(svm_probs)])[0]

print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)

### Comparing SVM and RoBERTa

In [None]:
# Walk over a dataset, query both models sample by sample, and print every
# case selected by the filter below. Also accumulates each model's top-class
# probability so the mean confidence can be reported at the end.
avg_svm = 0        # running sum of the SVM's top-class probabilities
avg_roberta = 0    # running sum of RoBERTa's top-class probabilities
instances = 0      # number of samples matched by the filter

dataset = x_train
labels = y_train

n_samples = dataset.shape[0]

for i in range(n_samples):
    # NOTE: rebinds the global name `text`, shadowing the `ktrain.text`
    # module until it is imported again.
    text = dataset.iloc[i]

    roberta_pred_prob_arr = predictor.predict_proba([text])[0]
    roberta_pred_prob = max(roberta_pred_prob_arr)
    roberta_pred = le.inverse_transform([np.argmax(roberta_pred_prob_arr)])[0]
    avg_roberta += roberta_pred_prob

    svm_pred_prob_arr = pipe.predict_proba([text])[0]
    svm_pred_prob = max(svm_pred_prob_arr)
    svm_pred = le.inverse_transform([np.argmax(svm_pred_prob_arr)])[0]
    avg_svm += svm_pred_prob

    # Active filter: SVM correct, RoBERTa wrong.
    # Alternative filters (swap the condition to use one of them):
    #   RoBERTa correct, SVM wrong:
    #     svm_pred != roberta_pred and labels.iloc[i] == roberta_pred
    #   Both models wrong:
    #     svm_pred != labels.iloc[i] and roberta_pred != labels.iloc[i]
    if (svm_pred != roberta_pred and labels.iloc[i] == svm_pred):
        print("Text: ", text)
        print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)
        print("\nRoberta prediction:", roberta_pred, "\nProbability:", roberta_pred_prob)

        print(colored('\nMismatch', 'red', attrs=['bold']))
        print(colored('Label: ' + str(labels.iloc[i]), 'green'))
        instances += 1

        print("--" * 20)

# The reported figures are mean top-class probabilities (model confidence),
# not accuracies, so they are labelled as such. An empty dataset reports 0.0
# instead of raising ZeroDivisionError.
mean_svm = avg_svm / n_samples if n_samples else 0.0
mean_roberta = avg_roberta / n_samples if n_samples else 0.0

print(colored("\nAverage prediction confidence (SVM): " + str(mean_svm), 'green', attrs=['bold']))
print(colored("Average prediction confidence (Roberta): " + str(mean_roberta), 'green', attrs=['bold']))
print(colored("Total Instances: " + str(instances), 'blue', attrs=['bold']))

### Manually Scaling Probabilities

In [None]:
# Compare the stacked logistic-regression model (`lr_out`) with the voting
# ensemble on every test sample and print the samples where they disagree.
# `lr_out` is trained in the "Training LRegression Model" cell, which must
# have been run before this one.
wrong = 0      # samples where the LR model and the ensemble disagree
correct = 0    # samples where they agree

dataset = x_test
labels = y_test

for i in range(0, dataset.shape[0]):
    print("--" * 20)

    label = labels.iloc[i]
    # NOTE: rebinds the global name `text`, shadowing the `ktrain.text`
    # module until it is imported again.
    text = dataset.iloc[i]

    # Meta-features: each model's class probabilities followed by the index
    # of its top class, in the same layout used to train `lr_out`.
    roberta_pred_prob_arr = predictor.predict_proba([text])[0]
    roberta_pred_prob = max(roberta_pred_prob_arr)
    roberta_pred = le.inverse_transform([np.argmax(roberta_pred_prob_arr)])[0]
    roberta_vals = list(roberta_pred_prob_arr)
    roberta_vals.append(np.argmax(roberta_pred_prob_arr))

    svm_pred_prob_arr = pipe.predict_proba([text])[0]
    svm_pred_prob = max(svm_pred_prob_arr)
    svm_pred = le.inverse_transform([np.argmax(svm_pred_prob_arr)])[0]
    svm_vals = list(svm_pred_prob_arr)
    svm_vals.append(np.argmax(svm_pred_prob_arr))

    pred_row = roberta_vals + svm_vals
    # predict() returns one value per input row; unwrap the single result so
    # that inverse_transform receives a flat list and the comparison below is
    # between two scalars rather than an array and a scalar.
    lr_pred = lr_out.predict([pred_row])[0]
    lr_pred_label = le.inverse_transform([lr_pred])[0]

    if lr_pred_label != ensmbl_preds[i]:
        print(colored('Mismatch', 'red', attrs=['bold']))
        wrong += 1

        # str() keeps the concatenation valid for non-string values.
        print(colored('\nText: ' + str(text), 'magenta'))
        print(colored('Label: ' + str(label), 'blue'))
        print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)
        print("\nRoberta prediction:", roberta_pred, "\nProbability:", roberta_pred_prob)
        print("\nEnsemble prediction:", ensmbl_preds[i])
        print("LR prediction:", lr_pred_label)
    else:
        # Previously never counted, so "Total Correct" always printed 0.
        correct += 1


print(colored("\nTotal Wrong: " + str(wrong), 'red', attrs=['bold']))
print(colored("Total Correct: " + str(correct), 'green', attrs=['bold']))

### Preparing Dataset for Logistic Regression

In [None]:
def _stacked_features(sample):
    """Return one meta-feature row for a single text.

    The row holds RoBERTa's class probabilities followed by the index of its
    top class, then the same values from the SVM pipeline.
    """
    row = []
    for clf in (predictor, pipe):
        probs = clf.predict_proba([sample])[0]
        row.extend(probs)
        row.append(np.argmax(probs))
    return row


# Build the training set for the stacking model: one meta-feature row and one
# integer-encoded target per training sample.
dataset = x_train
labels = y_train

lr_x_train = []
lr_y_train = []

for pos in range(dataset.shape[0]):
    target = le.transform([labels.iloc[pos]])[0]
    lr_x_train.append(_stacked_features(dataset.iloc[pos]))
    lr_y_train.append(target)
   

### Training Logistic Regression Model

In [None]:
# Fit the stacking model on the meta-features collected from RoBERTa and the
# SVM pipeline (`lr_x_train`) against the integer-encoded labels
# (`lr_y_train`).
lr_x_train_pd = pd.DataFrame(lr_x_train)
lr_y_train_pd = pd.DataFrame(lr_y_train)


lr_out = LogisticRegression()
# scikit-learn expects a 1-D target. Passing the single-column DataFrame
# directly hands it an (n_samples, 1) column vector, so flatten it here while
# keeping `lr_y_train_pd` itself unchanged for other cells.
lr_out.fit(lr_x_train_pd, lr_y_train_pd.values.ravel())