## Imports

In [1]:
import pandas as pd
import numpy as np
import string
import random
from termcolor import colored
from collections import defaultdict 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
import skipthoughts
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
import spacy
import unidecode
from sklearn import preprocessing
import ktrain
from ktrain import text

## Tokenizer

In [12]:
tok_nlp = spacy.load('en_core_web_sm')

def tokenizer(sentence):
    sentence = unidecode.unidecode(sentence.lower())
    mytokens = []
    sentence_len = len(sentence.split(' '))
    
    for token in tok_nlp(sentence):
        if (
            not token.is_stop and 
            not token.is_punct and
#             not token.pos_ == 'PROPN' and 
            not token.is_space
        ):
            mytokens.append(token.lemma_.strip())
    
    return ' '.join(mytokens)

In [19]:
def tokenizer(sentence):
    sentence = unidecode.unidecode(sentence.lower())
    mytokens = []
    for token in tok_nlp(sentence):
        if (not token.is_punct and not token.is_space):
            lemma = token.lemma_
            if (lemma == '-PRON-'):
                mytokens.append(token.text.strip())
            else:
                mytokens.append(token.lemma_.strip())
    
    return ' '.join(mytokens)

In [20]:
tokenizer("how are you doing apple?")

'how be you do apple'

## Helper Fuctions

In [3]:
def get_accuracy(preds, labels):
    print("Accuracy:", metrics.accuracy_score(y_true=labels, y_pred=preds ,normalize=False))
    print(metrics.classification_report(y_true=labels, y_pred=preds))
    
def save_model_results(preds):
    pd.DataFrame(preds).to_excel("temp.xlsx")

## Reading Train and Test Data

**Random_State = 45**

In [4]:
df = pd.read_excel('Training Phrases.xlsx', sheet_name='Testing of Models', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]
df['Text'] = df['Text'].apply(tokenizer)

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

test_emails_df = pd.read_excel('Training Phrases.xlsx', sheet_name='Manual Test cases', nrows=23)
test_emails_df = test_emails_df.drop(test_emails_df.index[0])
test_emails_df.columns = ["Label", "Email"]
test_emails_df['Email'] = test_emails_df['Email'].apply(tokenizer)
test_emails_df['Label'] = test_emails_df['Label'].apply(lambda label: '_'.join(label.lower().split(' ')))

In [5]:
classes = df.Label.unique()
print("Total classes: ", len(classes))
df.Label.value_counts()

Total classes:  21


job_alerts                    45
howto_apply                   40
application_status            31
job_close_date                30
salary                        30
multiple_role                 27
feedback                      26
assessment_campatilibility    24
interview_reschedule          23
disability                    22
reinstate_application         19
job_account_issue             19
assessment_timebox            16
assessment_link_problem       16
age_limit                     16
cv_past_experience            15
interview_response_time       13
late_for_interview            13
work_experience               11
assessment_validity           10
special_needs_at_work          9
Name: Label, dtype: int64

## Reformatting Data to Reduce Classes

In [33]:
new_df = pd.DataFrame()

for _, value in df['Label'].items():
    if "assessment" in value:
        new_df = new_df.append({'Label': "assessment"}, ignore_index=True)
    elif "interview" in value:
        new_df = new_df.append({'Label': "interview"}, ignore_index=True)
    elif (
        "job" in value or 
        value == "cv_past_experience" or
        value == "application_status" or
        value == "howto_apply" or
        value == "feedback" or
        value == "multiple_role" or
        value == "reinstate_application"
    ):
        new_df = new_df.append({'Label': "job_application"}, ignore_index=True)
    else:
        new_df = new_df.append({'Label': "job_details"}, ignore_index=True)
        
new_df.to_excel("4 Classes.xlsx")

### Assessment Train Data

In [71]:
new_df = pd.DataFrame()

for _, value in df['Label'].items():
    if "assessment" in value:
        new_df = new_df.append({'Label': value}, ignore_index=True)
    else:
        new_df = new_df.append({'Label': "other"}, ignore_index=True)
        
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Assessment Train.xlsx")

### Interview Train Data

In [72]:
new_df = pd.DataFrame()

for _, value in df['Label'].items():
    if "interview" in value:
        new_df = new_df.append({'Label': value}, ignore_index=True)
    else:
        new_df = new_df.append({'Label': "other"}, ignore_index=True)
        
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Interview Train.xlsx")

### Job Details Train Data

In [74]:
new_df = pd.DataFrame()

for _, value in df['Label'].items():
    if (
        "job" in value or 
        value == "cv_past_experience" or
        value == "application_status" or
        value == "howto_apply" or
        value == "feedback" or
        value == "multiple_role" or
        value == "reinstate_application"
    ):
        new_df = new_df.append({'Label': value}, ignore_index=True)
    else:
        new_df = new_df.append({'Label': "other"}, ignore_index=True)
 
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Job Detail Train.xlsx")

### Job Application Train Data

In [75]:
new_df = pd.DataFrame()

for _, value in df['Label'].items():
    if (
        value == "salary" or
        value == "work_experience" or
        value == "age_limit" or
        value == "special_needs_at_work" or
        value == "disability"
    ):
        new_df = new_df.append({'Label': value}, ignore_index=True)
    else:
        new_df = new_df.append({'Label': "other"}, ignore_index=True)
        
to_write = pd.concat([df['Text'].reset_index(drop=True), new_df['Label'].reset_index(drop=True)], axis=1)
to_write.to_excel("Job Application Train.xlsx")

## To-Do

1. Remove salutations from emails
1. Random Forests
1. Ensure word embeddings are correct
1. Test and Train accuracies should be similar to avoid overfitting
1. Reduce the test set

## Sklearn

### Saving Model Output In Excel File

In [43]:
# Generating mapping column
a = pd.DataFrame(preds)
b = pd.DataFrame(df['Label'])
a.columns = ['Label']
b = b.reset_index(drop=True)
a = a.reset_index(drop=True)

(a==b).to_excel("temp.xlsx")

In [16]:
pd.DataFrame(robert_predictions_labels).to_excel("preds.xlsx")

### Training and Testing

#### Multi Stage Models

In [76]:
df = pd.read_excel('Training Phrases - 4 Classes.xlsx', sheet_name='Testing of Models', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]
df['Text'] = df['Text'].apply(tokenizer)

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

outer_pipe = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

outer_pipe.fit(x_train, y_train)
predicted = outer_pipe.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

Accuracy: 84
                 precision    recall  f1-score   support

     assessment       0.91      0.91      0.91        11
      interview       1.00      0.88      0.93        16
job_application       0.88      0.98      0.92        44
    job_details       1.00      0.85      0.92        20

       accuracy                           0.92        91
      macro avg       0.95      0.90      0.92        91
   weighted avg       0.93      0.92      0.92        91



In [None]:
total_correct = 0

for i in range(0, x_test.shape[0]):
    v = x_test.iloc[i]
    main_class = outer_pipe.predict([v])
    pred = None
    
    if (main_class == 'job_application'):
        pred = job_app_model.predict([v])[0]
    
    elif (main_class == 'interview'):
        pred = interview_model.predict([v])[0]
    
    elif (main_class == 'job_details'):
        pred = job_detail_model.predict([v])[0]
    
    elif (main_class == 'assessment'):
        pred = assessment_model.predict([v])[0]
    
    if y_test.iloc[i] == pred:
        total_correct += 1
    else:
        print ("\nConfused:", v, "\nActual:", y_test.iloc[i], "\nPredicted:", pred)
      
print("\nACCURACY: ", total_correct / x_test.shape[0])

In [77]:
df = pd.read_excel('Assessment Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

assessment_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

assessment_model.fit(x_train, y_train)
predicted = assessment_model.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

Accuracy: 87
                            precision    recall  f1-score   support

assessment_campatilibility       0.67      0.50      0.57         4
   assessment_link_problem       1.00      0.50      0.67         2
        assessment_timebox       0.67      1.00      0.80         2
       assessment_validity       1.00      0.50      0.67         2
                     other       0.98      1.00      0.99        81

                  accuracy                           0.96        91
                 macro avg       0.86      0.70      0.74        91
              weighted avg       0.96      0.96      0.95        91



In [78]:
df = pd.read_excel('Interview Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

interview_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

interview_model.fit(x_train, y_train)
predicted = interview_model.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

Accuracy: 86
                         precision    recall  f1-score   support

   interview_reschedule       0.86      0.60      0.71        10
interview_response_time       1.00      1.00      1.00         3
     late_for_interview       0.75      1.00      0.86         3
                  other       0.96      0.99      0.97        75

               accuracy                           0.95        91
              macro avg       0.89      0.90      0.88        91
           weighted avg       0.94      0.95      0.94        91



In [79]:
df = pd.read_excel('Job Application Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

job_app_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

job_app_model.fit(x_train, y_train)
predicted = job_app_model.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

Accuracy: 86
                       precision    recall  f1-score   support

            age_limit       0.67      0.67      0.67         3
           disability       0.83      0.71      0.77         7
                other       0.96      1.00      0.98        70
               salary       1.00      0.86      0.92         7
special_needs_at_work       1.00      0.50      0.67         2
      work_experience       1.00      1.00      1.00         2

             accuracy                           0.95        91
            macro avg       0.91      0.79      0.83        91
         weighted avg       0.94      0.95      0.94        91



In [80]:
df = pd.read_excel('Job Detail Train.xlsx', nrows=456)
df = df.drop(df.index[0])
df = df[df.columns[[1, 2]]]
df.columns = ["Text", "Label"]

x_train, x_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.20)
x_test = x_test.sort_index()
y_test = y_test.sort_index()

job_detail_model = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True))
    ]
)

job_detail_model.fit(x_train, y_train)
predicted = job_detail_model.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=predicted, normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=predicted))

Accuracy: 76
                       precision    recall  f1-score   support

   application_status       0.62      0.83      0.71         6
   cv_past_experience       0.50      1.00      0.67         1
             feedback       0.67      0.29      0.40         7
          howto_apply       0.67      0.50      0.57         8
    job_account_issue       1.00      1.00      1.00         1
           job_alerts       0.80      0.89      0.84         9
       job_close_date       1.00      1.00      1.00         5
        multiple_role       0.62      0.83      0.71         6
                other       0.94      0.94      0.94        47
reinstate_application       1.00      1.00      1.00         1

             accuracy                           0.84        91
            macro avg       0.78      0.83      0.78        91
         weighted avg       0.84      0.84      0.83        91



#### Single Model

In [6]:
classifier = SVC(C=150, gamma=0.02, probability=True)

pipe = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ('tfid',  TfidfVectorizer(ngram_range=(1, 3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", SVC(C=150, gamma=0.02, probability=True, class_weight='balanced'))
    ])

pipe.fit(x_train, y_train)
predicted = pipe.predict_proba(x_test)

print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=pipe.predict(x_test), normalize=False))
print(metrics.classification_report(y_true=y_test, y_pred=pipe.predict(x_test)))

NameError: name 'SpacyVectorTransformer' is not defined

#### Testing on Emails

1. SVC = 6/21
1. roberta-untok = 11/21
1. roberta-large = 9/21
1. roberta-v4 = 13/21

In [18]:
email_preds = predictor.predict(test_emails_df['Email'].values)
print("Accuracy:", metrics.accuracy_score(y_true=test_emails_df['Label'].values, y_pred=email_preds, normalize=False))
print(metrics.classification_report(y_true=test_emails_df['Label'].values, y_pred=email_preds))

Accuracy: 13
                            precision    recall  f1-score   support

                 age_limit       1.00      1.00      1.00         1
        application_status       1.00      1.00      1.00         1
  assessment_campatibility       0.00      0.00      0.00         1
assessment_campatilibility       0.00      0.00      0.00         0
           assessment_link       0.00      0.00      0.00         1
       assessment_time_box       0.00      0.00      0.00         1
        assessment_timebox       0.00      0.00      0.00         0
       assessment_validity       1.00      1.00      1.00         1
        cv_past_experience       1.00      1.00      1.00         1
                disability       1.00      1.00      1.00         1
                  feedback       1.00      1.00      1.00         1
              how_to_apply       0.00      0.00      0.00         1
               howto_apply       0.00      0.00      0.00         0
      interview_reschedule       1

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


### Extensive Model Comparison

In [68]:
total_a = 0
total_b = 0

total_iters = 10
for i in range(0, total_iters):
    print("\nIteration: ", i)
    df = pd.read_excel('Training Phrases.xlsx', sheet_name='Testing of Models', nrows=456)
    df = df.drop(df.index[0])
    df = df[df.columns[[1, 2]]]
    df.columns = ["Text", "Label"]
    df['Text'] = df['Text'].apply(tokenizer)
    x_train, x_test, y_train, y_test = train_test_split(df["Text"], df["Label"], random_state=i, test_size=0.2)
    
#     vectorizer = TfidfVectorizer(ngram_range=(1,3)) 
#     classifier = SVC(C=150, gamma=0.02, probability=True)
#     pipe = Pipeline([('vectorizer', vectorizer),
#                      ('classifier', classifier)])
                     

#     pipe.fit(x_train, y_train)
    predicted = ensemble.predict(x_test.values)
    accuracy_a = metrics.accuracy_score(y_test, predicted)
    total_a += accuracy_a
    print("Accuracy A:", accuracy_a)
    
#     vectorizer = TfidfVectorizer(ngram_range=(1,3)) 
    classifier = SVC(C=150, gamma=0.02, probability=True)
    pipe = Pipeline(
    steps=[
        ("combined_features", FeatureUnion(
                transformer_list=[
                    ("tfid", TfidfVectorizer(ngram_range=(1,3))),
                    ("embed", SpacyVectorTransformer(nlp)),
                ]
        )),
        ("classifier", classifier),
        ]
    )
    pipe.fit(x_train, y_train)
    predicted = pipe.predict(x_test)
    
    accuracy_b = metrics.accuracy_score(y_test, predicted)
    total_b += accuracy_b
    print("Accuracy B:", accuracy_b)

print("\nAverage Accuracy A:", total_a/total_iters)
print("Average Accuracy B:", total_b/total_iters)


Iteration:  0


Accuracy A: 0.945054945054945
Accuracy B: 0.8021978021978022

Iteration:  1


Accuracy A: 0.9340659340659341
Accuracy B: 0.8021978021978022

Iteration:  2


Accuracy A: 0.945054945054945
Accuracy B: 0.7912087912087912

Iteration:  3


KeyboardInterrupt: 

### spaCy Vectorizer

In [None]:
import spacy 
from sklearn.base import BaseEstimator, TransformerMixin
nlp = spacy.load("en_core_web_lg")  

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [self.nlp(text).vector for text in X]

### Embeddings Vectorizer

In [17]:
class GloveTransformer(TransformerMixin):
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):    
        return [len([tok for tok in tok_nlp(doc) if tok.pos_ == u'VERB']) for doc in X]
    
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    
    
glove_transformer = GloveTransformer()

## spaCy 

In [48]:
def create_cat_dict(label):
    mydict = {}
    for class_ in classes:
        mydict[class_] = (class_ == label)
        
    return mydict


def get_preds(texts, labels, output=False):
    total_correct = 0
    wrong_preds = defaultdict(int)
    preds = []
    
    for test_phrase in zip(texts, labels):
        phrase = test_phrase[0]
        label = [key for (key, value) in test_phrase[1].items() if value == True][0]
        pred_cats = nlp(phrase).cats
        pred = max(pred_cats, key=pred_cats.get)
        preds.append(pred)
        
        if (label == pred):
            total_correct += 1
            
        else:
            wrong_preds[label] += 1
            
            if (output):
                print("\n", "-"*15, "\nPhrase:", phrase)
                print("Label:", label)
                print("Prediction:", pred)
        
    return (total_correct, wrong_preds, preds)


def load_data(split=0.80, tok=False):
    if tok:
        df['tuples'] = df.apply(lambda row: (tokenizer(row['Text']), row['Label']), axis=1)
    else:
        df['tuples'] = df.apply(lambda row: (row['Text'], row['Label']), axis=1)
    
    train_data = df['tuples'].tolist()
#     random.shuffle(train_data)
    texts, labels = zip(*train_data)
    cats = [create_cat_dict(y) for y in labels]
    split = int(len(train_data) * split)
    
#     return (texts[:split], cats[:split]), (texts[split:], cats[split:])
    return (texts, cats)

In [50]:
# (train_texts, train_cats), (dev_texts, dev_cats) = load_data(tok=True)
(texts, cats) = load_data(tok=True)

1                           (sign job alert, job_alerts)
2                 (tell job alert regularly, job_alerts)
3                 (want subscribe job alert, job_alerts)
4      (want receive new job notification email sign,...
5              (learn late job opening boot, job_alerts)
                             ...                        
451              (day care child, special_needs_at_work)
452    (animal stay working hour, special_needs_at_work)
453                        (care, special_needs_at_work)
454    (information workplace accommodation, special_...
455    (hi look apply role store wheelchair let know ...
Name: tuples, Length: 455, dtype: object
('sign job alert', 'tell job alert regularly', 'want subscribe job alert', 'want receive new job notification email sign', 'learn late job opening boot', 'want know late job', 'sign new job opening', 'email alert', 'weekly email job alert', 'monthly email job alert', '', 'job particular job title', 'job alert sector wise', 'mentio

In [108]:
from __future__ import unicode_literals, print_function
import plac
import random
from pathlib import Path

from spacy.util import minibatch, compounding


# Parameters
n_iter = 40
drop = 0.15
architecture = "simple_cnn"


# for i in range(0,5):
(train_texts, train_cats), (dev_texts, dev_cats) = load_data_v2()

nlp = spacy.blank("en")  
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": architecture}
)
nlp.add_pipe(textcat, last=True)

for label in classes:
    textcat.add_label(label)

train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
with nlp.disable_pipes(*other_pipes):  
    optimizer = nlp.begin_training()
    print("Training the model...")
    batch_sizes = compounding(4.0, 32.0, 1.001)
    for i in range(n_iter):
        losses = {}
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_sizes)
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=drop, losses=losses)

# Model Results

train_total_correct, train_inaccuracies, train_preds = get_preds(train_texts, train_cats)
test_total_correct, test_inaccuracies, test_preds = get_preds(dev_texts, dev_cats)

total_correct = test_total_correct + train_total_correct
total_rows = len(dev_texts) + len(train_texts)

print("\n[TEST SET RESULTS]\n",
      "   \nTotal Correct:", test_total_correct, 
      "   \nTotal Wrong:", len(dev_texts)-test_total_correct, 
      "   \nTEST-SET ACCURACY: ", test_total_correct/len(dev_texts),
      "   \nIncorrect Predictions:\n ", test_inaccuracies
     )

print("\n[TRAIN SET RESULTS]\n",
      "   \nTotal Correct:", train_total_correct, 
      "   \nTotal Wrong:", len(train_texts)-train_total_correct, 
      "   \nTRAIN-SET ACCURACY: ", train_total_correct/len(train_texts),
      "   \nIncorrect Predictions:\n ", train_inaccuracies
     )

print("\n[OVERALL RESULTS]\n",
      "   \nTotal Correct:", total_correct, 
      "   \nTotal Wrong:", total_rows-total_correct,
      "   \nOVERALL ACCURACY: ", total_correct/total_rows)



[TEST SET RESULTS]
    
Total Correct: 60    
Total Wrong: 31    
TEST-SET ACCURACY:  0.6593406593406593    
Incorrect Predictions:
  defaultdict(<class 'int'>, {'job_alerts': 3, 'howto_apply': 2, 'multiple_role': 1, 'cv_past_experience': 1, 'application_status': 1, 'feedback': 5, 'assessment_link_problem': 2, 'assessment_campatilibility': 1, 'assessment_validity': 2, 'disability': 2, 'interview_reschedule': 3, 'late_for_interview': 2, 'age_limit': 1, 'salary': 2, 'special_needs_at_work': 3})

[TRAIN SET RESULTS]
    
Total Correct: 364    
Total Wrong: 0    
TRAIN-SET ACCURACY:  1.0    
Incorrect Predictions:
  defaultdict(<class 'int'>, {})

[OVERALL RESULTS]
    
Total Correct: 424    
Total Wrong: 31    
OVERALL ACCURACY:  0.9318681318681319


## Data Augmentation

In [5]:
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
fast_aug = naw.WordEmbsAug(
    model_type='fasttext', model_path='./models/wiki-news-300d-1M.vec',
    action="substitute")

w2c_aug = naw.WordEmbsAug(
    model_type='word2vec', model_path='./models/GoogleNews-vectors-negative300.bin',
    action="substitute")

text = "What is the process of applying for this job?"
print("Original:", text)
print("Augmented (fasttext):", fast_aug.augment(text) )
print("Augmented (w2v):", w2c_aug.augment(text) )

In [9]:
aug = naw.ContextualWordEmbsAug(
    model_path='distilbert-base-uncased', action="substitute")
augmented_text = aug.augment(text)
print("Original:")
print(text)
print("Augmented Text:")
print(augmented_text)

ImportError: Missed torch, transformers libraries. Install it via `pip install torch transformers`

## Ludwig

In [79]:
from ludwig.api import LudwigModel
import logging

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



**model_definition = {'input_features': [{'name': 'Text', 'type': 'text'}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}]}**
                    
 56%
 
 **model_definition = {'input_features': [{'name': 'Text', 'type': 'text', 'encoder': 'rnn'}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}],
                    'training': {'epochs': 25}}**
                    
 30%
 
 **model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}],
                    'training': {'epochs': 25}}**
                    
48%

**model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}]}**
                    
51%

In [106]:
data = pd.read_csv("Train.csv")
data = data.dropna()
df['Text'] = df['Text'].apply(tokenizer)

data.Label.value_counts()
training_dataframe, validation_dataframe = train_test_split(data,
                                                      test_size=0.2,  
                                                      random_state=42
                                                      )
validation_dataframe.reset_index(inplace=True)

model_definition = {'input_features': [{'name': 'Text', 'type': 'text', "level": "word", "dropout": True}], 
                    'output_features': [{'name': 'Label', 'type': 'category'}], 'training': {'epochs':50}}

model = LudwigModel(model_definition)

training_stats = model.train(training_dataframe, logging_level=logging.INFO)

In [107]:
predictions_dataframe = model.predict(validation_dataframe)
total_correct = 0

for i in range(0, validation_dataframe.shape[0]):
    
    if predictions_dataframe.Label_predictions[i] == validation_dataframe.Label[i]:
        total_correct += 1
#     else:
#         print("\n\nPhrase: ", validation_dataframe.Text[i])
#         print("Prediction: ", predictions_dataframe.Label_predictions[i])
#         print("Label: ", validation_dataframe.Label[i])
    
print("\n\nACCURACY: ", total_correct/validation_dataframe.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessing_parameters['fill_value'],




ACCURACY:  0.5164835164835165


## FastText

In [178]:
import fasttext

In [214]:
data = pd.read_csv("Train.csv")
data = data.dropna()
training_df, validation_df = train_test_split(data, test_size=0.20, random_state=45)

def create_file(df, filename):
    file = open(filename +'.txt', "w")

    for index, row in df.iterrows():
        label = row['Label'].replace("_", "-")
        text = row['Text'].replace('\r', '').replace('\n', '')
        text = tokenizer(text)

        line = '\n__label__' + label + ' ' + text
        file.write(line)

    file.close()
    
create_file(training_df, 'fasttext-train')
create_file(validation_df, 'fasttext-val')

In [174]:
model = fasttext.train_supervised(input="fasttext-train.txt", autotuneValidationFile='fasttext-val.txt')                            

In [217]:
total_correct = 0
preds = []
for index, row in data.iterrows():
    text = tokenizer(row['Text'].replace('\r', '').replace('\n', ''))
    text = row['Text'].replace('\r', '').replace('\n', '')
    label = row['Label'].replace("_", "-")
    pred = model.predict(text)[0][0][9:]
 
    
    if label == pred:
        total_correct += 1
#         print("\n\nPhrase: ", text)
#         print("Prediction: ", pred)
#         print("Label: ", label)
    
#     else:
#         print("\n\nPhrase: ", text)
#         print("Prediction: ", pred)
#         print("Label: ", label)

    pred = pred.replace('-', '_')
    preds.append(pred)
        
print("\n\nACCURACY: ", total_correct/data.shape[0])



ACCURACY:  0.9164835164835164


## Model Parameter Optimization

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [41]:
df = pd.read_csv("Train.csv")
df = df.dropna()
df['Text'] = df['Text'].apply(tokenizer)

In [60]:
def Vectorize(vec, X_train, X_test):    
    X_train_vec = vec.fit_transform(X_train)
    X_test_vec = vec.transform(X_test)
    
    print('Vectorization complete.\n')
    return X_train_vec, X_test_vec


def ML_modeling(models, params, X_train, X_test, y_train, y_test):    
    
    if not set(models.keys()).issubset(set(params.keys())):
        raise ValueError('Some estimators are missing parameters')

    for key in models.keys():
    
        model = models[key]
        param = params[key]
        gs = RandomizedSearchCV(model, param, cv=5, error_score=0, refit=True)
        gs.fit(X_train, y_train)
        y_pred = gs.predict(X_test)
        
        # Print scores for the classifier
        print(key, ':', gs.best_params_)
        print("Precision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n" % (precision_score(y_test, y_pred, average='macro'), 
                                                                    recall_score(y_test, y_pred, average='macro'), 
                                                                    f1_score(y_test, y_pred, average='macro')))
    
models = {
    'Model': RandomForestClassifier()
}

params = {
    'Model': { 
           'n_estimators': [200, 300, 400, 500],
            'max_features': ['auto', 'sqrt', 'log2'],
            'max_depth' : [4,5,6,7,8],
            'criterion' :['gini', 'entropy']
       },
}

# Encode label categories to numbers
enc = LabelEncoder()
df['Label'] = enc.fit_transform(df['Label'])
labels = list(enc.classes_)

# Train-test split and vectorize
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Label'], random_state=45, test_size=0.2)
X_train_vec, X_test_vec = Vectorize(TfidfVectorizer(ngram_range=(1,3)), X_train, X_test)

ML_modeling(models, params, X_train_vec, X_test_vec, y_train, y_test)

Vectorization complete.

Model : {'n_estimators': 500, 'max_features': 'sqrt', 'max_depth': 8, 'criterion': 'entropy'}
Precision: 0.615 	Recall: 0.568 		F1: 0.543



In [172]:
params = {
    "combined_features__bow__tfidf__use_idf": [True, False],
    "combined_features__bow__tfidf__ngram_range": [(1, 1), (1, 2)],
    "classifier__bootstrap": [True, False],
    "classifier__class_weight": ["balanced", None],
    "classifier__n_estimators": [100, 300, 500, 800, 1200],
    "classifier__max_depth": [5, 8, 15, 25, 30],
    "classifier__min_samples_split": [2, 5, 10, 15, 100],
    "classifier__min_samples_leaf": [1, 2, 5, 10]
}
search = RandomizedSearchCV(pipe, params)
search.fit(x_train, y_train)
y_pred = search.predict(x_test)
classification_report(y_test, y_pred)

ValueError: Invalid parameter bow for estimator FeatureUnion(transformer_list=[('tfid', TfidfVectorizer(ngram_range=(1, 3))),
                               ('embed',
                                SpacyVectorTransformer(nlp=<spacy.lang.en.English object at 0x0000026866BBFE48>))]). Check the list of available parameters with `estimator.get_params().keys()`.

## Keras

In [22]:
import numpy as np
np.random.seed(42)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from keras.models import Model
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import warnings
warnings.filterwarnings('ignore')

In [75]:
EMBEDDING_FILE = 'models/glove.6B.300d.txt'

In [76]:
max_features = 700
maxlen = 70
embed_size = 300
threshold = 0.35

In [77]:
keras_tokenizer = text.Tokenizer(num_words=max_features)
keras_tokenizer.fit_on_texts(list(x_train) + list(x_test))
x_train = keras_tokenizer.texts_to_sequences(x_train)
x_test = keras_tokenizer.texts_to_sequences(x_test)
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

In [78]:
def get_coefs(word, *arr): 
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf8"))

In [79]:
word_index = keras_tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
        embedding_matrix[i] = embedding_vector

In [80]:
class F1Evaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            y_pred = (y_pred > threshold).astype(int)
            score = f1_score(self.y_val, y_pred)
            print("\n F1 Score - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [81]:
filter_sizes = [1,2,3,5]
num_filters = 42

def get_model():    
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
#    x = SpatialDropout1D(0.4)(x)
    x = Reshape((maxlen, embed_size, 1))(x)
    
    conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embed_size),
                                 kernel_initializer='he_normal', activation='tanh')(x)
    conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embed_size),
                                 kernel_initializer='he_normal', activation='tanh')(x)
    conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embed_size), 
                                 kernel_initializer='he_normal', activation='tanh')(x)
    conv_3 = Conv2D(num_filters, kernel_size=(filter_sizes[3], embed_size),
                                 kernel_initializer='he_normal', activation='tanh')(x)
    
    maxpool_0 = MaxPool2D(pool_size=(maxlen - filter_sizes[0] + 1, 1))(conv_0)
    maxpool_1 = MaxPool2D(pool_size=(maxlen - filter_sizes[1] + 1, 1))(conv_1)
    maxpool_2 = MaxPool2D(pool_size=(maxlen - filter_sizes[2] + 1, 1))(conv_2)
    maxpool_3 = MaxPool2D(pool_size=(maxlen - filter_sizes[3] + 1, 1))(conv_3)
        
    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2, maxpool_3])   
    z = Flatten()(z)
    z = Dropout(0.1)(z)
        
    outp = Dense(1, activation="sigmoid")(z)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model

model = get_model()

In [82]:
batch_size = 256
epochs = 2

F1_Score = F1Evaluation(validation_data=(x_test, y_test), interval=1)

hist = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                 validation_data=(x_test, y_test),
                 callbacks=[F1_Score], verbose=2)


Train on 364 samples, validate on 91 samples
Epoch 1/2


ValueError: could not convert string to float: 'assessment_link_problem'

## KTrain

### Training Predictor

In [27]:
import ktrain
from ktrain import text

MODEL_NAME = 'roberta-large' 
t = text.Transformer(MODEL_NAME, classes=classes)
trn = t.preprocess_train(x_train.values, y_train.values)
val = t.preprocess_test(x_test.values, y_test.values)
model = t.get_classifier()
learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=16)
learner.fit_onecycle(5e-5, 6)



preprocessing train...
language: en
train sequence lengths:
	mean : 7
	95percentile : 21
	99percentile : 41




Is Multi-Label? False
preprocessing test...
language: en
test sequence lengths:
	mean : 6
	95percentile : 20
	99percentile : 26




begin training using onecycle policy with max lr of 5e-05...
Train for 23 steps, validate for 3 steps
Epoch 1/6
 1/23 [>.............................] - ETA: 1:02:40

KeyboardInterrupt: 

In [None]:
learner.validate()

In [None]:
predictor = ktrain.get_predictor(learner.model, preproc=t)

In [None]:
predictions = predictor.predict(x_test.values)
print("Accuracy:", metrics.accuracy_score(y_true=y_test.values, y_pred=predictions, normalize=False))
print(metrics.classification_report(y_true=y_test.values, y_pred=predictions))

In [None]:
email_preds = predictor.predict(test_emails_df['Email'].values)
print("Accuracy:", metrics.accuracy_score(y_true=test_emails_df['Label'].values, y_pred=email_preds, normalize=False))
print(metrics.classification_report(y_true=test_emails_df['Label'].values, y_pred=email_preds))

In [None]:
predictions = predictor.predict(df['Text'].values)
print("Accuracy:", metrics.accuracy_score(y_true=df['Label'].values, y_pred=predictions ,normalize=False))
print(metrics.classification_report(y_true=df['Label'].values, y_pred=predictions))

In [None]:
predictor.save('/content/gdrive/My Drive/roberta-v4')

### Loading Predictor

In [8]:
predictor = ktrain.load_predictor('roberta-v4')

In [9]:
roberta_predictions = predictor.predict_proba(x_test.values)
robert_predictions_labels = predictor.predict(x_test.values)

In [10]:
get_accuracy(robert_predictions_labels, y_test.values)

Accuracy: 86
                            precision    recall  f1-score   support

                 age_limit       1.00      1.00      1.00         3
        application_status       1.00      1.00      1.00         6
assessment_campatilibility       0.75      0.75      0.75         4
   assessment_link_problem       0.50      0.50      0.50         2
        assessment_timebox       0.67      1.00      0.80         2
       assessment_validity       1.00      1.00      1.00         3
        cv_past_experience       1.00      0.50      0.67         2
                disability       0.86      1.00      0.92         6
                  feedback       1.00      0.86      0.92         7
               howto_apply       0.89      1.00      0.94         8
      interview_reschedule       1.00      1.00      1.00        11
   interview_response_time       1.00      1.00      1.00         2
         job_account_issue       1.00      1.00      1.00         1
                job_alerts       1

In [20]:
le = preprocessing.LabelEncoder()
le.fit(classes)

LabelEncoder()

### Creating Ensemble 

In [25]:
from mlxtend.classifier import EnsembleVoteClassifier
ensemble = EnsembleVoteClassifier(clfs=[pipe, predictor], weights=[1, 1], voting='soft', refit=False)
ensemble.fit(x_train, y_train)
ensmbl_preds = ensemble.predict(x_test.values)

In [26]:
get_accuracy(ensmbl_preds, y_test.values)

Accuracy: 83
                            precision    recall  f1-score   support

                 age_limit       1.00      1.00      1.00         3
        application_status       1.00      1.00      1.00         6
assessment_campatilibility       0.75      0.75      0.75         4
   assessment_link_problem       0.50      0.50      0.50         2
        assessment_timebox       0.67      1.00      0.80         2
       assessment_validity       1.00      0.67      0.80         3
        cv_past_experience       1.00      0.50      0.67         2
                disability       0.86      1.00      0.92         6
                  feedback       1.00      0.86      0.92         7
               howto_apply       0.89      1.00      0.94         8
      interview_reschedule       1.00      0.91      0.95        11
   interview_response_time       1.00      1.00      1.00         2
         job_account_issue       1.00      1.00      1.00         1
                job_alerts       0

In [132]:
index = 9
actual_index = index - 3
text = x_test.iloc[actual_index]
print("Text: ", text)

roberta_pred_prob = max(roberta_predictions[actual_index])
roberta_pred = le.inverse_transform([np.argmax(roberta_pred_prob)])[0]

print("\nRoberta prediction:", roberta_pred, "\nProbability:", roberta_pred_prob)


svm_pred_prob = max(predicted[actual_index])
svm_pred = le.inverse_transform([np.argmax(svm_pred_prob)])[0]

print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)

Text:  alert late marketing job

Roberta prediction: age_limit 
Probability: 0.793149

SVM prediction: age_limit 
Probability: 0.8518229933856651


### Comparing SVM and RoBERTa

In [21]:
avg_svm = 0
avg_roberta = 0
instances = 0

dataset = x_train
labels = y_train

for i in range(0, dataset.shape[0]):
    text = dataset.iloc[i]
#     print("Text: ", text)

    roberta_pred_prob_arr = predictor.predict_proba([text])[0] #roberta_predictions[i]
    robera_pred_prob = max(roberta_pred_prob_arr)
    roberta_pred = le.inverse_transform([np.argmax(roberta_pred_prob_arr)])[0]
    avg_roberta += robera_pred_prob
    
#     print("\nRoberta prediction:", roberta_pred, "\nProbability:", robera_pred_prob)

    svm_pred_prob_arr = pipe.predict_proba([text])[0] # predicted[i]
    svm_pred_prob = max(svm_pred_prob_arr)
    svm_pred = le.inverse_transform([np.argmax(svm_pred_prob_arr)])[0]
    avg_svm += svm_pred_prob

#     print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)

#     SVM correct, Roberta wrong
    if (svm_pred != roberta_pred and labels.iloc[i] == svm_pred):
        
#     Roberta correct, SVM wrong
#     if (svm_pred != roberta_pred and labels.iloc[i] == roberta_pred):

#     Both models wrong
#     if (svm_pred != labels.iloc[i] and roberta_pred != labels.iloc[i]):
        print("Text: ", text)
        print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)
        print("\nRoberta prediction:", roberta_pred, "\nProbability:", robera_pred_prob)

        print(colored('\nMismatch', 'red', attrs=['bold']))
        print(colored('Label: ' + str(labels.iloc[i]), 'green'))
#         print(colored('Ensemble: ' + str(ensmbl_preds[i]), 'blue'))
        instances += 1
        
        print("--" * 20)
    
print(colored("\nAverage prediction accuracy (SVM): " + str(avg_svm/dataset.shape[0]), 'green', attrs=['bold']))
print(colored("Average prediction accuracy (Roberta): " + str(avg_roberta/dataset.shape[0]), 'green', attrs=['bold']))
print(colored("Total Instances: " + str(instances), 'blue', attrs=['bold']))

Text:  mention experience application update

SVM prediction: cv_past_experience 
Probability: 0.6049715173961667

Roberta prediction: howto_apply 
Probability: 0.29405546
[1m[31m
Mismatch[0m
[32mLabel: cv_past_experience[0m
----------------------------------------


Text:  apply

SVM prediction: howto_apply 
Probability: 0.784910511198526

Roberta prediction: assessment_campatilibility 
Probability: 0.18758306
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  know job application submit successfully

SVM prediction: application_status 
Probability: 0.6888856918773756

Roberta prediction: howto_apply 
Probability: 0.91993284
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  assessment valid want apply job reuse complete earlier

SVM prediction: assessment_validity 
Probability: 0.4953576396038414

Roberta prediction: howto_apply 
Probability: 0.53755
[1m[31m
Mismatch[0m
[32mLabel: assessment_validity[0m
----------------------------------------


Text:  date submission require document

SVM prediction: job_close_date 
Probability: 0.7070563089989045

Roberta prediction: howto_apply 
Probability: 0.84486455
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  status application

SVM prediction: application_status 
Probability: 0.7912697144818865

Roberta prediction: howto_apply 
Probability: 0.91328233
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  able time interview

SVM prediction: late_for_interview 
Probability: 0.6240291681604133

Roberta prediction: interview_reschedule 
Probability: 0.8472291
[1m[31m
Mismatch[0m
[32mLabel: late_for_interview[0m
----------------------------------------


Text:  experience match need role

SVM prediction: work_experience 
Probability: 0.4925508924280532

Roberta prediction: multiple_role 
Probability: 0.23997861
[1m[31m
Mismatch[0m
[32mLabel: work_experience[0m
----------------------------------------


Text:  reinstate application withdraw earlier

SVM prediction: reinstate_application 
Probability: 0.6930584053869224

Roberta prediction: howto_apply 
Probability: 0.36473137
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  long assessment valid need assessment time

SVM prediction: assessment_validity 
Probability: 0.5396625970257306

Roberta prediction: assessment_timebox 
Probability: 0.6883633
[1m[31m
Mismatch[0m
[32mLabel: assessment_validity[0m
----------------------------------------


Text:  provide detail interview

SVM prediction: feedback 
Probability: 0.6735533260406231

Roberta prediction: interview_reschedule 
Probability: 0.33298102
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  good afternoon candidate recently apply job boot trainee pharmacy advisor wonder position fill complete assessment hear kind regard

SVM prediction: application_status 
Probability: 0.6270943887428426

Roberta prediction: assessment_link_problem 
Probability: 0.45864245
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  old need work boot

SVM prediction: age_limit 
Probability: 0.6165578898178463

Roberta prediction: special_needs_at_work 
Probability: 0.3153813
[1m[31m
Mismatch[0m
[32mLabel: age_limit[0m
----------------------------------------


Text:  resume application

SVM prediction: reinstate_application 
Probability: 0.8405603287212952

Roberta prediction: howto_apply 
Probability: 0.9384005
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  basis disqualify interview

SVM prediction: feedback 
Probability: 0.6851728388763988

Roberta prediction: interview_reschedule 
Probability: 0.64610916
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  link assessment work help

SVM prediction: assessment_link_problem 
Probability: 0.6187246039945402

Roberta prediction: assessment_campatilibility 
Probability: 0.5094612
[1m[31m
Mismatch[0m
[32mLabel: assessment_link_problem[0m
----------------------------------------


Text:  long review process

SVM prediction: application_status 
Probability: 0.6069816947624332

Roberta prediction: assessment_link_problem 
Probability: 0.52605486
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  application check

SVM prediction: application_status 
Probability: 0.7148273209636792

Roberta prediction: assessment_link_problem 
Probability: 0.30416974
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  apply time instead application go reserve list boot different career opportunity apply opportunity learn job love apply role boot interview candidate apply role mean need information get reject interview apply role boot immediately wait know online assessment wait 12 month reapply kindly let know

SVM prediction: multiple_role 
Probability: 0.48294894252243614

Roberta prediction: assessment_link_problem 
Probability: 0.40093058
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  apply position time

SVM prediction: multiple_role 
Probability: 0.6758533447062952

Roberta prediction: assessment_timebox 
Probability: 0.73154753
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  time invest question assessment

SVM prediction: assessment_timebox 
Probability: 0.5550852483177483

Roberta prediction: assessment_link_problem 
Probability: 0.5561627
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  know application receive

SVM prediction: application_status 
Probability: 0.7481522922024632

Roberta prediction: howto_apply 
Probability: 0.91951174
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  hello hope email find submit online assessment evening wonder hear sorry know probably busy interested position 10 year retail experience company think fit job look forward hear kind regard

SVM prediction: application_status 
Probability: 0.6411416856517765

Roberta prediction: assessment_campatilibility 
Probability: 0.5888129
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  hear

SVM prediction: interview_response_time 
Probability: 0.7086920627289571

Roberta prediction: assessment_campatilibility 
Probability: 0.15683314
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  reinstate application assessment exam prior 1 2019

SVM prediction: reinstate_application 
Probability: 0.5607086858671477

Roberta prediction: assessment_validity 
Probability: 0.2995733
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  need complete assessment new job interested

SVM prediction: assessment_validity 
Probability: 0.48907291558215243

Roberta prediction: assessment_campatilibility 
Probability: 0.3244759
[1m[31m
Mismatch[0m
[32mLabel: assessment_validity[0m
----------------------------------------


Text:  interview week hear hear

SVM prediction: interview_response_time 
Probability: 0.8153798260697669

Roberta prediction: interview_reschedule 
Probability: 0.5765831
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  account accessible email find

SVM prediction: job_account_issue 
Probability: 0.6873225917491218

Roberta prediction: assessment_campatilibility 
Probability: 0.3484167
[1m[31m
Mismatch[0m
[32mLabel: job_account_issue[0m
----------------------------------------


Text:  interview day ago hear shall suppose job

SVM prediction: interview_response_time 
Probability: 0.6152484307661084

Roberta prediction: job_close_date 
Probability: 0.9339866
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  salary job

SVM prediction: salary 
Probability: 0.9751129306740156

Roberta prediction: howto_apply 
Probability: 0.32896122
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  job apply

SVM prediction: multiple_role 
Probability: 0.7305316446898749

Roberta prediction: howto_apply 
Probability: 0.9764717
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  job alert particular salary range

SVM prediction: job_alerts 
Probability: 0.7710699014699001

Roberta prediction: salary 
Probability: 0.9458366
[1m[31m
Mismatch[0m
[32mLabel: job_alerts[0m
----------------------------------------


Text:  salary post

SVM prediction: salary 
Probability: 0.9448952518078079

Roberta prediction: assessment_link_problem 
Probability: 0.6390025
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  complete online assessment time negative affect chance get job

SVM prediction: assessment_timebox 
Probability: 0.5581277413132045

Roberta prediction: assessment_campatilibility 
Probability: 0.8354414
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  interested job apply

SVM prediction: multiple_role 
Probability: 0.6606680228475814

Roberta prediction: howto_apply 
Probability: 0.9745814
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  deadline submit application

SVM prediction: job_close_date 
Probability: 0.7823459489357344

Roberta prediction: howto_apply 
Probability: 0.9717918
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  concern application

SVM prediction: feedback 
Probability: 0.7503199454154815

Roberta prediction: howto_apply 
Probability: 0.8765269
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  hello wonder specific person talk employment giltbrook store thank

SVM prediction: howto_apply 
Probability: 0.615559848433848

Roberta prediction: job_alerts 
Probability: 0.46268225
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  contact company apply multiple job

SVM prediction: application_status 
Probability: 0.6945170316340534

Roberta prediction: multiple_role 
Probability: 0.9294735
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  reinstate application

SVM prediction: reinstate_application 
Probability: 0.8847411666096305

Roberta prediction: howto_apply 
Probability: 0.28227317
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  complete online assessment week ago need apply job

SVM prediction: assessment_validity 
Probability: 0.48207706640605547

Roberta prediction: job_close_date 
Probability: 0.538608
[1m[31m
Mismatch[0m
[32mLabel: assessment_validity[0m
----------------------------------------


Text:  complete job application ask cv like share work experience history

SVM prediction: cv_past_experience 
Probability: 0.5855944658366136

Roberta prediction: howto_apply 
Probability: 0.55210924
[1m[31m
Mismatch[0m
[32mLabel: cv_past_experience[0m
----------------------------------------


Text:  job application accept reject

SVM prediction: application_status 
Probability: 0.7014084264721427

Roberta prediction: assessment_link_problem 
Probability: 0.40189755
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  long complete assessment

SVM prediction: assessment_timebox 
Probability: 0.6399378530577048

Roberta prediction: assessment_campatilibility 
Probability: 0.46111405
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  open assessment link

SVM prediction: assessment_campatilibility 
Probability: 0.7138229027244801

Roberta prediction: assessment_link_problem 
Probability: 0.8782721
[1m[31m
Mismatch[0m
[32mLabel: assessment_campatilibility[0m
----------------------------------------


Text:  link unavailable area send update assessment link

SVM prediction: assessment_link_problem 
Probability: 0.631007867109967

Roberta prediction: assessment_campatilibility 
Probability: 0.56584626
[1m[31m
Mismatch[0m
[32mLabel: assessment_link_problem[0m
----------------------------------------


Text:  accident late interview kindly set interview later day

SVM prediction: late_for_interview 
Probability: 0.6989870731290331

Roberta prediction: interview_reschedule 
Probability: 0.79965353
[1m[31m
Mismatch[0m
[32mLabel: late_for_interview[0m
----------------------------------------


Text:  deadline job application

SVM prediction: job_close_date 
Probability: 0.7886464673725897

Roberta prediction: howto_apply 
Probability: 0.9720177
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  time complete assessment

SVM prediction: assessment_timebox 
Probability: 0.7300781509053723

Roberta prediction: assessment_campatilibility 
Probability: 0.4562309
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  wonder help apply job day ago come job interested ok apply job

SVM prediction: multiple_role 
Probability: 0.5372470052638881

Roberta prediction: job_close_date 
Probability: 0.97036135
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  good afternoon apply time position gillingham store enquire store encourage apply complete online assessment status boot account say email notification send yesterday receive confirm notification kind regard

SVM prediction: feedback 
Probability: 0.6799701606213364

Roberta prediction: job_account_issue 
Probability: 0.69759595
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  interested job apply

SVM prediction: multiple_role 
Probability: 0.6606680228475814

Roberta prediction: howto_apply 
Probability: 0.9745814
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  application consider long

SVM prediction: application_status 
Probability: 0.6530168661312958

Roberta prediction: age_limit 
Probability: 0.31565624
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  good evening hope help girlfriend french national plan relocate live permanent basis currently work beauty seller look submit cv boot potential job opening apply appreciate help able provide regard

SVM prediction: job_alerts 
Probability: 0.6287428996620156

Roberta prediction: howto_apply 
Probability: 0.61537755
[1m[31m
Mismatch[0m
[32mLabel: job_alerts[0m
----------------------------------------


Text:  job close let know

SVM prediction: job_close_date 
Probability: 0.7043372729481429

Roberta prediction: application_status 
Probability: 0.3052488
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  weakness interview

SVM prediction: feedback 
Probability: 0.7693077123872756

Roberta prediction: interview_reschedule 
Probability: 0.64348644
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  offer job

SVM prediction: salary 
Probability: 0.7515781838350752

Roberta prediction: howto_apply 
Probability: 0.3773832
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  hour complete online assessment

SVM prediction: assessment_timebox 
Probability: 0.6674807537306618

Roberta prediction: assessment_campatilibility 
Probability: 0.84588134
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  eligible role apply

SVM prediction: multiple_role 
Probability: 0.6967185909316128

Roberta prediction: howto_apply 
Probability: 0.97335285
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  online link apply

SVM prediction: howto_apply 
Probability: 0.7157502458830443

Roberta prediction: assessment_link_problem 
Probability: 0.61055124
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  salary range accord qualification

SVM prediction: salary 
Probability: 0.7642168655923622

Roberta prediction: age_limit 
Probability: 0.85307384
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  dear hope complete decision make exercise let know mind let know inthis process look forward hear thank

SVM prediction: application_status 
Probability: 0.6385214695279017

Roberta prediction: assessment_campatilibility 
Probability: 0.25149488
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  qualified role apply

SVM prediction: multiple_role 
Probability: 0.6665004547434867

Roberta prediction: howto_apply 
Probability: 0.9726605
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  deadline job application

SVM prediction: job_close_date 
Probability: 0.7886464673725897

Roberta prediction: howto_apply 
Probability: 0.9720177
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  submission time application vacancy

SVM prediction: job_close_date 
Probability: 0.7347961344006465

Roberta prediction: job_alerts 
Probability: 0.8544048
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  salary benefit get job

SVM prediction: salary 
Probability: 0.8457244898554193

Roberta prediction: disability 
Probability: 0.7967384
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  know job

SVM prediction: application_status 
Probability: 0.6876267737209713

Roberta prediction: howto_apply 
Probability: 0.28601113
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  find link attach cv

SVM prediction: cv_past_experience 
Probability: 0.6703543153584414

Roberta prediction: howto_apply 
Probability: 0.45843682
[1m[31m
Mismatch[0m
[32mLabel: cv_past_experience[0m
----------------------------------------


Text:  day apply job find job description

SVM prediction: job_close_date 
Probability: 0.7319576572325199

Roberta prediction: howto_apply 
Probability: 0.9016556
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  concern request chance continue application retail assistant boot retail able complete online assessment phone smash access email know complete online assessment give cv boot store feel like perfect candidate job work boot know role regulation

SVM prediction: reinstate_application 
Probability: 0.5590896671725857

Roberta prediction: assessment_campatilibility 
Probability: 0.72951853
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  reapply job

SVM prediction: howto_apply 
Probability: 0.677433273722143

Roberta prediction: job_alerts 
Probability: 0.64380044
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  write tip improve application

SVM prediction: feedback 
Probability: 0.6929111959445508

Roberta prediction: assessment_link_problem 
Probability: 0.5219992
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  good afternoon like apply trainee pharmacy adviser good customer experience work cabin crew 3 year learn fast work time cruise terminal check people cruise cruise different system learn fast look cv attachment look forward hear

SVM prediction: howto_apply 
Probability: 0.5821106939268345

Roberta prediction: assessment_campatilibility 
Probability: 0.5167222
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  long complete online assessment

SVM prediction: assessment_timebox 
Probability: 0.6563373123039963

Roberta prediction: assessment_campatilibility 
Probability: 0.859022
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  long wait follow

SVM prediction: interview_response_time 
Probability: 0.6003794023590443

Roberta prediction: application_status 
Probability: 0.79355186
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  browser compatible assessment link

SVM prediction: assessment_campatilibility 
Probability: 0.787570836186555

Roberta prediction: assessment_link_problem 
Probability: 0.79011273
[1m[31m
Mismatch[0m
[32mLabel: assessment_campatilibility[0m
----------------------------------------


Text:  apply

SVM prediction: howto_apply 
Probability: 0.784910511198526

Roberta prediction: assessment_campatilibility 
Probability: 0.18758306
[1m[31m
Mismatch[0m
[32mLabel: howto_apply[0m
----------------------------------------


Text:  want know late job

SVM prediction: job_alerts 
Probability: 0.6692872164739685

Roberta prediction: howto_apply 
Probability: 0.31041947
[1m[31m
Mismatch[0m
[32mLabel: job_alerts[0m
----------------------------------------


Text:  long apply job

SVM prediction: job_close_date 
Probability: 0.755178571553313

Roberta prediction: howto_apply 
Probability: 0.97300106
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  date apply job let know

SVM prediction: job_close_date 
Probability: 0.8026282153282528

Roberta prediction: howto_apply 
Probability: 0.9527164
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  hi get assessment tonight have work flash work office regard

SVM prediction: assessment_timebox 
Probability: 0.5516342977698473

Roberta prediction: assessment_campatilibility 
Probability: 0.7338142
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  cancel job application want apply

SVM prediction: reinstate_application 
Probability: 0.6266356670051147

Roberta prediction: howto_apply 
Probability: 0.97414964
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  long entire application process

SVM prediction: application_status 
Probability: 0.6631265389371446

Roberta prediction: howto_apply 
Probability: 0.67744225
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  disability prevent get position

SVM prediction: disability 
Probability: 0.7494822510238971

Roberta prediction: assessment_campatilibility 
Probability: 0.31112584
[1m[31m
Mismatch[0m
[32mLabel: disability[0m
----------------------------------------


Text:  soon get response application

SVM prediction: application_status 
Probability: 0.6351399099591648

Roberta prediction: howto_apply 
Probability: 0.5798864
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  day apply

SVM prediction: job_close_date 
Probability: 0.8266635610221511

Roberta prediction: howto_apply 
Probability: 0.9761263
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  long result interview

SVM prediction: interview_response_time 
Probability: 0.6298606576019224

Roberta prediction: interview_reschedule 
Probability: 0.4326209
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  run time whilst complete online assessment adverse affect job prospect

SVM prediction: assessment_timebox 
Probability: 0.5619450991490369

Roberta prediction: assessment_campatilibility 
Probability: 0.58845323
[1m[31m
Mismatch[0m
[32mLabel: assessment_timebox[0m
----------------------------------------


Text:  confirm receive complete online assessment

SVM prediction: application_status 
Probability: 0.6697595290716657

Roberta prediction: assessment_campatilibility 
Probability: 0.90856916
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  interview month job receive update interview update

SVM prediction: interview_response_time 
Probability: 0.6213521509410079

Roberta prediction: job_close_date 
Probability: 0.32866722
[1m[31m
Mismatch[0m
[32mLabel: interview_response_time[0m
----------------------------------------


Text:  consideration job

SVM prediction: application_status 
Probability: 0.6716441786921552

Roberta prediction: howto_apply 
Probability: 0.333216
[1m[31m
Mismatch[0m
[32mLabel: application_status[0m
----------------------------------------


Text:  apply ago withdraw application want apply

SVM prediction: reinstate_application 
Probability: 0.6194698246510042

Roberta prediction: howto_apply 
Probability: 0.3658606
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  link apply position

SVM prediction: multiple_role 
Probability: 0.5984584753223207

Roberta prediction: howto_apply 
Probability: 0.9309056
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  salary range accord experience

SVM prediction: salary 
Probability: 0.7349026839193571

Roberta prediction: work_experience 
Probability: 0.88252187
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------


Text:  age relaxation role

SVM prediction: age_limit 
Probability: 0.6674030536333339

Roberta prediction: special_needs_at_work 
Probability: 0.29544523
[1m[31m
Mismatch[0m
[32mLabel: age_limit[0m
----------------------------------------


Text:  try submit confirm assessment unable proceed system keep take assessment step

SVM prediction: assessment_link_problem 
Probability: 0.5608769853022652

Roberta prediction: assessment_campatilibility 
Probability: 0.6108378
[1m[31m
Mismatch[0m
[32mLabel: assessment_link_problem[0m
----------------------------------------


Text:  want know weak area cause rejection

SVM prediction: feedback 
Probability: 0.6608945686456471

Roberta prediction: assessment_link_problem 
Probability: 0.4401402
[1m[31m
Mismatch[0m
[32mLabel: feedback[0m
----------------------------------------


Text:  apply job

SVM prediction: multiple_role 
Probability: 0.7196532865481501

Roberta prediction: howto_apply 
Probability: 0.37039506
[1m[31m
Mismatch[0m
[32mLabel: multiple_role[0m
----------------------------------------


Text:  resume cv process

SVM prediction: reinstate_application 
Probability: 0.7414931033900429

Roberta prediction: howto_apply 
Probability: 0.7541433
[1m[31m
Mismatch[0m
[32mLabel: reinstate_application[0m
----------------------------------------


Text:  day apply role interested

SVM prediction: job_close_date 
Probability: 0.7126134223779059

Roberta prediction: howto_apply 
Probability: 0.84272057
[1m[31m
Mismatch[0m
[32mLabel: job_close_date[0m
----------------------------------------


Text:  write email inquire likely job opportunity boot interested work company hardworke enthusiastic come achieve goal work customer eye tiny detail prepare commit training require experience work boot refer cv moment resume attach document sincerely

SVM prediction: job_alerts 
Probability: 0.6439875157157023

Roberta prediction: howto_apply 
Probability: 0.78623664
[1m[31m
Mismatch[0m
[32mLabel: job_alerts[0m
----------------------------------------


Text:  pay probation period

SVM prediction: salary 
Probability: 0.6756080681683514

Roberta prediction: assessment_timebox 
Probability: 0.3425847
[1m[31m
Mismatch[0m
[32mLabel: salary[0m
----------------------------------------
[1m[32m
Average prediction accuracy (SVM): 0.6862536307163604[0m
[1m[32mAverage prediction accuracy (Roberta): 0.8010863111569331[0m
[1m[34mTotal Instances: 102[0m


### Manually Scaling Probabilities

In [86]:
wrong = 0
correct = 0

dataset = x_test
labels = y_test

for i in range(0, dataset.shape[0]):
    print("--" * 20)
    
    label = labels.iloc[i]
    text = dataset.iloc[i]
    
    roberta_pred_prob_arr = predictor.predict_proba([text])[0] 
    robera_pred_prob = max(roberta_pred_prob_arr)
    roberta_pred = le.inverse_transform([np.argmax(roberta_pred_prob_arr)])[0]
    roberta_vals = list(roberta_pred_prob_arr)
    roberta_vals.append(np.argmax(roberta_pred_prob_arr))
    
    svm_pred_prob_arr = pipe.predict_proba([text])[0] 
    svm_pred_prob = max(svm_pred_prob_arr)
    svm_pred = le.inverse_transform([np.argmax(svm_pred_prob_arr)])[0]
    svm_vals = list(svm_pred_prob_arr)
    svm_vals.append(np.argmax(svm_pred_prob_arr))
    
    pred_row = roberta_vals + svm_vals
    lr_pred = lr_out.predict([pred_row])
    lr_pred_label = le.inverse_transform([lr_pred])
        
    if (lr_pred_label != ensmbl_preds[i]):
#         print(colored("Correct", "green", attrs=['bold']))
#         correct += 1
#     else:
        print(colored('Mismatch', 'red', attrs=['bold']))
        wrong += 1
        
        print(colored('\nText: ' + text, 'magenta'))
        print(colored('Label: ' + label, 'blue'))
        print("\nSVM prediction:", svm_pred, "\nProbability:", svm_pred_prob)
        print("\nRoberta prediction:", roberta_pred, "\nProbability:", robera_pred_prob)
        print("\nEnsemble prediction:", ensmbl_preds[i])
        print("LR prediction:", lr_pred_label[0])


print(colored("\nTotal Wrong: " + str(wrong), 'red', attrs=['bold']))
print(colored("Total Correct: " + str(correct), 'green', attrs=['bold']))

----------------------------------------


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: send application role time[0m
[34mLabel: multiple_role[0m

SVM prediction: application_status 
Probability: 0.2998763538744035

Roberta prediction: multiple_role 
Probability: 0.91299707

Ensemble prediction: multiple_role
LR prediction: disability
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: good evening recently apply job faced counter chelmsford boot branch attach resume wander able help interested job help appreciate thank[0m
[34mLabel: cv_past_experience[0m

SVM prediction: job_alerts 
Probability: 0.3894585203840947

Roberta prediction: reinstate_application 
Probability: 0.69205695

Ensemble prediction: reinstate_application
LR prediction: job_alerts
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: expect hear boot apply position[0m
[34mLabel: application_status[0m

SVM prediction: multiple_role 
Probability: 0.21776427691809935

Roberta prediction: application_status 
Probability: 0.9741682

Ensemble prediction: application_status
LR prediction: reinstate_application
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: response reject[0m
[34mLabel: feedback[0m

SVM prediction: feedback 
Probability: 0.5691805852081928

Roberta prediction: application_status 
Probability: 0.85831714

Ensemble prediction: application_status
LR prediction: feedback
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: hello complete online questionnaire trouble fill second personality screen question tick 3 box suppose appreciate kindly enlighten issue thank[0m
[34mLabel: assessment_campatilibility[0m

SVM prediction: feedback 
Probability: 0.18342552765309525

Roberta prediction: assessment_campatilibility 
Probability: 0.6048872

Ensemble prediction: assessment_campatilibility
LR prediction: reinstate_application
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31mMismatch[0m
[35m
Text: 1 assessment position[0m
[34mLabel: assessment_validity[0m

SVM prediction: assessment_validity 
Probability: 0.1422304853638511

Roberta prediction: multiple_role 
Probability: 0.8625334

Ensemble prediction: multiple_role
LR prediction: disability
----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


----------------------------------------


  y = column_or_1d(y, warn=True)


[1m[31m
Total Wrong: 6[0m
[1m[32mTotal Correct: 0[0m


  y = column_or_1d(y, warn=True)


### Preparing dataset for LRegression

In [59]:
lr_x_train = []
lr_y_train = []

dataset = x_train
labels = y_train

for i in range(0, dataset.shape[0]):
    label = labels.iloc[i]
    label_num = le.transform([label])[0]
    text = dataset.iloc[i]
    
    roberta_pred_prob_arr = predictor.predict_proba([text])[0] 
    roberta_pred = np.argmax(roberta_pred_prob_arr)
    roberta_vals = list(roberta_pred_prob_arr)
    roberta_vals.append(roberta_pred)
    
    svm_pred_prob_arr = pipe.predict_proba([text])[0]
    svm_pred = np.argmax(svm_pred_prob_arr)
    svm_vals = list(svm_pred_prob_arr)
    svm_vals.append(svm_pred)
    
    lr_x_train.append(roberta_vals + svm_vals)
    lr_y_train.append(label_num)
   

### Training LRegression Model

In [68]:
lr_x_train_pd = pd.DataFrame(lr_x_train)
lr_y_train_pd = pd.DataFrame(lr_y_train)


lr_out = LogisticRegression()
lr_out.fit(lr_x_train_pd, lr_y_train_pd)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)