In [1]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from scipy.special import softmax
%matplotlib inline

In [2]:
TEST_DF_READY = True

In [3]:
if not TEST_DF_READY:
    import os
    os.environ["CUDA_VISIBLE_DEVICES"]="6"
    
    from transformers import pipeline
    from sentence_transformers import SentenceTransformer, util

# Data

In [4]:
train = pd.read_csv('hugface_sentiment_sent_sim_embs_train.csv') if TEST_DF_READY else pd.read_csv('/data/nlp/train.csv')
test = pd.read_csv('hugface_sentiment_sent_sim_embs_test.csv') if TEST_DF_READY else pd.read_csv('/data/nlp/test.csv')
sample_sub = pd.read_csv('/data/nlp/sample_submission.csv')

# Hugging face pipe

In [5]:
def convert_pipe_out_to_df(pipe_out, postfix="", pos_label="POSITIVE"):
    for idx, l_d in enumerate(pipe_out[0]):
        print(l_d)
        if l_d['label'] == pos_label:
            positive_idx = idx 
            
    return pd.DataFrame({postfix + 'pos_prob': [el[positive_idx]['score'] for el in pipe_out]})    

In [6]:
if not TEST_DF_READY:

    CLASSIFIER = pipeline('sentiment-analysis', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="POSITIVE", postfix="trans_baseline")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="POSITIVE", postfix="trans_baseline")
    ], axis=1)
    
    CLASSIFIER = pipeline('sentiment-analysis', model='siebert/sentiment-roberta-large-english', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="POSITIVE", postfix="siebert_sentiment_roberta_large_english")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="POSITIVE", postfix="siebert_sentiment_roberta_large_english")
    ], axis=1)
    
    CLASSIFIER = pipeline('sentiment-analysis', model='cardiffnlp/twitter-xlm-roberta-base-sentiment', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="Positive", postfix="cardiffnlp_twitter_xlm_roberta_base_sentiment")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="Positive", postfix="cardiffnlp_twitter_xlm_roberta_base_sentiment")
    ], axis=1)
    
    CLASSIFIER = pipeline('sentiment-analysis', model='moussaKam/barthez-sentiment-classification', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="Positive", postfix="moussaKam_barthez_sentiment_classification")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="Positive", postfix="moussaKam_barthez_sentiment_classification")
    ], axis=1)
    
    CLASSIFIER = pipeline('sentiment-analysis', model='rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="LABEL_2", postfix="rohanrajpal_bert_base_multilingual_codemixed-cased_sentiment")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="LABEL_2", postfix="rohanrajpal_bert_base_multilingual_codemixed-cased_sentiment")
    ], axis=1)
    
    CLASSIFIER = pipeline('sentiment-analysis', model='abhishek/autonlp-imdb_sentiment_classification-31154', return_all_scores = True)
    train = pd.concat([
        train,
        convert_pipe_out_to_df(CLASSIFIER(train['email_body'].tolist()), pos_label="1", postfix="abhishek_autonlp_imdb_sentiment_classification_31154")
    ], axis=1)
    test = pd.concat([
        test,
        convert_pipe_out_to_df(CLASSIFIER(test['email_body'].tolist()), pos_label="1", postfix="abhishek_autonlp_imdb_sentiment_classification_31154")
    ], axis=1)

{'label': 'NEGATIVE', 'score': 0.2978530824184418}
{'label': 'POSITIVE', 'score': 0.7021468877792358}
{'label': 'NEGATIVE', 'score': 0.2978561222553253}
{'label': 'POSITIVE', 'score': 0.7021438479423523}
{'label': 'NEGATIVE', 'score': 0.9954701662063599}
{'label': 'POSITIVE', 'score': 0.004529833327978849}
{'label': 'NEGATIVE', 'score': 0.9954701662063599}
{'label': 'POSITIVE', 'score': 0.004529851954430342}
{'label': 'Negative', 'score': 0.2841259241104126}
{'label': 'Neutral', 'score': 0.65042644739151}
{'label': 'Positive', 'score': 0.06544766575098038}
{'label': 'Negative', 'score': 0.2841259241104126}
{'label': 'Neutral', 'score': 0.65042644739151}
{'label': 'Positive', 'score': 0.06544767320156097}
{'label': 'Negative', 'score': 0.007281470112502575}
{'label': 'Positive', 'score': 0.992718517780304}
{'label': 'Negative', 'score': 0.007281450089067221}
{'label': 'Positive', 'score': 0.9927185773849487}
{'label': 'LABEL_0', 'score': 0.5602739453315735}
{'label': 'LABEL_1', 'score':

# Sentence Embeddings

In [7]:
if not TEST_DF_READY:

    SENT_EMBED = SentenceTransformer('paraphrase-mpnet-base-v2', device='cuda')
    train_emb = SENT_EMBED.encode(train.email_body)
    test_emb = SENT_EMBED.encode(test.email_body)
    test['emb_sim_2'] = util.cos_sim(train_emb[0], test_emb)[0,:]
    test['emb_sim_3'] = util.cos_sim(train_emb[1], test_emb)[0,:]
    test['emb_sim_4'] = util.cos_sim(train_emb[2], test_emb)[0,:]
    test['emb_sim_5'] = util.cos_sim(train_emb[3], test_emb)[0,:]
    test['emb_sim_1'] = util.cos_sim(train_emb[4], test_emb)[0,:]
    test['cos_softmax_sum'] = (softmax(test[['emb_sim_1', 'emb_sim_2', 'emb_sim_3', 'emb_sim_4', 'emb_sim_5',]].values, axis=1) * np.array([[1,2,3,4,5]])).sum(axis=1)

    test.to_csv('hugface_sentiment_sent_sim_embs_test.csv', index=False)
    train.to_csv('hugface_sentiment_sent_sim_embs_train.csv', index=False)

In [8]:
test.head()

Unnamed: 0,id,email_body,trans_baselinepos_prob,siebert_sentiment_roberta_large_englishpos_prob,cardiffnlp_twitter_xlm_roberta_base_sentimentpos_prob,moussaKam_barthez_sentiment_classificationpos_prob,rohanrajpal_bert_base_multilingual_codemixed-cased_sentimentpos_prob,abhishek_autonlp_imdb_sentiment_classification_31154pos_prob,emb_sim_2,emb_sim_3,emb_sim_4,emb_sim_5,emb_sim_1,cos_softmax_sum
0,0,"Hi Don,\n\n \n\n4pm does not work unfortunatel...",0.702144,0.00453,0.065448,0.992719,0.195682,0.975545,1.0,0.287977,0.167633,0.18279,0.13623,2.811827
1,1,"Sorry, Greg. I’ve been buried this week. I’m d...",0.013214,0.002396,0.094949,4.8e-05,0.301675,0.99276,0.287956,1.0,0.213383,0.180342,0.133079,3.001932
2,2,"Hey Paul,\n\n \n\nThanks for the time on the c...",0.024301,0.996394,0.282276,0.002769,0.05334,0.996837,0.167671,0.213388,1.0,0.238051,0.299402,3.177293
3,3,"Hi,\n\n \n\nIncluding one of my Admins who wor...",0.971037,0.995294,0.092082,0.999201,0.052842,0.992043,0.10831,0.086292,0.372413,0.078116,0.111501,3.044992
4,4,"Hello Momin,\n\n \n\nTo follow up on our last ...",0.042077,0.99727,0.20274,0.99899,0.10759,0.989911,0.243042,0.099824,0.464646,0.133173,0.1742,3.034998


In [9]:
train.head()

Unnamed: 0,sentiment,email_body,trans_baselinepos_prob,siebert_sentiment_roberta_large_englishpos_prob,cardiffnlp_twitter_xlm_roberta_base_sentimentpos_prob,moussaKam_barthez_sentiment_classificationpos_prob,rohanrajpal_bert_base_multilingual_codemixed-cased_sentimentpos_prob,abhishek_autonlp_imdb_sentiment_classification_31154pos_prob
0,2,"Hi Don,\n\n \n\n4pm does not work unfortunatel...",0.702147,0.00453,0.065448,0.992719,0.195682,0.975545
1,3,"Sorry, Greg. I’ve been buried this week. I’m d...",0.013214,0.002396,0.094949,4.8e-05,0.301676,0.99276
2,4,"Hey Paul,\n\n \n\nThanks for the time on the c...",0.024301,0.996394,0.282276,0.002769,0.05334,0.996837
3,5,Perfect - I see it.\n\nRunning a test now.\n,0.999503,0.99613,0.841748,0.988856,0.970645,0.994174
4,1,We are not using this and did not authorize re...,0.010381,0.000649,0.04125,0.000562,0.085989,0.987938
