In [1]:
# Precondition: download and unzip 'stanford-corenlp-full-2018-10-05' (see https://stanfordnlp.github.io/CoreNLP/index.html#download) and point `stanfordcorenlp_path` (set below) at the unzipped folder.

In [2]:
# imports
import json
import re
import string

import contractions
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from stanfordcorenlp import StanfordCoreNLP   # see https://stanfordnlp.github.io/CoreNLP/annotators.html

In [3]:
# Locations used by the cells below: the Stanford CoreNLP distribution and
# the directory that holds the Kaggle data files.
stanfordcorenlp_path = '../../stanford-corenlp-full-2018-10-05'
path = '../../data/Kaggle'

In [4]:
# Sample submission file; its `Sentiment` column is overwritten with the
# CoreNLP predictions further below.
sub = pd.read_csv(f'{path}/sampleSubmission.csv')
# Kaggle test phrases (tab-separated file).
test = pd.read_table(f'{path}/test.tsv')

In [5]:
# clean data
def clean(df, text_field):
    """Normalise the text column ``text_field`` of ``df`` in place.

    Every string in the column is lower-cased, runs of whitespace are
    collapsed to a single space, leading/trailing whitespace is stripped and
    contractions are expanded with ``contractions.fix``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    text_field : str
        Name of the column to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    def _normalise(text):
        # Non-string entries (e.g. missing values) are passed through
        # untouched instead of crashing ``re.sub``.
        if not isinstance(text, str):
            return text
        # Raw string: '\s' is not a valid escape in a normal string literal.
        collapsed = re.sub(r'\s+', ' ', text).strip()
        return contractions.fix(collapsed)

    # Work on the whole column at once. Addressing rows with
    # ``df.loc[i, ...]`` for i in range(len(df)) only works when the index is
    # exactly 0..n-1 and fails on a filtered, sampled or re-indexed frame.
    df[text_field] = df[text_field].str.lower().map(_normalise)
    return df

In [7]:
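# Optional cleaning step, currently disabled (compare the Kaggle scores with and without cleaning noted further below); uncomment to enable.
# test = clean(test, 'Phrase')   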

In [7]:
# The CoreNLP prediction step cannot cope with blank input, therefore blank
# phrases are replaced by a visible placeholder ("_" by default).
def replace_whitespaces(data, placeholder="_", verbose=True):
    """Replace blank entries of the ``'Phrase'`` column by ``placeholder``.

    A phrase counts as blank when it is missing, empty, or consists only of
    whitespace. This covers both the raw data (where the blank phrase is
    ``" "``) and data whose whitespace has already been stripped (``""``),
    so the function no longer has to be edited depending on whether the
    phrases were cleaned beforehand.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Phrase'`` column; it is modified in place.
    placeholder : str, optional
        Text written in place of blank phrases.
    verbose : bool, optional
        If true, print the rows that were changed.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    phrases = data['Phrase']
    # One boolean mask for the whole column instead of a per-id rescan.
    blank = phrases.isna() | phrases.astype(str).str.strip().eq("")
    if blank.any():
        data.loc[blank, 'Phrase'] = placeholder
        if verbose:
            print(data.loc[blank])
    return data

# Replace whitespace phrases in the test set before it is passed to CoreNLP.
test = replace_whitespaces(test)

[157451]
      PhraseId  SentenceId Phrase
1390    157451        8588      _


In [8]:
def stanfordcorenlp_predict(data):
    """Predict a sentiment value for every phrase in ``data`` with Stanford CoreNLP.

    A ``StanfordCoreNLP`` session is opened on ``stanfordcorenlp_path``, each
    entry of ``data['Phrase']`` is annotated, and the ``sentimentValue`` of the
    first sentence in the JSON response is stored. The session is always
    closed, even if annotation fails.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``'Phrase'`` column; each phrase is treated as one sentence.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(data)`` holding the predicted sentiment
        value per row, in row order.

    Raises
    ------
    ValueError
        If a response is not valid JSON or contains no sentence.
    """
    nlp = StanfordCoreNLP(f'{stanfordcorenlp_path}')

    # Request properties are identical for every phrase, so build them once.
    properties = {
        "annotators": "tokenize,ssplit,parse,sentiment", # tokenizerAnnotator, WordsToSentencesAnnotator, ParserAnnotator, SentimentAnnotator
        "outputFormat": "json",
        # Only split the sentence at End Of Line. We assume that this method only takes in one single sentence.
        "ssplit.eolonly": "true",
        # Setting enforceRequirements to skip some annotators and make the process faster
        "enforceRequirements": "false"
    }

    predicted = np.zeros(data.shape[0])

    try:
        for i, phrase in enumerate(data['Phrase']):
            res = nlp.annotate(phrase, properties=properties)
            # Parse the JSON instead of reading a character at a fixed offset
            # behind the "sentimentValue" key: the offset depends on the exact
            # whitespace of the response and yields garbage if the key is absent.
            sentences = json.loads(res).get("sentences", [])
            if not sentences:
                raise ValueError(
                    f"CoreNLP returned no sentence for row {i}: {phrase!r}"
                )
            predicted[i] = int(sentences[0]["sentimentValue"])
    finally:
        # Close the session on every exit path so it is not leaked on errors.
        nlp.close()
    return predicted

In [9]:
# Predict a sentiment value for every test phrase and store it in the submission.
# The column is addressed as sub['Sentiment'] rather than sub.Sentiment:
# attribute-style assignment would set a plain Python attribute instead of a
# column if 'Sentiment' were ever missing from the sample submission, and the
# CSV would then be written without the predictions.
sub['Sentiment'] = stanfordcorenlp_predict(test).astype(int)   # float array -> integer labels

sub.to_csv('sub_stanfordcorenlp.csv', header = True, index=False)

In [10]:
# kaggle test score without cleaning: 0.645
# kaggle test score with cleaning: 0.640 