In [1]:
import pandas as pd
import json
# Load the Financial PhraseBank CSV. The file is read without a header row,
# so the two columns are named explicitly afterwards.
phrasebank_path = "./raw_data/Financial phrasebank.csv"
df_finphrasebank = pd.read_csv(phrasebank_path, header=None)
df_finphrasebank.columns = ["label", "text"]

# Encode the sentiment names as integers: positive -> 1, neutral -> 0, negative -> -1.
df_finphrasebank["label"] = df_finphrasebank["label"].replace(
    to_replace=['positive', 'neutral', 'negative'],
    value=[1, 0, -1],
)

# Force every text entry to be a string so the regex-based cleaning can run on it.
df_finphrasebank["text"] = df_finphrasebank["text"].astype("str")

In [5]:
def extract_score(text_list):
    '''
    Return the "sentiment_score" value stored in the first element of text_list.
    input:
        text_list: an indexable sequence whose first element supports
                   ["sentiment_score"] lookup (e.g. a list of dicts)
    output:
        the value found under "sentiment_score", returned unchanged
    '''
    first_entry = text_list[0]
    return first_entry["sentiment_score"]

In [13]:
# Load the two FiQA Task 1 training files (posts and headlines); each JSON
# object key becomes one row (orient="index").
df_post_train = pd.read_json("./raw_data/FiQA Task 1_post_train.json", orient="index")
df_hl_train = pd.read_json("./raw_data/FiQA Task 1_headline_train.json", orient="index")

# Stack both sources into one frame with a fresh 0..n-1 index.
df_fiqa = pd.concat([df_post_train, df_hl_train], ignore_index=True)

# Pull the sentiment score out of each row's "info" entry.
df_fiqa = df_fiqa.assign(label=df_fiqa["info"].apply(extract_score))

# NOTE(review): the rename below is positional - it assumes the frame now has
# exactly three columns ordered (text, info, label). Confirm against the JSON files.
df_fiqa = df_fiqa.set_axis(["text", "_", "label"], axis=1)

# Keep only the columns used downstream, in the same order as df_finphrasebank.
df_fiqa = df_fiqa.loc[:, ["label", "text"]]

In [23]:
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
import nltk

# Global variables used by data_cleaning.

# Punctuation marks that must survive the punctuation-stripping step because
# later steps rely on them to recognise special patterns (hashtags, mentions,
# URLs, emoticons, ...).
keep_p = ["!", "\"", "#", "&", "\'", ",", ".", ":", ";", "?", "@"]

# Punctuation characters to delete: everything in string.punctuation except keep_p.
p = "".join(ch for ch in string.punctuation if ch not in keep_p)

# Regex patterns describing whole tokens to be removed (retweet marker,
# hashtags, mentions, URLs, '&'-prefixed tokens and text emoticons).
# Raw strings are used so that regex escapes such as "\." are not parsed as
# (invalid) Python string escapes.
sp_word = [r"^rt$", r"^#.*", r"^@.*", r"^http.*", r"^www\..*", r"^&.*", r"^;d$", r"^;D$", r"^;p$", r"^;P$", r"^:d$", r"^:D$",
           r"^:p$", r"^:P$", r"^xd$", r"^xp$", r"^XD$", r"^XP$", r"^xD$", r"^xP$", r"^Xd$", r"^Xp$"]

def data_cleaning(sentence):
    '''
    Normalise punctuation in a sentence and strip noisy tokens.
    Relies on the module-level globals p (punctuation characters to delete)
    and sp_word (regex patterns of tokens to remove).
    input:
        sentence: the raw text (a string)
    output:
        the cleaned text: tokens separated by single spaces, with no
        leading or trailing space
    '''
    # Applied in order. Runs of a repeated mark collapse to a single mark;
    # '#', '&', '@', 'http' and 'www.' additionally get a leading space so the
    # token they start is split off as its own word and can be matched by the
    # sp_word patterns below.
    normalisation_rules = [
        (r'!+', "!"),
        (r'"+', "\""),
        (r'#+', " #"),
        (r'&+', " &"),
        (r"'+", "'"),
        (r" '+", "'"),      # re-attach an apostrophe to the preceding word
        (r',+', ","),
        (r'\.+', "."),
        (r':+', ":"),
        (r';+', ";"),
        (r'\?+', "?"),
        (r'@+', " @"),
        (r'(\?!)+', "?!"),  # "?!?!..." -> "?!"
        (r'http', " http"),
        (r'www\.', " www."),
    ]
    for pattern, replacement in normalisation_rules:
        sentence = re.sub(pattern, replacement, sentence)

    # Delete every character listed in the punctuation removal string p.
    sentence = "".join(ch for ch in sentence if ch not in p)

    # Apply the special-token patterns word by word. Tokens that end up empty
    # are dropped here, so a removed first/last token cannot leave a stray
    # leading/trailing space in the result.
    cleaned_words = []
    for word in sentence.split():
        for pattern in sp_word:
            word = re.sub(pattern, "", word)
        if word:
            cleaned_words.append(word)

    # split() already discarded repeated whitespace, so a plain join suffices.
    return " ".join(cleaned_words)

# Shared lemmatizer instance used by data_preprocessing.
lemmatizer = WordNetLemmatizer()

# Stop words to be removed: the NLTK English list minus the words in keep_w,
# which are deliberately kept in the text.
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be available
# locally (e.g. via nltk.download("stopwords")).
keep_w = ['against', 'up', 'down', 'over', 'under', 'no', 'not', 'same', 'above', 'below', 'only']
# Filtering (instead of list.remove) does not raise if a keep-word is missing
# from the installed corpus and removes every occurrence, not just the first.
stop_words = [word for word in stopwords.words('english') if word not in keep_w]

def nltk_tag_to_wordnet_tag(nltk_tag):
    '''
    Translate an nltk POS tag into the matching wordnet POS constant.
    Used by data_preprocessing to pick the lemmatization mode of a token.
    input:
        nltk_tag: the POS tag string nltk assigned to a word
    output:
        wordnet.ADJ / wordnet.VERB / wordnet.NOUN / wordnet.ADV depending on
        the first letter of the tag (J / V / N / R), or None for any other tag
    '''
    # (tag prefix, wordnet constant) pairs, checked in this order.
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if nltk_tag.startswith(prefix):
            return wordnet_pos
    # No wordnet equivalent for this tag.
    return None

def data_preprocessing(sentence):
    '''
    Lower-case a sentence, drop its stop words and lemmatize what remains.
    Relies on the module-level stop_words list, the shared lemmatizer and
    nltk_tag_to_wordnet_tag.
    input:
        sentence: the text to process (a string)
    output:
        the processed tokens joined back together with single spaces
    '''
    # Stop-word filtering works on whitespace-separated, lower-cased words.
    kept_words = []
    for candidate in sentence.lower().split():
        if candidate not in stop_words:
            kept_words.append(candidate)
    filtered_text = " ".join(kept_words)

    # Tokenize the filtered text and POS-tag every token.
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(filtered_text))

    lemmas = []
    for token, pos_label in tagged_tokens:
        wn_tag = nltk_tag_to_wordnet_tag(pos_label)
        if wn_tag is not None:
            # A wordnet tag exists: lemmatize the token with it.
            lemmas.append(lemmatizer.lemmatize(token, wn_tag))
        else:
            # No usable tag: keep the token exactly as it is.
            lemmas.append(token)

    return " ".join(lemmas)

In [24]:
# Clean the PhraseBank sentences, then lower-case / de-stop-word / lemmatize them.
df_finphrasebank["text"] = (
    df_finphrasebank["text"]
    .apply(data_cleaning)
    .apply(data_preprocessing)
)

In [26]:
# Run the same two-stage text pipeline (cleaning, then preprocessing) on the FiQA sentences.
df_fiqa["text"] = (
    df_fiqa["text"]
    .apply(data_cleaning)
    .apply(data_preprocessing)
)