In [156]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [157]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Set up paths

In [158]:
# Data
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

# Word tools
word_sentiments_intensity_path =  './../tools/word/emo_lex.csv'
word_affect_intensity_path = './../tools/word/word_affect_intensity.csv'
positive_words_path = './../tools/word/positive_words.txt'
negative_words_path = './../tools/word/negative_words.txt'

# Emoji tools
emoji_polarity_path = './../tools/emoji/emoji_emotion.json'
emoji_sentiments_path = './../tools/emoji/emoji_sentiments.csv'
emoji_embedding_pre_trained_model_path = './../tools/emoji/emoji2vec.bin'

# Preprocessing

In [159]:
lemmatizer = WordNetLemmatizer()
tokenizer = TweetTokenizer()
notstopwords = set(('not', 'can', 'no'))
stopwords = set(nltk.corpus.stopwords.words('english')) - notstopwords

standarizer_dict = {
    r"(http|https)?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?": " <url> ",
    r'(.)\1+': r"\1\1", # cooool --> cool; coool--> cool
    r"\'s": "",
    r"\'n": "", 
    r"\'m": " am", 
    r"im": " ", 
    r"\'ve": " have", 
    r"\'ve": " have", 
    r" can\'t": " cannot", 
    r"n\'t": " not", 
    r"\'re": " are", 
    r"\'d": " would", 
    r"\'ll": " will", 
    r"\.{1,1}": " ", 
    r" [-+]?[.\d]*[\d]+[:,.\d]*": "",
    r"@\w+": r'  <entity> '
}

In [160]:
def preprocess_tweet(tweet):
    tweet.replace("\\n", " ")
    # Standarize tweet
    for current_form, standared_form in standarizer_dict.items():
        tweet = re.sub(current_form, standared_form, tweet)
    # Remove stop words
    tweet = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*').sub('', tweet)
    # Lemmatization
    tweet_tokens = []
    for token, tag in pos_tag(tokenizer.tokenize(tweet)):
        if tag[0].lower() in ['a','n','v']:
            lem = lemmatizer.lemmatize(token,tag[0].lower())
        else:
            lem = lemmatizer.lemmatize(token)
        
        tweet_tokens.append(lem.lower())
    return tweet_tokens

def preprocess_features(df):
    # Get labels (0,1,2,3) and polarity (sadness, joy, ..) 
    labels = sorted(set(df['class'].tolist()))
    polarity = sorted(set(df['polarity'].tolist()))
    one_hot = np.zeros((len(labels), len(labels)), int)
    # Create onehot encoding for labels and popularities
    np.fill_diagonal(one_hot, 1)
    label_dict = dict(zip(labels, one_hot))
    polarity_dict = dict(zip(polarity, one_hot))
    return label_dict, polarity_dict

def preprocess_dataframe(df):
    # Remove suffix of class : 0: no joy can be inferred -> 0
    df['text'] = df['text'].apply(lambda x: preprocess_tweet(x)).tolist()
    df['class'] = [c.split (":")[0] for c in df['class'].tolist()]
    return df

def prepare_cvs_data(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, encoding='utf-8', quoting=3)
    df.columns = ['id','text','polarity','class'] # Set up column names
    df = df.iloc[np.random.permutation(len(df))] # Random permutations
    df = preprocess_dataframe(df)
    return df

def prepare_data(file_path):
    df = prepare_cvs_data(file_path)
    # Transform labels and polarities to onehot encoding
    label_dict, polarity_dict = preprocess_features(df)
    label = df['class'].apply(lambda y: label_dict[y]).tolist()
    polarity = df['polarity'].apply(lambda y: polarity_dict[y]).tolist()
    return df['text'], polarity, label
    

In [161]:
test_tweets, test_labels, test_polarities = prepare_data(data_test_path)
train_tweets, train_labels, train_polarities = prepare_data(data_train_path)
test_data = list(zip(test_tweets, test_polarities, test_labels))
train_data = list(zip(train_tweets, train_polarities, train_labels))

In [162]:
tweet = "😭😭 I \n think that you've a lot looool money ;) @Singaholic121 Good morning, love! Happy first day of fall. Let's make some awesome #autumnmemories #annabailey"
x = preprocess_tweet(tweet)
emoji = x[0]

# Feature extraction
## Words
### Sentiment intensity
- NRC Word-Emotion Association Lexicon (EmoLex)

In [163]:
word_sentiment_intensity_df = pd.read_csv(word_sentiments_intensity_path, delimiter='\t')
word_sentiment_intensity_df['word'] = word_sentiment_intensity_df['word'].str.lower()
word_sentiment_intensity_df.head()

Unnamed: 0,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,true,0.0,0.0,0.0,0.0,0.328,0.0,0.0,0.0,0.0,0.0
1,aaaaaaah,0.0,0.0,0.0,0.344,0.0,0.0,0.0,0.0,0.0,0.0
2,aaaah,0.0,0.0,0.0,0.234,0.0,0.0,0.0,0.0,0.0,0.0
3,abandon,0.0,0.0,0.0,0.531,0.0,0.0,0.0,0.703,0.0,0.0
4,abandoned,0.222,0.0,0.0,0.534,0.0,0.0,0.0,0.828,0.0,0.0


- For the following sentiments, the intensity value is always equals to 0.
- We may remove these sentiment columns, because they will have no affect in the learning process.
- Create a new dataframe with the useful columns (where sentiment intensity is different to 0).

In [176]:
THRESHOLD = 100

def get_useful_columns(df, class_column='word', threshold=0):
    empty_columns = []
    for column_name in list(df.columns):
        if len(df.loc[df[column_name] != 0.]) <= threshold:
            empty_columns.append(column_name)
    useful_columns = list(set(df.columns) - set(empty_columns))
    if class_column not in useful_columns:
        useful_columns.append(class_column)
    print("Useful columns are:", useful_columns)
    return useful_columns

useful_columns = get_useful_columns(word_sentiment_intensity_df, threshold=THRESHOLD)
word_sentiment_intensity_df = word_sentiment_intensity_df[useful_columns].copy()
word_sentiment_intensity_df.head()

Useful columns are: ['joy', 'sadness', 'word', 'fear', 'anger']


Unnamed: 0,joy,sadness,word,fear,anger
0,0.328,0.0,true,0.0,0.0
1,0.0,0.0,aaaaaaah,0.344,0.0
2,0.0,0.0,aaaah,0.234,0.0
3,0.0,0.703,abandon,0.531,0.0
4,0.0,0.828,abandoned,0.534,0.222


- Create a dict of dict: {word: {sentiment: intensity, sent2: intensity, ...}

In [177]:
word_sentiment_intensity_dict = word_sentiment_intensity_df.set_index('word').T.to_dict()
word_sentiment_intensity_dict['true']

{'anger': 0.0, 'fear': 0.0, 'joy': 0.32799999999999996, 'sadness': 0.0}

#### Final function
## TODO:
- check why useful columns are not working
- Convert the float precision to only 3 values -> 0.333322 -> 0.333

In [178]:
def get_intensity_dict(dataset_path=word_sentiments_intensity_path,
                                      delimiter='\t', class_column='word',
                                      remove_useless_columns=True, threshold=0):
    df = pd.read_csv(dataset_path, delimiter=delimiter)
    df[class_column] = df[class_column].str.lower()
    if remove_useless_columns:
        useful_columns = get_useful_columns(df, class_column=class_column, threshold=threshold)
        df = df[useful_columns].copy()
    df_dict = df.set_index(class_column).T.to_dict()
    return df_dict

word_sentiment_intensity_dict = get_intensity_dict()
word_sentiment_intensity_dict['true']

Useful columns are: ['joy', 'sadness', 'word', 'fear', 'anger']


{'anger': 0.0, 'fear': 0.0, 'joy': 0.32799999999999996, 'sadness': 0.0}

## Positive Negatif words
- Opinion Lexicon English (OLE)
- https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

In [214]:
word_affect_dict = get_intensity_dict(
    dataset_path=word_affect_intensity_path, class_column='term',
    remove_useless_columns=False)

new_dict = {}
for word, affects in word_affect_dict.items():
    new_dict[word] = {}
    new_dict[word][affects['AffectDimension']] = affects['score']
    
word_affect_dict = new_dict
word_affect_dict['outraged']



{'anger': 0.9640000000000001}

#### Get positive and negative word list:

In [146]:
def get_file_content(filepath):
    with open(filepath, 'r') as f:
        content = f.readlines()
    return content

positive_word_list = get_file_content(positive_words_path)
negative_word_list = get_file_content(negative_words_path)
print(len(positive_word_list), len(negative_word_list))

2006 4783


## Emojis
### 1 - Emoji polarity
#### Emoji Valence (EV) 
- https://github.com/words/emoji-emotion/blob/master/index.json

In [16]:
def get_emoji_polarity_dict(emoji_data_path=emoji_polarity_path):
    """Returns a dict with the emoji unicode as key and the emoji polarity as value"""
    with open(emoji_polarity_path) as json_data:
        emoji_polarities = json.load(json_data)
        
    emoji_polarity_dict = dict()
    for emoji_val in emoji_polarities:
        emoji_polarity_dict[emoji_val["emoji"]] = emoji_val["polarity"]
    return emoji_polarity_dict

In [17]:
emoji_polarity_dict = get_emoji_polarity_dict()
emoji_polarity_dict[emoji]

-3

### 2 - Emoji sentiments

In [179]:
df = pd.read_csv(emoji_sentiments_path, delimiter=',')
df.head()

Unnamed: 0,#Emoji,Unicode codepoint,Occurrences,Position,Negative,Neutral,Positive,Unicode name,Unicode block
0,😂,0x1f602,14622,0.805101,3614,4163,6845,FACE WITH TEARS OF JOY,Emoticons
1,❤,0x2764,8050,0.746943,355,1334,6361,HEAVY BLACK HEART,Dingbats
2,♥,0x2665,7144,0.753806,252,1942,4950,BLACK HEART SUIT,Miscellaneous Symbols
3,😍,0x1f60d,6359,0.765292,329,1390,4640,SMILING FACE WITH HEART-SHAPED EYES,Emoticons
4,😭,0x1f62d,5526,0.803352,2412,1218,1896,LOUDLY CRYING FACE,Emoticons


In [180]:
target_columns = ['#Emoji', 'Negative', 'Neutral', 'Positive']
emoji_sentiments_df = df.filter(target_columns, axis=1)
emoji_sentiments_df.head()

Unnamed: 0,#Emoji,Negative,Neutral,Positive
0,😂,3614,4163,6845
1,❤,355,1334,6361
2,♥,252,1942,4950
3,😍,329,1390,4640
4,😭,2412,1218,1896


In [181]:
emoji_sentiments_dict = dict = emoji_sentiments_df.set_index('#Emoji').T.apply(tuple).to_dict()
emoji_sentiments_dict['😂']

(3614, 4163, 6845)

### 3 - Emoji embedding
#### emoji2vec
- https://github.com/uclmr/emoji2vec/tree/master/pre-trained

In [183]:
emoji2vec = gsm.KeyedVectors.load_word2vec_format(emoji_embedding_pre_trained_model_path, binary=True)
happy_vector = emoji2vec[emoji]    # Produces an embedding vector of length 300
happy_vector

array([ 0.03221022,  0.03802984, -0.00126745,  0.07279918, -0.02769   ,
       -0.01013857,  0.0864341 , -0.01573726,  0.10124657,  0.08510233,
        0.01168873, -0.06952818, -0.0263179 ,  0.10325162, -0.00335573,
        0.04028838,  0.0177028 ,  0.06445294,  0.04212   , -0.09827866,
        0.03254349,  0.1064738 ,  0.0735938 , -0.07274383,  0.01790767,
        0.03611384,  0.02536146, -0.00573439,  0.06542501, -0.04325472,
       -0.04252771, -0.00203998,  0.03511803, -0.09883551,  0.00640196,
       -0.03072599,  0.05863559,  0.0196573 ,  0.03455479,  0.06133179,
        0.08997498, -0.1033521 ,  0.0926978 , -0.10560492,  0.01345985,
        0.01028653, -0.07663322,  0.04080227,  0.00948974,  0.09845643,
       -0.05172281,  0.05844482,  0.03093685,  0.04788001,  0.07770577,
       -0.05077488, -0.10048547,  0.00741034, -0.00987429, -0.04561597,
       -0.02553738,  0.05784023, -0.03538289, -0.10472344,  0.04968144,
        0.03641411, -0.00178819,  0.0460491 , -0.07111374,  0.05

### 4 - Create a dict for all emojis