In [14]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json
import os
import re

In [15]:
import nltk
from nltk.tokenize import TweetTokenizer
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import gensim.models as gsm

## Set up paths

In [68]:
# Data
data_train_path = './../data/train/train.csv'
data_test_path = './../data/test/test.csv'

# Word tools
word_sentiments_intensity_path =  './../tools/word/emo_lex.csv'
words_path = './../tools/word/words.csv'
#word_affect_intensity_path = './../tools/word/word_affect_intensity.csv'
positive_words_path = './../tools/word/positive_words.txt'
negative_words_path = './../tools/word/negative_words.txt'

# Emoji tools
emoji_polarity_path = './../tools/emoji/emoji_emotion.json'
emoji_sentiments_path = './../tools/emoji/emoji_sentiments.csv'
emoji_embedding_pre_trained_model_path = './../tools/emoji/emoji2vec.bin'

# Feature extraction
## Words
### Sentiment intensity
- NRC Word-Emotion Association Lexicon (EmoLex)

In [88]:
word_sentiment_intensity_df = pd.read_csv(word_sentiments_intensity_path, delimiter='\t')
word_sentiment_intensity_df['word'] = word_sentiment_intensity_df['word'].str.lower()
word_sentiment_intensity_df.head()

Unnamed: 0,word,anger,anticipation,disgust,fear,joy,negative,positive,sadness,surprise,trust
0,true,0.0,0.0,0.0,0.0,0.328,0.0,0.0,0.0,0.0,0.0
1,aaaaaaah,0.0,0.0,0.0,0.344,0.0,0.0,0.0,0.0,0.0,0.0
2,aaaah,0.0,0.0,0.0,0.234,0.0,0.0,0.0,0.0,0.0,0.0
3,abandon,0.0,0.0,0.0,0.531,0.0,0.0,0.0,0.703,0.0,0.0
4,abandoned,0.222,0.0,0.0,0.534,0.0,0.0,0.0,0.828,0.0,0.0


- For the following sentiments, the intensity value is always equals to 0.
- We may remove these sentiment columns, because they will have no affect in the learning process.
- Create a new dataframe with the useful columns (where sentiment intensity is different to 0).

In [25]:
THRESHOLD = 100

def get_useful_columns(df, class_column='word', threshold=0):
    empty_columns = []
    for column_name in list(df.columns):
        if len(df.loc[df[column_name] != 0.]) <= threshold:
            empty_columns.append(column_name)
    useful_columns = list(set(df.columns) - set(empty_columns))
    if class_column not in useful_columns:
        useful_columns.append(class_column)
    print("Useful columns are:", useful_columns)
    return useful_columns

useful_columns = get_useful_columns(word_sentiment_intensity_df, threshold=THRESHOLD)
word_sentiment_intensity_df = word_sentiment_intensity_df[useful_columns].copy()
word_sentiment_intensity_df.head()

Useful columns are: ['anger', 'joy', 'fear', 'sadness', 'word']


Unnamed: 0,anger,joy,fear,sadness,word
0,0.0,0.328,0.0,0.0,true
1,0.0,0.0,0.344,0.0,aaaaaaah
2,0.0,0.0,0.234,0.0,aaaah
3,0.0,0.0,0.531,0.703,abandon
4,0.222,0.0,0.534,0.828,abandoned


- Create a dict of dict: {word: {sentiment: intensity, sent2: intensity, ...}

In [162]:
word_sentiment_intensity_dict = word_sentiment_intensity_df.set_index('word').T.apply(tuple).to_dict()
word_sentiment_intensity_dict['abandon']

(0.0,
 0.0,
 0.0,
 0.53100000000000003,
 0.0,
 0.0,
 0.0,
 0.70299999999999996,
 0.0,
 0.0)

#### Final function

In [163]:
def get_intensity_dict(dataset_path=word_sentiments_intensity_path,
                                      delimiter='\t', class_column='word',
                                      remove_useless_columns=True, threshold=0):
    df = pd.read_csv(dataset_path, delimiter=delimiter)
    df[class_column] = df[class_column].str.lower()
    if remove_useless_columns:
        useful_columns = get_useful_columns(df, class_column=class_column, threshold=threshold)
        df = df[useful_columns].copy()
    df_dict = df.set_index(class_column).T.apply(tuple).to_dict()
    return df_dict

word_sentiment_intensity_dict = get_intensity_dict()
word_sentiment_intensity_dict['dumbbitch']

Useful columns are: ['anger', 'joy', 'fear', 'sadness', 'word']


(0.84099999999999997, 0.0, 0.0, 0.0)

In [75]:
words_dict = get_intensity_dict(dataset_path=words_path)

Useful columns are: ['anticipation', 'anger', 'joy', 'surprise', 'fear', 'disgust', 'sadness', 'trust', 'word']


In [81]:
words = list(word_dict.keys())
words.extend(word_sentiment_intensity_dict.keys())
print(len(words), len(word_dict.keys()), len(word_sentiment_intensity_dict.keys()), len(set(words)))

21054 16862 4192 18932


## Positive Negatif words
#### Get positive and negative word list:

In [54]:
def get_file_content(filepath):
    with open(filepath, 'r') as f:
        content = f.readlines()
    return content

positive_word_list = get_file_content(positive_words_path)
negative_word_list = get_file_content(negative_words_path)
print(len(positive_word_list), len(negative_word_list))

2006 4783


## Emojis
### 1 - Emoji polarity
#### Emoji Valence (EV) 
- https://github.com/words/emoji-emotion/blob/master/index.json
Values between -4 and 4

In [61]:
def get_emoji_polarity_dict(emoji_data_path=emoji_polarity_path):
    """Returns a dict with the emoji unicode as key and the emoji polarity as value"""
    with open(emoji_polarity_path) as json_data:
        emoji_polarities = json.load(json_data)
        
    emoji_polarity_dict = {}
    for emoji_val in emoji_polarities:
        emoji_polarity_dict[emoji_val["emoji"]] = emoji_val["polarity"]
    return emoji_polarity_dict

In [63]:
emoji_polarity_dict = get_emoji_polarity_dict()
emoji_polarity_dict[emoji]
set(emoji_polarity_dict.values())

{-4, -3, -2, -1, 0, 1, 2, 3, 4}

### 2 - Emoji sentiments

In [149]:
def get_emoji_sentiment_df(sentiment_path=emoji_sentiments_path):
    emoji_sentiments_df = pd.read_csv(sentiment_path, delimiter=',')
    # Convert scores to percentages between 0 and 1
    target_columns = ['negative', 'neutral', 'positive']
    for column in target_columns:
        emoji_sentiments_df[column] /= emoji_sentiments_df['occurrences']
    # Filter columns (remove useless ones)
    target_columns.append('emoji')
    emoji_sentiments_df = emoji_sentiments_df.filter(target_columns, axis=1)
    return emoji_sentiments_df
emoji_sentiments_df = get_emoji_sentiment_df()
emoji_sentiments_df.head()

Unnamed: 0,negative,neutral,positive,emoji
0,0.247162,0.284708,0.46813,😂
1,0.044099,0.165714,0.790186,❤
2,0.035274,0.271837,0.692889,♥
3,0.051738,0.218588,0.729674,😍
4,0.436482,0.220413,0.343105,😭


## Combine the polarity and sentiments in one dict

In [185]:
def get_emoji_dict(polarity_path=emoji_polarity_path, sentiment_path=emoji_sentiments_path):
    polarity_dict = get_emoji_polarity_dict(polarity_path)
    sentiment_df = get_emoji_sentiment_df(sentiment_path)
    # Add new column for popularity with a default value 0
    sentiment_df['polarity'] = 0
    # Fill with the polarity values! (878 emojis with a missing polarity value!)
    for emoji, polarity in emoji_polarity_dict.items():
        sentiment_df.loc[sentiment_df['emoji'] == emoji, 'polarity'] = polarity
    # Convert to dict
    sentiment_dict = sentiment_df.set_index('emoji').T.apply(tuple).to_dict()
    return sentiment_dict

emoji_sentiment_dict = get_emoji_dict(emoji_polarity_path, emoji_sentiments_path)
# (Negative, Neutral, Positive, Polarity)
emoji_sentiments_dict['😂']

(0.24716181096977158, 0.28470797428532346, 0.46813021474490496, 3.0)

### 4 - Get feature for a word

In [208]:
def get_word_features(word, word_dict=word_dict, emoji_dict=word_dict, word_def='hello', emoji_def='😂'):
    """ If the word does not exist, we return a vector filled with 0"""
    word_nb_features = len(word_dict[word_def])
    emoji_nb_features = len(emoji_dict[emoji_def])
    feature_array = np.zeros((word_nb_features + emoji_nb_features))
    if word in word_dict:
        # Fill the first part of the array with word features
        feature_array[:word_nb_features] = word_dict[word]
    elif word in emoji_dict:
        # Fill the second part of the array with emoji features
        feature_array[emoji_nb_features+1:] = emoji_dict[word]
    return feature_array

In [210]:
word_dict = get_intensity_dict(dataset_path=words_path, threshold=3500)
emoji_dict = get_emoji_dict(emoji_polarity_path, emoji_sentiments_path)
print(get_word_features('fun', word_dict, emoji_dict))
print(get_word_features('#fun', word_dict, emoji_dict))
print(get_word_features('😂', word_dict, emoji_dict))

Useful columns are: ['anticipation', 'anger', 'surprise', 'fear', 'disgust', 'word']
[ 0.          0.          0.33503936  0.          0.          0.          0.
  0.          0.        ]
[ 0.52548359  0.          0.00436945  0.          0.          0.          0.
  0.          0.        ]
[ 0.          0.          0.          0.          0.          0.24716181
  0.28470797  0.46813021  3.        ]
