In [21]:
import pytumblr
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import re
import nltk.data
import logging
from gensim.models import word2vec
from sklearn.kernel_approximation import RBFSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from time import time, sleep

## Tumblr data scraping

In [2]:
# Authenticate via OAuth
client = pytumblr.TumblrRestClient(
  'IfgCzO5vai28fvtoOZqhirNO4Wy4XWDEBAB0iORxfCCxcpqfky',
  'c7NCLElazxRkIUgZw00Ur5lW6qsL2SR8qmVdQ3jyd29RkoeGB3',
  '4z8Ljx4lc2YJkh5rCf6dEWAGs9A4M0Z7EQFL7jVH4AnDBeCd69',
  'JTHBUNxnSoK5VcB7nHipx7rSYbMGilW27DceHVhu9lI0z0Law1'
)

In [47]:
# 6 basic emotions: happiness, sadness, anger, disgust, fear, surprise
search_query = 'happy'
# Posts available here: https://www.tumblr.com/tagged/happy

# Angry, disgusted -> no more posts

In [48]:
t0 = time()

posts = []
# Load previously scraped data
df = pd.read_csv('./data/' + search_query + '.csv', encoding='utf-8')
# Continue scraping starting from the oldest timestamp
before = df['timestamp'].min()

# 800 requests to avoid exceeding Tumblr API's limitations
for i in range(100):
    tagged = client.tagged(search_query, filter='text', before=before)
    for elt in tagged:
        current_post = []
        current_post.append(elt['id'])
        current_post.append(elt['post_url'])

        elt_type = elt['type']
        current_post.append(elt_type)
        current_post.append(elt['timestamp'])
        current_post.append(elt['date'])
        current_post.append(elt['tags'])
        current_post.append(elt['liked'])
        current_post.append(elt['note_count'])

        if (elt_type == 'photo'):
            # Only take the first image
            current_post.append(elt['photos'][0]['original_size']['url'])
            current_post.append(elt['caption'].replace('\n',' ').replace('\r',' '))
            current_post.append(search_query)
            posts.append(current_post)
        elif (elt_type == 'text'):
            current_post.append(np.nan)
            current_post.append(elt['body'].replace('\n',' ').replace('\r',' '))
            current_post.append(search_query)
            posts.append(current_post)
            
    before = elt['timestamp']
    
print('The scraping took {0}s'.format(time() - t0))

TypeError: string indices must be integers

In [49]:
df_posts = pd.DataFrame(posts, columns=['id', 'post_url', 'type', 'timestamp', 'date',
                                        'tags', 'liked', 'note_count', 'photo', 'text', 'search_query'])

In [38]:
# Concatenate and save the new posts
df_concat = pd.concat([df, df_posts]).reset_index(drop=True)
df_concat.to_csv('./data/' + search_query + '.csv', encoding='utf-8', index=False)

## Word2vec

In [83]:
# Convert a sentence for a list of words
def sentence_to_wordlist(review):
    # Remove HTML
    review_text = BeautifulSoup(review).get_text()
    
    # Remove non-letters
    review_text = re.sub('[^a-zA-Z]', ' ', review_text)
    
    # Convert words to lower case and split them
    words = review_text.lower().split()
    
    return words

# Convert a paragraph to a list of list of words
def string_to_sentences(string, tokenizer):
    sentences = []
    # Check for nan text
    if (type(string) == float) and (np.isnan(string)):
        pass
    else:
        raw_sentences = tokenizer.tokenize(string.strip())
        for elt in raw_sentences:
            if (len(elt) > 0):
                sentences.append(sentence_to_wordlist(elt))
            
    return sentences

# Convert a paragraph to a list of words
def string_to_words_weights(string, tokenizer, vocab, vocab_weights):
    sentences = []
    # Check for nan text
    if (type(string) == float) and (np.isnan(string)):
        pass
    else:
        raw_sentences = tokenizer.tokenize(string.strip())
        for elt in raw_sentences:
            if (len(elt) > 0):
                sentences.extend(sentence_to_wordlist(elt))

    words_weights = []
    for elt in sentences:
        if elt in vocab:
            words_weights.append(vocab_weights[elt])

    return np.array(words_weights)

In [84]:
def create_features(X):
    train = X.copy()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = []

    for text in train['text']:
        sentences += string_to_sentences(text, tokenizer)
        
    # Import the built-in logging module and configure it so that Word2Vec 
    # creates nice output messages
    #logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
     #                   level=logging.INFO)

    # Set values for various parameters
    num_features = 300    # Word vector dimensionality                      
    min_word_count = 40   # Minimum word count                        
    num_workers = 4       # Number of threads to run in parallel
    context = 10          # Context window size                                                                                    
    downsampling = 1e-3   # Downsample setting for frequent words

    # Initialize and train the model (this will take some time)
    print "Training model..."
    model = word2vec.Word2Vec(sentences, workers=num_workers,
                              size=num_features, min_count=min_word_count,
                              window=context, sample=downsampling)

    vocab = set(model.wv.vocab.keys())
    vocab_weights = model.wv

    train['text_list'] = train['text'].map(lambda x: string_to_words_weights(x, tokenizer, vocab, vocab_weights))
    mask = train['text_list'].map(len) > 0
    train_reduced = train.loc[mask, :].reset_index(drop=True)
    return train_reduced

In [85]:
# Use only 2 emotions first: happiness and sadness
df_happy = pd.read_csv('./data/happy.csv', encoding='utf-8')
df_sad = pd.read_csv('./data/sad.csv', encoding='utf-8')
df_all = pd.concat([df_happy, df_sad]).reset_index(drop=True)
train = create_features(df_all)

# Binarise emotions
emotion_dict = dict(zip(['happy', 'sad'], [1, 0]))
train['search_query'] =  train['search_query'].map(emotion_dict)

Training model...


## Random Fourier features

In [86]:
n_components = 100
random_seed = 8
rbf = RBFSampler(gamma=1, n_components=n_components, random_state=random_seed)
for i in range(n_components):
    train['rbf_feature_' + str(i)] = 0
rbf_columns = ['rbf_feature_' + str(i) for i in range(n_components)]
train[rbf_columns] = np.vstack(train['text_list'].map(lambda x: rbf.fit_transform(x).mean(axis=0)))

## Prediction

In [89]:
X_train, X_test, y_train, y_test = train_test_split(train[rbf_columns], train['search_query'], 
                                                    test_size=0.3, random_state=random_seed)
forest = RandomForestClassifier(max_depth=10)
forest.fit(X_train, y_train)
y_train_pred = forest.predict(X_train)
y_test_pred = forest.predict(X_test)
print('Train score: {0}'.format(accuracy_score(y_train_pred, y_train)))
print('Test score: {0}'.format(accuracy_score(y_test_pred, y_test)))

Train score: 0.859248708096
Test score: 0.749884062452


In [None]:
Train score: 0.965681727839
Test score: 0.744782810326

-> Compare with NLTK package