# Exploratory Data Analysis

In [1]:
import numpy as np
import pandas as pd
import json
import sys
from PIL import Image
import re
import os
import random

## Pulling relevant data

In [2]:
with open('data.json', 'r') as f:
    data = json.load(f)

book_links = data['books']
shelf_links = data['shelves']
shelf_values = data['shelf_values']
len(shelf_values)

10000

## Exploring Word Possibilities

Drawing inspiration from Emotion Detection and Sentiment Analysis of Images (https://faculty.cc.gatech.edu/~hays/7476/projects/Aditi_Vasavi.pdf) breaking down emotional groups into the six categories defined by the famous psychologist Ekman: Happiness, Sadness, Fear, Disgust, Anger and Surprise

#### Trying to find synonyms using Natural Language Processing(NLTK) WordNet library

In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/briankim/nltk_data...


True

In [8]:
#Checking synonym for the word "travel"
from nltk.corpus import wordnet#Creating a list 
synonyms = []
for syn in wordnet.synsets("happy"):
    for lm in syn.lemmas():
             synonyms.append(lm.name())#adding into synonyms
print (set(synonyms))

{'well-chosen', 'happy', 'glad', 'felicitous'}


In [10]:
# Experimenting with getting a larger set
synonyms = []
word_set = ["happiness", "happy", "joy", "joyous", "delight", "cheer", "cheerful"]
for word_target in word_set:
    for syn in wordnet.synsets(word_target):
        for lm in syn.lemmas():
                 synonyms.append(lm.name())#adding into synonyms
print (set(synonyms))

{'cheerful', 'transport', 'felicity', 'joy', 'recreate', 'enchant', 'gladden', 'chirk_up', 'joyousness', 'enrapture', 'pep_up', 'jolly_up', 'felicitous', 'delectation', 'revel', 'urge', 'cheer', 'barrack', 'hearten', 'enthrall', 'pleasure', 'ravish', 'rejoice', 'sunniness', 'upbeat', 'enjoy', 'joyfulness', 'please', 'exhort', 'cheerfulness', 'pollyannaish', 'embolden', 'jolly_along', 'urge_on', 'happy', 'sunshine', 'root_on', 'happiness', 'well-chosen', 'cheer_up', 'glad', 'enthral', 'inspire', 'delight', 'joyous'}


In [13]:
sum(
    [
        1 for shelf in shelf_values if not set(shelf).isdisjoint(synonyms)
    ]
)

1966

In [None]:
word_sets = [
    ["happiness", "happy", "joy", "joyous", "delight", "cheer", "cheerful"],
    ["sadness", "sad", "unhappy", "unhappiness"]
]

#### Trying to get synonyms using the Thesauras API

In [14]:
import requests

In [16]:
word = "happiness"
response = requests.get("https://api.api-ninjas.com/v1/thesaurus?word=" + word)

response.text

'{"error": "Missing API Key."}'

#### Trying to get synonyms by scraping Thesaurus.com

In [18]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [19]:
thes_url = "https://www.thesaurus.com/browse/"

In [24]:
def get_synonyms(url):
    url = thes_url + word
    page = urlopen(url)
    html = page.read().decode("utf-8")
    soup = BeautifulSoup(html, "html.parser")

    return [item.text for item in soup.findAll("div", {"data-testid": "word-grid-container"})[0].findAll("a")]

In [106]:
super_words = [
#     ["happiness", "happy", "joy", "joyful", "fun", "cheer", "cheery"],
    ["happiness", "happy", "joy", "joyful", "cheer", "cheery"],
    ["sadness", "sad", "depress", "depressing", "tragic", "unhappy", "unhappiness"],
    ["fear", "afraid", "wary", "scary", "scare", "fearful", "intimidate", "intimidating"],
    ["disgust", "disgusting", "gross", "icky"],
    ["anger", "angry", "rage", "wrath", "wrathful"]
]

In [107]:
syn_sets = []

for super_word_set in super_words:
    print(super_word_set)
    syn_set = set(super_word_set)
    for word in super_word_set:
        synonyms = get_synonyms(thes_url + word)
        tmp_synonyms = []
        
        for i, synonym in enumerate(synonyms):
            sys.stdout.write("\r" + str(i + 1) + "/" + str(len(synonyms)))
            sys.stdout.flush()
            tmp_synonyms.extend(get_synonyms(thes_url + synonym))
        
        
        syn_set.update(synonyms)
        syn_set.update(tmp_synonyms)
#         print(" ", len(tmp_synonyms), len(set(tmp_synonyms)), len(syn_set), len(set(synonyms + tmp_synonyms)))
        print()

    print(len(syn_set))
    syn_sets.append(syn_set)

['happiness', 'happy', 'joy', 'joyful', 'cheer', 'cheery']
47/47
49/49
49/49
36/36
26/26
5/5
164
['sadness', 'sad', 'depress', 'depressing', 'tragic', 'unhappy', 'unhappiness']
43/43
49/49
56/56
21/21
43/43
45/45
9/9
224
['fear', 'afraid', 'wary', 'scary', 'scare', 'fearful', 'intimidate', 'intimidating']
49/49
44/44
37/37
14/14
7/7
49/49
40/40
6/6
231
['disgust', 'disgusting', 'gross', 'icky']
22/22
47/47
14/14
10/10
90
['anger', 'angry', 'rage', 'wrath', 'wrathful']
48/48
53/53
52/52
26/26
17/17
152


In [108]:
len(syn_sets)

5

In [109]:
[len(syn_set) for syn_set in syn_sets]

[164, 224, 231, 90, 152]

In [110]:
[
    sum(
        [
            1 for shelf in shelf_values if not set(shelf).isdisjoint(list(syn_set))
        ]
    ) for syn_set in syn_sets
]

[316, 2392, 852, 58, 38]

In [111]:
sum(
    [
        1 for shelf in shelf_values if any(
            [
                not set(shelf).isdisjoint(list(syn_set)) for syn_set in syn_sets
            ]
        )
    ]
)

3290

In [112]:
# Counting duplicate occurrences
# sum(
#     [
#         1 for shelf in shelf_values if sum(
#             [
#                 1 for syn_set in syn_sets if not set(shelf).isdisjoint(list(syn_set))
#             ]
#         ) > 1
#     ]
# )

multiples = []

for i in range(len(book_links)):
    shelf = shelf_values[i]
    multiple_value = []
    for j, syn_set in enumerate(syn_sets):
        overlap = next((shelf_val for shelf_val in shelf if shelf_val in syn_set), None)
        if overlap is not None:
            multiple_value.append([j, overlap])
    if len(multiple_value) > 0:
        multiples.append(multiple_value)
            
len(multiples)

3290

In [113]:
sum([1 for multiple in multiples if len(multiple) == 2]), sum([1 for multiple in multiples if len(multiple) == 3]), sum([1 for multiple in multiples if len(multiple) == 4])

(350, 8, 0)

In [114]:
combinations = {}

for multiple in multiples:
    string = ""
    for item in multiple:
        string += str(item[0])
    if string in combinations:
        combinations[string] += 1
    else:
        combinations[string] = 1

len(combinations)

16

In [115]:
combinations

{'1': 2062,
 '2': 568,
 '12': 251,
 '0': 257,
 '01': 50,
 '124': 2,
 '3': 23,
 '23': 16,
 '4': 22,
 '14': 9,
 '123': 6,
 '24': 3,
 '13': 12,
 '02': 6,
 '04': 2,
 '03': 1}

In [116]:
y_classes = []
vals = []

for shelf in shelf_values:
    val = next((shelf_val for shelf_val in shelf if any([shelf_val in syn_set for syn_set in syn_sets])), -1)
    vals.append(val)
    if val == -1:
        y_classes.append(val)
    else:
        y_classes.append(syn_sets.index(next((syn_set for syn_set in syn_sets if val in syn_set))))

In [117]:
len(y_classes)

10000

In [118]:
sum([1 for y_class in y_classes if y_class > -1])

3290

In [119]:
sum([1 for val in vals if val == 'fun'])

0

In [120]:
sum([1 for val in vals if val == 'sad'])

1790

In [121]:
data = []

for i, y_class in enumerate(y_classes):
    data.append([str(i) + ".jpg", y_class])

with open('data_classes.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)