# Avatar: The Last Airbender
## *Sentiment Analysis of Characters, plot over time*

## Part Three: Process Sentiment Data to Create Data for Charts

We'll need the `json` module and a few helper functions for opening and saving JSON files.

In [142]:
import json

def open_json_as_dict(path):
    """Load the JSON document stored at ``path`` and return the parsed value.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's locale default, so non-ASCII text in the data is read the
    same way on every machine.

    Args:
        path: Location of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's top-level value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)

def save_json(pyDict, path):
    """Serialize ``pyDict`` as JSON and write it to ``path``.

    Any existing file at ``path`` is overwritten. The file is opened as
    UTF-8 explicitly so the output does not depend on the platform's
    locale default and matches how ``open_json_as_dict`` reads it back.

    Args:
        pyDict: JSON-serializable object to write.
        path: Destination file path.

    Raises:
        OSError: If the file cannot be opened for writing.
        TypeError: If ``pyDict`` contains a value ``json`` cannot serialize.
    """
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(pyDict, f)

Make a function to get emotion totals from a list of NRC emotions.

In [143]:
def get_emotion_totals(emotions):
    """Count how often each NRC emotion label occurs in ``emotions``.

    Entries equal to ``'none'`` are skipped. Every other entry must be one
    of the ten labels listed below; an unknown label raises ``KeyError``.

    Args:
        emotions: Iterable of emotion labels.

    Returns:
        A dict with two parallel lists: ``"emotions"`` (the ten labels in
        the fixed order below) and ``"totals"`` (the count for each label).
    """
    labels = (
        "anger",
        "anticipation",
        "disgust",
        "fear",
        "joy",
        "negative",
        "positive",
        "sadness",
        "surprise",
        "trust",
    )
    # Start every label at zero so labels that never occur still show up.
    tally = dict.fromkeys(labels, 0)

    for label in emotions:
        if label == 'none':
            continue
        tally[label] += 1

    return {"emotions": list(tally), "totals": list(tally.values())}

Now we need a function to generate the per-episode plots.
Each episode gets three plots (sets of data):
* 1. [lines] x [vader compound scores]
* 2. [lines] x [emotions]
* 3. [emotions] x [num appearances]

In [144]:
def generate_plots(nrcEp, vaderEp):
    """Build the three whole-episode plot data sets.

    Args:
        nrcEp: Dict with a ``'title'`` string and a ``'lines'`` sequence;
            element ``[1]`` of each line is its NRC emotion label.
        vaderEp: Dict with a ``'vaderLines'`` sequence aligned with
            ``nrcEp['lines']``; element ``[1]`` of each entry is a dict
            holding a ``'compound'`` score.

    Returns:
        A list of three ``{"name", "x", "y"}`` dicts: compound score per
        line, emotion per line, and total count per emotion.
    """
    title = nrcEp['title']
    line_numbers = list(range(len(nrcEp['lines'])))

    # Index the Vader data by line number so a short list fails loudly.
    compound_scores = [vaderEp['vaderLines'][n][1]['compound'] for n in line_numbers]
    line_emotions = [nrcEp['lines'][n][1] for n in line_numbers]
    totals = get_emotion_totals(line_emotions)

    return [
        {"name": title + ' Vader Compound Scores By Line', "x": line_numbers, "y": compound_scores},
        {"name": title + ' NRC Emotions By Line', "x": line_numbers, "y": line_emotions},
        {"name": title + ' NRC Emotion Totals', "x": totals['emotions'], "y": totals['totals']},
    ]

We'll need a slightly different function to get charts for every character who speaks at least 5 lines.
We will still generate the same three charts for each of them.

In [145]:
def generate_character_plots(nrcEp, vaderEp, minLines=5):
    """Build the three plot data sets for each sufficiently talkative character.

    Lines are grouped by speaker (element ``[0]`` of each NRC line). Every
    character with at least ``minLines`` lines gets the same three plots as
    the whole-episode view: compound score per line, emotion per line, and
    total count per emotion.

    Args:
        nrcEp: Dict with a ``'lines'`` sequence of entries indexable as
            ``[0]`` (speaker) and ``[1]`` (NRC emotion label).
        vaderEp: Dict with a ``'vaderLines'`` sequence aligned with
            ``nrcEp['lines']``; element ``[1]`` of each entry is a dict
            holding a ``'compound'`` score.
        minLines: Minimum number of lines a character needs to be included.

    Returns:
        A flat list of ``{"name", "x", "y"}`` dicts, three per included
        character, in order of each character's first appearance.
    """
    nrcLines = nrcEp['lines']
    vaderLines = vaderEp['vaderLines']

    # Collect line numbers, emotions and compound scores per speaker.
    characters = {}
    for i, line in enumerate(nrcLines):
        data = characters.setdefault(line[0], {"lines": [], "nrc": [], "vader": []})
        data['lines'].append(i)
        data['nrc'].append(line[1])
        data['vader'].append(vaderLines[i][1]['compound'])

    plots = []
    for char, data in characters.items():
        # Skip only this character. Stopping the loop here would also drop
        # every character that first speaks after a minor one.
        if len(data['lines']) < minLines:
            continue
        e = get_emotion_totals(data['nrc'])

        plots.append({"name": char + ' Vader Compound Scores By Line', "x": data['lines'], "y": data['vader']})
        plots.append({"name": char + ' NRC Emotions By Line', "x": data['lines'], "y": data['nrc']})
        plots.append({"name": char + ' NRC Emotion Totals', "x": e['emotions'], "y": e['totals']})

    return plots

Create a wrapper function to call the two plot generators and combine the results.

In [146]:
def get_plots_from_sentiment_data(nrcEp, vaderEp):
    """Return the overall plots followed by the per-character plots.

    Args:
        nrcEp: NRC emotion data, passed unchanged to both plot generators.
        vaderEp: Vader score data, passed unchanged to both plot generators.

    Returns:
        One list: the output of ``generate_plots`` followed by the output
        of ``generate_character_plots``.
    """
    overall = generate_plots(nrcEp, vaderEp)
    per_character = generate_character_plots(nrcEp, vaderEp)
    return overall + per_character

Now, we need a function to get the sentiment data for a book/season summary.

In [147]:
def get_end_of_book_page(index, title, nrc, vader, bookBoundaries=(22, 42)):
    """Build the summary page for one book (season) from its combined lines.

    The episode range is chosen from ``index``:

    * If ``index`` equals one of ``bookBoundaries``, the book that ends
      just before it is summarised: episodes from the previous boundary
      (or 0 for the first book) up to, but not including, ``index``.
    * Any other ``index`` selects the last book: episodes from the final
      boundary through the end of ``nrc``.

    Args:
        index: Episode index used to pick the book, as described above.
        title: Book title; used as the prefix of the plot names and as the
            page's ``"bookTitle"``.
        nrc: Sequence of per-episode dicts, each with a ``'lines'`` list.
        vader: Sequence of per-episode dicts, each with a ``'vaderLines'``
            list, aligned with ``nrc``.
        bookBoundaries: Ascending episode indices at which a new book
            starts. The default reproduces the previously hard-coded
            values.

    Returns:
        ``{"title": 'end-book', "bookTitle": title, "plots": [...]}``.
    """
    bounds = tuple(bookBoundaries)
    if index in bounds:
        position = bounds.index(index)
        start = bounds[position - 1] if position > 0 else 0
        end = index
    else:
        start = bounds[-1] if bounds else 0
        end = len(nrc)

    # combine all the lines for the book
    nrcL = []
    vaderL = []
    for i in range(start, end):
        nrcL.extend(nrc[i]['lines'])
        vaderL.extend(vader[i]['vaderLines'])

    plots = get_plots_from_sentiment_data({ "title": title, "lines": nrcL }, { "title": title, "vaderLines": vaderL })

    return {"title": 'end-book', "bookTitle": title, "plots": plots}

And a function to get the sentiment data for the series in totality. 

In [148]:
def get_end_of_series_page(nrc, vader):
    """Build the summary page covering every episode of the series.

    Args:
        nrc: Sequence of per-episode dicts, each with a ``'lines'`` list.
        vader: Sequence of per-episode dicts, each with a ``'vaderLines'``
            list, aligned with ``nrc``.

    Returns:
        ``{"title": 'end-series', "bookTitle": '', "plots": [...]}`` where
        the plots are generated from all lines concatenated in episode
        order.
    """
    page_title = 'end-series'
    episode_ids = range(len(nrc))

    # Flatten the per-episode lists into one series-long list each.
    all_nrc_lines = [entry for ep in episode_ids for entry in nrc[ep]['lines']]
    all_vader_lines = [entry for ep in episode_ids for entry in vader[ep]['vaderLines']]

    plots = get_plots_from_sentiment_data(
        {"title": page_title, "lines": all_nrc_lines},
        {"title": page_title, "vaderLines": all_vader_lines},
    )

    return {"title": page_title, "bookTitle": '', "plots": plots}

Define the final function needed: the one that calls them all. It loops through all the episodes and
generates chart data for each.

In [149]:
def generate_chart_data(nrcData, vaderData):
    """Generate the chart pages for every episode, each book and the series.

    Pages are emitted in viewing order: one page per episode, an
    ``'end-book'`` summary page after the last episode of each book, and a
    final ``'end-series'`` page.

    Args:
        nrcData: List of per-episode NRC dicts (``'title'`` and ``'lines'``).
        vaderData: List of per-episode Vader dicts (``'vaderLines'``),
            aligned with ``nrcData``.

    Returns:
        A list of page dicts with keys ``"title"``, ``"bookTitle"`` and
        ``"plots"``. Empty input yields an empty list.
    """
    bookTitles = ['Book One: Water', 'Book Two: Earth', 'Book Three: Fire']
    # Title of the first episode of Book Two and of Book Three, in order.
    bookStarts = ['The Avatar State', 'Escape from the Spirit World']
    pages = []
    totalEps = len(nrcData)

    # Nothing to chart; also avoids reading the loop index after an empty loop.
    if totalEps == 0:
        return pages

    book = 0
    for i in range(totalEps):
        title = nrcData[i]['title']
        # if we are starting a new Book
        if book < len(bookStarts) and title == bookStarts[book]:
            # Summarise the book that just ended under its own title,
            # then move on to the next one.
            pages.append(get_end_of_book_page(i, bookTitles[book], nrcData, vaderData))
            book += 1
        plots = get_plots_from_sentiment_data(nrcData[i], vaderData[i])
        pages.append({ "title": title, "bookTitle": bookTitles[book], "plots": plots })

    # After the last episode: close the final book and the series.
    # The helper needs the full episode lists (not one episode) to collect
    # the book's lines; it picks the episode range from the index.
    pages.append(get_end_of_book_page(totalEps - 1, bookTitles[book], nrcData, vaderData))
    pages.append(get_end_of_series_page(nrcData, vaderData))

    return pages

Okay, now it's time to do the work.

Generate the pages and save chart data as a json file.

In [None]:
# Load the per-episode NRC emotion and Vader score results. Each file's
# top-level object is indexed by its 'episodes' key.
# NOTE(review): presumably both files list the same episodes in the same
# order, since the generators pair them up by index -- confirm.
nrcEmotionData = open_json_as_dict('data/avatar-episodes-nrc-emotion.json')['episodes']
vaderScoreData = open_json_as_dict('data/avatar-episodes-vader-scores.json')['episodes']

# Generate the page data and wrap it under a single 'pages' key.
pages = { "pages": generate_chart_data(nrcEmotionData, vaderScoreData) }

# Save the chart data as a JSON file.
save_json(pages, 'data/avatar-chart-data.json')