In [53]:
import json

import altair as alt
import nltk
import pandas as pd
from cdptools import CDPInstance, configs
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

In [4]:
# Connect to the Seattle CDP instance; `seattle` is the handle used below to
# fetch the transcript manifest and to download transcript files.
seattle = CDPInstance(configs.SEATTLE)

In [7]:
# Download the transcript of one specific event and keep the local path of its JSON file.
EVENT_ID = "bb35a74c-53db-40f7-9af0-c0b296c2696a"

manifest = seattle.get_transcript_manifest()
event_matches = manifest.loc[manifest.event_id == EVENT_ID]
if event_matches.empty:
    # Fail with a readable message instead of the bare IndexError that
    # `.iloc[0]` raises on an empty selection.
    raise ValueError(f"No transcript found in the manifest for event {EVENT_ID!r}")

# Use the first matching manifest row.
found_event = event_matches.iloc[0]
save_path = seattle.file_store.download_file(found_event.filename)

In [9]:
# Load the downloaded transcript JSON. The encoding is given explicitly so the
# result does not depend on the platform's default text encoding.
with open(save_path, "r", encoding="utf-8") as read_in:
    transcript = json.load(read_in)

# Sanity check: print the first speaker block. The file is already closed here,
# and slicing with [:1] prints nothing (rather than raising) if there are no entries.
for first_block in transcript["data"][:1]:
    print(first_block)

{'speaker': '', 'data': [{'start_time': 10.977, 'end_time': 12.679, 'text': 'Well, good afternoon, everyone.'}, {'start_time': 12.679, 'end_time': 15.315, 'text': 'Thank you for being here.'}, {'start_time': 15.315, 'end_time': 22.589, 'text': 'Today is February 3RD, 2020, and the Seattle city council Committee, full committee of the Council will come to order.'}, {'start_time': 22.589, 'end_time': 24.124, 'text': 'It is 2:05 P.M.'}, {'start_time': 24.124, 'end_time': 26.659, 'text': "I'm Teresa Mosqueda, president pro Tem for today."}, {'start_time': 26.659, 'end_time': 27.761, 'text': 'Thank you all for joining us.'}, {'start_time': 27.761, 'end_time': 33.8, 'text': 'We do have a packed House up here and we have a packed House downstairs as well.'}, {'start_time': 33.8, 'end_time': 39.072, 'text': 'As is true in the past, we have allowed as many signs as you can see in the audience.'}, {'start_time': 39.072, 'end_time': 49.682, 'text': "If you have a large sign, we're asking folks to

In [161]:
# Peek at the first three entries of the transcript. As the output shows, an entry
# has a 'speaker' label and a 'data' list of sentences, each with 'start_time',
# 'end_time' and 'text'.
transcript['data'][:3]

[{'speaker': '',
  'data': [{'start_time': 10.977,
    'end_time': 12.679,
    'text': 'Well, good afternoon, everyone.'},
   {'start_time': 12.679,
    'end_time': 15.315,
    'text': 'Thank you for being here.'},
   {'start_time': 15.315,
    'end_time': 22.589,
    'text': 'Today is February 3RD, 2020, and the Seattle city council Committee, full committee of the Council will come to order.'},
   {'start_time': 22.589, 'end_time': 24.124, 'text': 'It is 2:05 P.M.'},
   {'start_time': 24.124,
    'end_time': 26.659,
    'text': "I'm Teresa Mosqueda, president pro Tem for today."},
   {'start_time': 26.659,
    'end_time': 27.761,
    'text': 'Thank you all for joining us.'},
   {'start_time': 27.761,
    'end_time': 33.8,
    'text': 'We do have a packed House up here and we have a packed House downstairs as well.'},
   {'start_time': 33.8,
    'end_time': 39.072,
    'text': 'As is true in the past, we have allowed as many signs as you can see in the audience.'},
   {'start_time': 3

In [32]:
def get_text_from_transcript(transcript: "dict | list") -> list:
    """Collect the sentence text of a transcript, grouped by speaker block.

    transcript: either the full transcript dict as loaded from the JSON file
        (its "data" entry holds the speaker blocks) or that list of speaker
        blocks itself. Each speaker block is a dict whose "data" entry is a
        list of sentence dicts carrying a "text" key.

    Returns a list of lists: one sub-list per speaker block, holding that
    block's sentence strings in their original order.
    """
    # Iterating a dict yields its keys, so a full transcript dict has to be
    # unwrapped to its list of speaker blocks before looping.
    speaker_blocks = transcript["data"] if isinstance(transcript, dict) else transcript
    return [
        [sentence["text"] for sentence in block["data"]]
        for block in speaker_blocks
    ]

In [34]:
# Extract the sentence text, one sub-list per speaker block of the transcript.
text_list = get_text_from_transcript(transcript['data'])

In [109]:
def get_textBlob_score(sent):
    """Score a sentence with TextBlob.

    Returns a (sentence, polarity) tuple; the polarity score is between -1 and 1.
    """
    blob_sentiment = TextBlob(sent).sentiment
    return sent, blob_sentiment.polarity

def get_vader_score(sent, analyzer=None):
    """Return every VADER polarity score for a sentence.

    sent: the text to score.
    analyzer: optional object exposing `polarity_scores(text)` that returns a
        dict of scores. When omitted, the notebook-level `sid` analyzer is
        used (looked up at call time).

    Returns a list of [name, score] pairs, sorted by score name.
    """
    # Accepting the analyzer explicitly lets the function run without relying on
    # a `sid` global having been created by another cell.
    scorer = sid if analyzer is None else analyzer
    scores = scorer.polarity_scores(sent)
    return [[name, scores[name]] for name in sorted(scores)]

def get_vader_compound_score(sent, analyzer=None):
    """Return only the compound VADER sentiment score of a sentence.

    sent: the text to score.
    analyzer: optional object exposing `polarity_scores(text)` that returns a
        dict containing a 'compound' entry. When omitted, the notebook-level
        `sid` analyzer is used (looked up at call time).
    """
    # Accepting the analyzer explicitly lets the function run without relying on
    # a `sid` global having been created by another cell.
    scorer = sid if analyzer is None else analyzer
    return scorer.polarity_scores(sent)['compound']

In [48]:
# Try the TextBlob scorer on the first sentence of the fourth speaker block.
get_textBlob_score(text_list[3][0])

('The first three names are Botma, Rahm, and Chempad.', 0.25)

In [68]:
# Fetch the VADER lexicon, then instantiate the SentimentIntensityAnalyzer
# object that the VADER helper functions use as `sid`.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tree/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [85]:
# Show all VADER scores for the first sentence of the eighth speaker block.
get_vader_score(text_list[7][0])

[['compound', 0.6486], ['neg', 0.0], ['neu', 0.361], ['pos', 0.639]]

In [106]:
# Same sentence as above, but only the compound score.
get_vader_compound_score(text_list[7][0])

0.6486

In [75]:
# Keep only the first three speaker blocks as a small subset.
# NOTE(review): not used by any cell visible here - presumably for quick experiments; confirm.
transcript_sub = text_list[:3]

In [111]:
def vader_score_whole_transcript(transcript: list) -> list:
    """
    Score every sentence of a transcript with the NLTK VADER compound score.

    `transcript` is a list of speaker sub-lists, each holding sentence strings.
    Returns a flat list of [compound_score, sentence] pairs in transcript order.
    """
    return [
        [get_vader_compound_score(sentence), sentence]
        for speaker_sentences in transcript
        for sentence in speaker_sentences
    ]

In [143]:
# Score every sentence of the full transcript; each item is a [score, text] pair.
scores = vader_score_whole_transcript(text_list)

In [151]:
# Tabulate the [score, text] pairs: one row per sentence.
source = pd.DataFrame(scores, columns =['Score', 'Text'])

In [152]:
# Number the sentences by row index; the charts below use this column on the x-axis.
source['Sentence_No'] = source.index

In [153]:
# Selection that follows the mouse and picks the nearest sentence along the x-axis.
# It is projected onto 'Sentence_No', the column the charts encode on x; the data
# has no 'x' column, so projecting onto 'x' cannot single out a sentence.
nearest = alt.selection(type='single', nearest=True, on='mouseover',
                        fields=['Sentence_No'], empty='none')

In [156]:
# Smoothed line of the sentence score over the course of the transcript.
line = (
    alt.Chart(source)
    .mark_line(interpolate='basis')
    .encode(x='Sentence_No:Q', y='Score:Q')
)

In [157]:
# Fully transparent points along the x-axis; this layer is the one that carries
# the `nearest` selection.
selectors = (
    alt.Chart(source)
    .mark_point()
    .encode(x='Sentence_No:Q', opacity=alt.value(0))
    .add_selection(nearest)
)

In [159]:
# Points on the line: opaque for the selected sentence, transparent otherwise.
points = (
    line
    .mark_point()
    .encode(opacity=alt.condition(nearest, alt.value(1), alt.value(0)))
)

# Text labels next to the points: the score for the selected sentence, blank otherwise.
text = (
    line
    .mark_text(align='left', dx=5, dy=-5)
    .encode(text=alt.condition(nearest, 'Score:Q', alt.value(' ')))
)

# Gray vertical rule at the position of the selection.
rules = (
    alt.Chart(source)
    .mark_rule(color='gray')
    .encode(x='Sentence_No:Q')
    .transform_filter(nearest)
)

# Stack the five layers into one chart; as the last expression of the cell it is
# what gets displayed.
alt.layer(line, selectors, points, rules, text).properties(width=600, height=300)

In [120]:
import altair as alt
import numpy as np
from vega_datasets import data