In [None]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from cdptools import CDPInstance, configs
import altair as alt
import numpy as np
import json

In [None]:
def transcript_to_dataframe(transcript):
    """
    Flatten a nested transcript into a DataFrame with one row per sentence.

    `transcript["data"]` is iterated as a sequence of speaker blocks, and each
    block's own "data" entry as a sequence of sentence mappings. Every row holds
    the sentence's own fields plus three positional columns:
    "speaker_block_index" (position of the block), "speaker_sentence_index"
    (position of the sentence inside its block) and "overall_sentence_index"
    (position of the sentence across the whole transcript).
    """
    records = []

    for block_position, block in enumerate(transcript["data"]):
        for sentence_position, sentence in enumerate(block["data"]):
            record = {
                "speaker_block_index": block_position,
                "speaker_sentence_index": sentence_position,
                # One record is appended per sentence, so the current length
                # is this sentence's position in the whole transcript
                "overall_sentence_index": len(records),
            }
            # Sentence fields are applied last, so they win on a key clash
            record.update(sentence)
            records.append(record)

    return pd.DataFrame(records)

In [None]:
# Fetch the VADER lexicon, then instantiate the SentimentIntensityAnalyzer
# that the scoring helpers below share as `sid`
nltk.download("vader_lexicon")
sid = SentimentIntensityAnalyzer()

In [None]:
def get_vader_compound_score(sent):
    """
    Score `sent` with the module-level analyzer `sid` and return only the
    "compound" entry of the resulting polarity scores.
    """
    return sid.polarity_scores(sent)["compound"]

In [None]:
def vader_score_whole_transcript(transcript):
    """
    Score every sentence of a transcript DataFrame with NLTK VADER.

    Takes a DataFrame with one row per sentence and a "text" column holding the
    sentence text. Returns a new DataFrame with the same rows and columns plus a
    "score" column containing the VADER compound score of each row's text. The
    returned frame has a fresh 0..n-1 index; the input frame is not modified.
    """
    # Map over the text column rather than rebuilding each row from iterrows():
    # the existing columns keep their dtypes, and a transcript with no rows
    # still comes back with every column (including "score") instead of a
    # completely column-less frame.
    scores = transcript["text"].map(get_vader_compound_score).astype(float)

    # reset_index keeps the positional index the row-by-row version produced
    return transcript.assign(score=scores).reset_index(drop=True)

In [None]:
# Build a CDPInstance from the Seattle config; later cells use `seattle` to
# fetch the transcript manifest and download transcript files.
# NOTE(review): presumably this is what connects to the Seattle CDP database — confirm
seattle = CDPInstance(configs.SEATTLE)

In [None]:
# Fetch the transcript manifest and keep only the transcripts of interest
manifest = seattle.get_transcript_manifest()
# NOTE(review): exact float comparison; only rows whose confidence is exactly
# 0.97 are kept. Confirm this is the intended way to pick the event(s).
manifest = manifest.loc[manifest.confidence == 0.97]

# Generate one sentiment plot per selected transcript
for _, row in manifest.iterrows():
    save_path = seattle.file_store.download_file(row.filename, overwrite=True)

    # Open and read transcript. JSON is UTF-8 encoded, so do not rely on the
    # platform's default text encoding.
    with open(save_path, "r", encoding="utf-8") as read_in:
        raw_transcript = json.load(read_in)

    # Convert to dataframe
    transcript = transcript_to_dataframe(raw_transcript)

    # A transcript without any sentences produces a frame with no columns, so
    # there is nothing to score, group or plot for it
    if transcript.empty:
        continue

    # Generate scores for each sentence
    transcript = vader_score_whole_transcript(transcript)

    # Summarise each speaker block
    speaker_blocks = transcript.groupby("speaker_block_index")
    speaker_block_averaged_rows = []
    for speaker_block_index, speaker_rows in speaker_blocks:
        speaker_block_averaged_rows.append({
            "speaker_block_index": speaker_block_index,
            # A block starts with its first sentence and ends with its last one
            "start_time": speaker_rows["start_time"].iloc[0],
            "end_time": speaker_rows["end_time"].iloc[-1],
            "average_score": speaker_rows.score.mean(),
            "median_score": speaker_rows.score.median(),
        })

    speaker_block_averaged_rows = pd.DataFrame(speaker_block_averaged_rows)

    # Generate chart: average score per speaker block, saved as <event_id>.png
    alt.Chart(speaker_block_averaged_rows).mark_line(interpolate="basis").encode(
        x="speaker_block_index",
        y="average_score",
    ).save(f"{row.event_id}.png")