# The Big Bang Theory Exploratory Data Analysis

### Imports:

In [64]:
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time

## Loading sample data file:

In [65]:
with open("data/DT_2/Raw/S1/The Big Bang_S0101.json", 'r') as file:
    s1e1Data = json.load(file)

In [66]:
s1e1Data['Dialog 1']

{'Scene': 'A corridor at a sperm bank.',
 'Participant': ['Sheldon', 'Leonard'],
 'GT': 1,
 'AV_ID': 'The Big Bang_S0101',
 'Humor Start Time': '00:00:18',
 'Humor End Time': '00:00:20',
 'Dialog Turns 0': {'Recipients': ['Sheldon'],
  'Speaker': 'Leonard',
  'Dialog': "Agreed. What's your point?",
  'Dialog Start time': '00:00:12:18',
  'Dialog End time': '00:00:14:18'},
 'Dialog Turns 1': {'Recipients': ['Leonard'],
  'Speaker': 'Sheldon',
  'Dialog': "There's no point, I just think it's a good idea for a T-shirt.",
  'Dialog Start time': '00:00:14:23',
  'Dialog End time': '00:00:18:03'}}

In [69]:
# two different time formats in data, this cleans them into datetime objects
def string2datetime(string):
    try:
        return datetime.strptime(string.strip(), '%H:%M:%S:%f')

    except ValueError:
        try:
            return datetime.strptime(string.strip(), '%H:%M:%S,%f')
        
        except ValueError:
            return datetime.strptime(string.strip(), '%H:%M:%S')


In [124]:
# two different time formats in data, this cleans them into datetime objects
def datetime2String(date):
    # try:
    return datetime.strftime(date, '%H:%M:%S:%f')
    # 
    # except ValueError:
    #     try:
    #         return datetime.strptime(string.strip(), '%H:%M:%S,%f')
    #     
    #     except ValueError:
    #         return datetime.strptime(string.strip(), '%H:%M:%S')


In [125]:
def findHumor(file_path1, file_path2):
    humor_dict = {}
    for file_path in [file_path1, file_path2]:
        # print(file_path.upper())
        with open(file_path, 'r') as file:
            data = json.load(file)
            for dialog_num, info in data.items():
                if info['GT'] == 1:
                    # print("humor")
                    humor_start, humor_end = string2datetime(info["Humor Start Time"]), string2datetime(info["Humor End Time"])
                    # if humor_start in humor_dict:
                    #     print("already in")
                    #     if humor_dict[humor_start] != humor_end:
                    #         print("CHANGING END TIMEEEE")
                    # else:
                    #     print("not already in")
                    humor_dict[humor_start] = humor_end
    return dict(sorted(humor_dict.items(), key=lambda x: x[0]))
    

In [126]:
def reorganize_data(file):
    lines_dict = {}
    for dialog_num, info in file.items():            
        dialog_turns = [key for key in info if key.startswith('Dialog Turn')]
        for turn in dialog_turns:
            start_time = string2datetime(info[turn]["Dialog Start time"])
            if start_time in lines_dict: 
                pass
            else:
                lines_dict[start_time] = {
                    "Scene": info["Scene"],
                    "Recipients": info[turn]["Recipients"],
                    "Speaker": info[turn]["Speaker"],
                    "Dialogue": info[turn]["Dialog"],
                    "Dialogue Start Time": string2datetime(info[turn]["Dialog Start time"]),
                    "Dialogue End Time": string2datetime(info[turn]["Dialog End time"])
                }

    return dict(sorted(lines_dict.items(), key=lambda x: x[0]))

            
        
        

In [127]:
path1 = "data/DT_2/Raw/S1/The Big Bang_S0101.json"
path2 = "data/DT_3/Raw/S1/The Big Bang_S0101.json"

humor_dict = findHumor(path1, path2)

In [128]:
lines_dict = reorganize_data(s1e1Data)

In [129]:
# TODO: set buffer
buffer = timedelta(seconds=1)

nHumorMissingLine = 0
idxsHumorMissingLine = []

humor_times = list(humor_dict.keys())
dialog_times = list(lines_dict.keys())
humorN = 0
dialogN = 0
while humorN < len(humor_dict):
    found_humor = False
    while dialogN < len(dialog_times) - 1 and not found_humor:
        laugh_start = humor_times[humorN]
        current_dialog_start = dialog_times[dialogN]
        current_dialog_end = lines_dict[current_dialog_start]['Dialogue End Time']
        next_dialog_start = dialog_times[dialogN + 1]

        if current_dialog_start <= laugh_start < next_dialog_start and laugh_start <= (current_dialog_end + buffer):
            lines_dict[current_dialog_start]['isHumor'] = True
            lines_dict[current_dialog_start]['humorDuration'] = humor_dict[laugh_start] - laugh_start
            # print("Humor line: ", lines_dict[current_dialog_start]['Dialogue'])
            # print("Humor duration: ", humor_dict[laugh_start] - laugh_start)
            # print("Speaker: ", lines_dict[current_dialog_start]['Speaker'])
            # # print("\n")
            # print(f"Dialogue start: {lines_dict[current_dialog_start]['Dialogue Start Time']}")
            # print(f"Dialogue end: {lines_dict[current_dialog_start]['Dialogue End Time']}")
            # print(f"Humor start: {laugh_start}")
            # print("Humor end: ", humor_dict[laugh_start])
            # print("\n\n")
            found_humor = True
        dialogN += 1

    if not found_humor:
        dialogN = 0
        nHumorMissingLine += 1
        idxsHumorMissingLine.append(humorN)

    humorN += 1


In [130]:
# startTime and endTime must be in format String: "HH:MM:SS"
def getLinesBetween(startTime, endTime, lines_dict):
    
    return {
        lineStartTime: data
        for lineStartTime, data in lines_dict.items()
        if  string2datetime(startTime) <= lineStartTime <= string2datetime(endTime)}

## Instances where humor wasn't matched with a line 

In [131]:
for idx in idxsHumorMissingLine:
    print(humor_times[idx])
print(f"length: {len(idxsHumorMissingLine)}")

1900-01-01 00:00:28
1900-01-01 00:00:57
1900-01-01 00:02:25
1900-01-01 00:03:27
1900-01-01 00:07:02
1900-01-01 00:07:16
1900-01-01 00:08:20
1900-01-01 00:11:05
1900-01-01 00:12:06
1900-01-01 00:15:21
1900-01-01 00:15:44
1900-01-01 00:17:14
1900-01-01 00:18:13
1900-01-01 00:18:32
1900-01-01 00:21:21
1900-01-01 00:22:19
length: 16


In [138]:
def getLinesAroundHumor(humorStartTime, threshold, lines_dict):
    startTime = humorStartTime - threshold
    endTime = humorStartTime + threshold
    return getLinesBetween(startTime.strftime('%H:%M:%S:%f'), endTime.strftime('%H:%M:%S:%f'), lines_dict)
     

In [149]:
idx1 = idxsHumorMissingLine[0]
humorTime1 = humor_times[idx]

surrounding_lines = getLinesAroundHumor(humorTime1, timedelta(seconds=6), lines_dict)

surrounding_lines

{datetime.datetime(1900, 1, 1, 0, 22, 13, 60000): {'Scene': 'All five in Leonard’s car.',
  'Recipients': ['Howard', 'Raj', 'Penny', 'Leonard'],
  'Speaker': 'Sheldon',
  'Dialogue': "I don't know what your odds are in the world as a whole, but as far as the population of this car goes, you're a veritable mack daddy.",
  'Dialogue Start Time': datetime.datetime(1900, 1, 1, 0, 22, 13, 60000),
  'Dialogue End Time': datetime.datetime(1900, 1, 1, 0, 22, 19, 100000)}}

{}