# The Big Bang Theory Exploratory Data Analysis

### Imports:

In [1]:
import json
import pandas as pd
import numpy as np
from datetime import datetime, timedelta, time

## Loading sample data file:

In [2]:
# Parse the sample JSON data file once and keep it in memory as `s1e1Data`
# for the exploration cells below.
with open("data/DT_2/Raw/S1/The Big Bang_S0101.json", 'r') as file:
    s1e1Data = json.load(file)

In [4]:
# The data mixes several timestamp layouts; normalise all of them to datetime.
def string2datetime(string):
    """Parse a timestamp string into a ``datetime`` object.

    Surrounding whitespace is ignored. Layouts are attempted in this order:
    ``HH:MM:SS:<fraction>``, ``HH:MM:SS,<fraction>``, then plain ``HH:MM:SS``.

    Raises:
        ValueError: if the text matches none of the layouts (the error comes
            from the final, plain ``HH:MM:SS`` attempt).
    """
    fractional_layouts = ('%H:%M:%S:%f', '%H:%M:%S,%f')
    for layout in fractional_layouts:
        try:
            return datetime.strptime(string.strip(), layout)
        except ValueError:
            # Not this layout; fall through to the next candidate.
            continue
    return datetime.strptime(string.strip(), '%H:%M:%S')


In [5]:
# Counterpart of string2datetime: render a datetime as a clock-time string.
def datetime2String(date):
    """Format ``date`` as ``HH:MM:SS:ffffff`` (six-digit microseconds)."""
    layout = '%H:%M:%S:%f'
    return datetime.strftime(date, layout)


In [6]:
def findHumor(file_path1, file_path2, max_duration=timedelta(seconds=10)):
    """Collect the humor intervals flagged in two JSON files.

    Every entry whose ``GT`` field equals 1 contributes one interval, taken
    from its ``Humor Start Time`` / ``Humor End Time`` strings. Intervals with
    a negative duration, or lasting ``max_duration`` or longer, are dropped.
    When both files contain an interval with the same start, the one from
    ``file_path2`` wins because it is processed last.

    Args:
        file_path1: path of the first JSON file.
        file_path2: path of the second JSON file.
        max_duration: exclusive upper bound on an interval's length
            (defaults to 10 seconds).

    Returns:
        dict mapping interval start (``datetime``) to interval end
        (``datetime``), ordered by ascending start.
    """
    humor_dict = {}
    for file_path in (file_path1, file_path2):
        # JSON text is UTF-8; be explicit instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # The file is fully parsed at this point, so it no longer needs to be open.
        for info in data.values():
            if info['GT'] != 1:
                continue
            humor_start = string2datetime(info["Humor Start Time"])
            humor_end = string2datetime(info["Humor End Time"])
            if timedelta(seconds=0) <= humor_end - humor_start < max_duration:
                humor_dict[humor_start] = humor_end

    # Keys are unique, so sorting the (start, end) pairs orders them by start.
    return dict(sorted(humor_dict.items()))
    

In [7]:
def reorganize_data(file):
    """Flatten the parsed JSON into one record per dialog line, keyed by start time.

    For every entry of ``file``, each key beginning with ``'Dialog Turn'`` is
    treated as one line. The first line seen for a given start time is kept;
    later lines with the same start time are ignored.

    Returns:
        dict mapping a line's start ``datetime`` to a record with the keys
        ``Scene``, ``Recipients``, ``Speaker``, ``Dialogue``,
        ``Dialogue Start Time`` and ``Dialogue End Time``, ordered by
        ascending start time.
    """
    collected = {}
    for info in file.values():
        turn_keys = [name for name in info if name.startswith('Dialog Turn')]
        for turn_key in turn_keys:
            turn = info[turn_key]
            begins_at = string2datetime(turn["Dialog Start time"])
            if begins_at in collected:
                # Keep the first line recorded for this start time.
                continue
            collected[begins_at] = {
                "Scene": info["Scene"],
                "Recipients": turn["Recipients"],
                "Speaker": turn["Speaker"],
                "Dialogue": turn["Dialog"],
                "Dialogue Start Time": begins_at,
                "Dialogue End Time": string2datetime(turn["Dialog End time"]),
            }

    return {moment: collected[moment] for moment in sorted(collected)}

            
        
        

In [8]:
# The same file name under two different data directories (DT_2 and DT_3).
path1 = "data/DT_2/Raw/S1/The Big Bang_S0101.json"
path2 = "data/DT_3/Raw/S1/The Big Bang_S0101.json"

# Merge the humor intervals flagged in both files into one start -> end mapping.
humor_dict = findHumor(path1, path2)

In [9]:
# Index the dialog lines of the sample file by their start time.
lines_dict = reorganize_data(s1e1Data)

In [10]:
# Attach each laugh interval in humor_dict to the dialog line it follows.
# A laugh is attributed to a line when it starts at or after that line's start,
# before the next line's start, and no later than `buffer` after the line's end.
# Matched lines gain 'isHumor' and 'humorDuration' entries in lines_dict;
# laughs without a match are counted and their indices recorded.

# TODO: set buffer
buffer = timedelta(seconds=1)

# Number of unmatched laughs and their indices into humor_times.
nHumorMissingLine = 0
idxsHumorMissingLine = []

# humor_dict and lines_dict are both returned sorted by key, so these two
# lists are in chronological order.
humor_times = list(humor_dict.keys())
dialog_times = list(lines_dict.keys())
humorN = 0
dialogN = 0
# dialogN is deliberately kept between laughs: after a match the scan for the
# next laugh resumes from the line following the matched one.
while humorN < len(humor_dict):
    found_humor = False
    # Stops one short of the end because every step needs the following line's
    # start time; the last dialog line is therefore never tested.
    # NOTE(review): a laugh that follows the final line can never be matched — confirm this is intended.
    while dialogN < len(dialog_times) - 1 and not found_humor:
        laugh_start = humor_times[humorN]
        laugh_end = humor_dict[laugh_start]
        duration = laugh_end - laugh_start

        current_dialog_start = dialog_times[dialogN]
        current_dialog_end = lines_dict[current_dialog_start]['Dialogue End Time']
        next_dialog_start = dialog_times[dialogN + 1]

        if current_dialog_start <= laugh_start < next_dialog_start and laugh_start <= (current_dialog_end + buffer):
            lines_dict[current_dialog_start]['isHumor'] = True
            lines_dict[current_dialog_start]['humorDuration'] = duration

            # print("Humor line: ", lines_dict[current_dialog_start]['Dialogue'])
            # print("Humor duration: ", humor_dict[laugh_start] - laugh_start)
            # print("Speaker: ", lines_dict[current_dialog_start]['Speaker'])
            # # print("\n")
            # print(f"Dialogue start: {lines_dict[current_dialog_start]['Dialogue Start Time']}")
            # print(f"Dialogue end: {lines_dict[current_dialog_start]['Dialogue End Time']}")
            # print(f"Humor start: {laugh_start}")
            # print("Humor end: ", humor_dict[laugh_start])
            # print("\n\n")
            found_humor = True
        # Runs after a match as well, so the pointer moves past the matched line.
        # NOTE(review): a second laugh falling in the same line's window will
        # therefore be recorded as unmatched — confirm this is intended.
        dialogN += 1

    if not found_humor:
        # The scan ran to the end without a match; rewind so the next laugh is
        # searched from the first line again.
        dialogN = 0
        nHumorMissingLine += 1
        idxsHumorMissingLine.append(humorN)

    humorN += 1

In [11]:
# startTime and endTime are time strings in any layout string2datetime accepts
def getLinesBetween(startTime, endTime, lines_dict):
    """Return the lines whose start time lies within ``[startTime, endTime]``.

    Args:
        startTime: lower bound (inclusive) as a time string parsed by
            ``string2datetime``.
        endTime: upper bound (inclusive), same format as ``startTime``.
        lines_dict: mapping of line start ``datetime`` to line record.

    Returns:
        A new dict holding the entries of ``lines_dict`` whose key falls in
        the window, in ``lines_dict``'s iteration order.
    """
    # Parse the bounds once instead of once per dictionary entry.
    window_start = string2datetime(startTime)
    window_end = string2datetime(endTime)

    return {
        lineStartTime: data
        for lineStartTime, data in lines_dict.items()
        if window_start <= lineStartTime <= window_end
    }

## Instances where humor wasn't matched with a line 

In [13]:
# Report the unmatched count against len(humor_times), i.e. the number of
# laugh intervals (not the number of dialog lines).
print(f"Laughtrack missing line:\n\t{nHumorMissingLine} / {len(humor_times)} total lines")

Laughtrack missing line:
	15 / 140 total lines


In [54]:


def missingHumorInfo(missingHumorIndex):
    """Describe the timing context of laughs that were not matched to a line.

    Reads the module-level ``humor_times``, ``humor_dict`` and ``lines_dict``.
    ``lines_dict`` must iterate in ascending start-time order, because the
    neighbour search stops at the first line starting after the laugh.

    Args:
        missingHumorIndex: iterable of indices into ``humor_times``.

    Returns:
        dict keyed by laugh start time. Each value holds ``Humor Start``,
        ``Humor End``, ``Humor Duration``, ``DialogEnd to HumorStart`` (gap
        from the end of the closest line starting before the laugh) and
        ``HumorEnd to NextDialogStart`` (gap to the start of the closest line
        starting after the laugh). A gap is ``None`` when no such neighbouring
        line exists.
    """
    missingHumor = {}

    for index in missingHumorIndex:
        humorStart = humor_times[index]
        humorEnd = humor_dict[humorStart]

        # Reset for every laugh so a neighbour found for a previous laugh is
        # never silently reused when this laugh has none.
        before_time = None
        after_time = None
        for time_key in lines_dict:
            if time_key < humorStart:
                before_time = time_key
            elif time_key > humorStart:
                after_time = time_key
                break

        gap_before = None
        if before_time is not None:
            gap_before = humorStart - lines_dict[before_time]['Dialogue End Time']

        gap_after = None
        if after_time is not None:
            gap_after = lines_dict[after_time]['Dialogue Start Time'] - humorEnd

        missingHumor[humorStart] = {
            "Humor Start": humorStart,
            "Humor End": humorEnd,
            "Humor Duration": humorEnd - humorStart,
            "DialogEnd to HumorStart": gap_before,
            "HumorEnd to NextDialogStart": gap_after,
        }

    return missingHumor

In [55]:
# Show the timing details of every laugh that was left without a line.
missingHumorInfo(idxsHumorMissingLine)

{datetime.datetime(1900, 1, 1, 0, 0, 28): {'Humor Start': datetime.datetime(1900, 1, 1, 0, 0, 28),
  'Humor End': datetime.datetime(1900, 1, 1, 0, 0, 29),
  'Humor Duration': datetime.timedelta(seconds=1),
  'DialogEnd to HumorStart': datetime.timedelta(seconds=2, microseconds=940000),
  'HumorEnd to NextDialogStart': datetime.timedelta(microseconds=150000)},
 datetime.datetime(1900, 1, 1, 0, 0, 57): {'Humor Start': datetime.datetime(1900, 1, 1, 0, 0, 57),
  'Humor End': datetime.datetime(1900, 1, 1, 0, 0, 59),
  'Humor Duration': datetime.timedelta(seconds=2),
  'DialogEnd to HumorStart': datetime.timedelta(seconds=5, microseconds=820000),
  'HumorEnd to NextDialogStart': datetime.timedelta(microseconds=120000)},
 datetime.datetime(1900, 1, 1, 0, 2, 25): {'Humor Start': datetime.datetime(1900, 1, 1, 0, 2, 25),
  'Humor End': datetime.datetime(1900, 1, 1, 0, 2, 30),
  'Humor Duration': datetime.timedelta(seconds=5),
  'DialogEnd to HumorStart': datetime.timedelta(seconds=4, microsecond

In [42]:
# Gap between the end of the preceding line and the laugh start.
# NOTE(review): humorStart and before_time are not assigned at module level in
# the visible cells — presumably left over from an earlier kernel state; verify before re-running.
humorStart - lines_dict[before_time]['Dialogue End Time']

datetime.timedelta(seconds=2, microseconds=940000)

In [43]:
# Gap from the laugh START (not its end) to the start of the following line.
# NOTE(review): after_time and humorStart are not assigned at module level in
# the visible cells — verify before re-running.
lines_dict[after_time]['Dialogue Start Time'] - humorStart

datetime.timedelta(seconds=1, microseconds=150000)

In [44]:
# Inspect the line record stored under before_time.
# NOTE(review): before_time is not assigned at module level in the visible cells — verify before re-running.
lines_dict[before_time]

{'Scene': 'A corridor at a sperm bank.',
 'Recipients': ['Sheldon', 'Receptionist'],
 'Speaker': 'Leonard',
 'Dialogue': 'Excuse me.',
 'Dialogue Start Time': datetime.datetime(1900, 1, 1, 0, 0, 23, 140000),
 'Dialogue End Time': datetime.datetime(1900, 1, 1, 0, 0, 25, 60000)}

In [45]:
# Inspect the line record stored under after_time.
# NOTE(review): after_time is not assigned at module level in the visible cells — verify before re-running.
lines_dict[after_time]

{'Scene': 'A corridor at a sperm bank.',
 'Recipients': ['Sheldon', 'Receptionist'],
 'Speaker': 'Leonard',
 'Dialogue': 'One across is Aegean. Eight down is Nabokov. 26 across is MCM.',
 'Dialogue Start Time': datetime.datetime(1900, 1, 1, 0, 0, 29, 150000),
 'Dialogue End Time': datetime.datetime(1900, 1, 1, 0, 0, 35, 60000),
 'isHumor': True,
 'humorDuration': datetime.timedelta(seconds=1)}

In [20]:
# Summarise every unmatched laugh: when it ended, how long it lasted and how
# far it sits from the neighbouring dialog lines.
# The result is stored as `missingHumorSummary` so that the missingHumorInfo()
# function defined in this notebook is not overwritten by a dict.
missingHumorSummary = {}
for idx in idxsHumorMissingLine:
    start_time = humor_times[idx]
    end_time = humor_dict[start_time]
    duration = end_time - start_time

    # Closest line starting before the laugh / closest line starting after it
    # (None when the laugh has no such neighbour).
    prev_line_start = max((t for t in dialog_times if t < start_time), default=None)
    next_line_start = min((t for t in dialog_times if t > start_time), default=None)

    gap_before = None
    if prev_line_start is not None:
        gap_before = start_time - lines_dict[prev_line_start]['Dialogue End Time']

    gap_after = None
    if next_line_start is not None:
        gap_after = next_line_start - end_time

    missingHumorSummary[start_time] = {
        "End Humor": end_time,
        "Humor Duration": duration,
        "∆ Dialog-End and Laugh-Start": gap_before,
        "∆ Laugh-End and Next Dialog-Start": gap_after,
    }

1900-01-01 00:00:28
1900-01-01 00:00:57
1900-01-01 00:02:25
1900-01-01 00:03:27
1900-01-01 00:07:02
1900-01-01 00:07:16
1900-01-01 00:11:05
1900-01-01 00:12:06
1900-01-01 00:15:21
1900-01-01 00:15:44
1900-01-01 00:17:14
1900-01-01 00:18:13
1900-01-01 00:18:32
1900-01-01 00:21:21
1900-01-01 00:22:19


In [15]:
def getLinesAroundHumor(humorStartTime, threshold, lines_dict):
    """Return the lines that start within ``threshold`` of a laugh's start.

    The window ``[humorStartTime - threshold, humorStartTime + threshold]`` is
    rendered as ``HH:MM:SS:ffffff`` strings and handed to ``getLinesBetween``.
    """
    stamp_layout = '%H:%M:%S:%f'
    window_open = humorStartTime - threshold
    window_close = humorStartTime + threshold
    return getLinesBetween(
        window_open.strftime(stamp_layout),
        window_close.strftime(stamp_layout),
        lines_dict,
    )
     

In [16]:
# Look at the first unmatched laugh: take its start time and collect the
# dialog lines that start within 6 seconds on either side of it.
idx1 = idxsHumorMissingLine[0]
humorTime1 = humor_times[idx1]

surrounding_lines = getLinesAroundHumor(humorTime1, timedelta(seconds=6), lines_dict)

print(humorTime1)
# `display` is not imported in the visible cells — presumably the notebook's built-in display helper.
display(surrounding_lines)

1900-01-01 00:00:28


{datetime.datetime(1900, 1, 1, 0, 0, 23, 140000): {'Scene': 'A corridor at a sperm bank.',
  'Recipients': ['Sheldon', 'Receptionist'],
  'Speaker': 'Leonard',
  'Dialogue': 'Excuse me.',
  'Dialogue Start Time': datetime.datetime(1900, 1, 1, 0, 0, 23, 140000),
  'Dialogue End Time': datetime.datetime(1900, 1, 1, 0, 0, 25, 60000)},
 datetime.datetime(1900, 1, 1, 0, 0, 29, 150000): {'Scene': 'A corridor at a sperm bank.',
  'Recipients': ['Sheldon', 'Receptionist'],
  'Speaker': 'Leonard',
  'Dialogue': 'One across is Aegean. Eight down is Nabokov. 26 across is MCM.',
  'Dialogue Start Time': datetime.datetime(1900, 1, 1, 0, 0, 29, 150000),
  'Dialogue End Time': datetime.datetime(1900, 1, 1, 0, 0, 35, 60000),
  'isHumor': True,
  'humorDuration': datetime.timedelta(seconds=1)}}

In [19]:
# lines_dict

In [18]:
# Indices (into humor_times) of the laughs that were not matched to any line.
idxsHumorMissingLine

[1, 7, 17, 24, 50, 52, 74, 78, 100, 104, 110, 117, 119, 135, 139]

In [None]:
for index, (key, value) in enumerate(humor_dict.items()):
    for i in idxsHumorMissingLine:
        print(index)