In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import os
import re
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
import time
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")

In [2]:
def get_ngrams(text, n ):
    n_grams = ngrams(word_tokenize(text), n)
    return [ ' '.join(grams) for grams in n_grams]

In [3]:
def a(test_str):
    ret = ''
    skip1c = 0
    skip2c = 0
    for i in test_str:
        if i == '(':
            skip1c += 1
        elif i == ')' and skip1c > 0:
            skip1c -= 1
        elif skip1c == 0 and skip2c == 0:
            ret += i
    return ret

In [4]:
path = '../../Data/CorporaData'

In [5]:
files = []
for folder in os.listdir(path):
    if folder != '.DS_Store' and folder != '.ipynb_checkpoints':
        for file in os.listdir(os.path.join(path, folder)):
            if file != '.ipynb_checkpoints':
                files.append(('../../Data/CommercialData/{}/{}_{}.txt'.format(
                    file.split('.')[0], file.split('.')[0], file.split('.')[2]), os.path.join(path, os.path.join(folder, file))))

In [7]:
for file in files:
    # if os.path.exists(file[0]):
    #     # print(file[0].split('/')[-1])
    #     continue
    # else:
    start = time.time()
    df = pd.read_csv(file[1])
    df.columns = ['URL', 'Title', 'Text']
    # Clean html, musical notes ♪, time stamps (check consecutive minutes)
    df['Text'] = df['Text'].apply(lambda text: str(text))
    df['Text'] = df['Text'].apply(lambda text: text.replace('[[TIME.START]] ', '('))
    df['Text'] = df['Text'].apply(lambda text: text.replace(' [[TIME.END]]', ')'))
    df['Text'] = df['Text'].apply(lambda text: str(text).split('TOPICS: TOPIC FREQUENCY ')[0])
    df['Text'] = df['Text'].apply(lambda text: str(text)[str(text).find('[[TITLE.END]] ') + len('[[TITLE.END]] '):])
    df['Text'] = df['Text'].apply(lambda text: re.sub('<[^>]+>', '', text))
    df['Text'] = df['Text'].apply(lambda text: re.sub(r'[^\w\s]', '', text))
    df['Text'] = df['Text'].apply(lambda text: ' '.join([word for word in text.split() if word not in cachedStopWords]))
    df['Text'] = df['Text'].apply(lambda text: a(text)) 
    # Pull all text data into a single string for analysis
    combined_text = '. '.join([i for i in df['Text'][0:len(df)]])
    # Find commercials by ngram frequency
    tokens = get_ngrams(combined_text, 20)
    freq = Counter(tokens)
    # Go in and grab freq > 10
    commercials = [i for i,m in freq.items() if m > 10]
    print('Average number of segments in {} -- split by ♪: {}'.format(file[0].split('/')[4], round(len(combined_text.split('♪'))/len(df), 2)))
    with open('{}'.format(file[0]), 'w') as f:
        f.write('\n'.join(commercials))
    f.close()
    end = time.time()
    print('Time taken to find commercials in {}: {} minutes\n'.format(file[0].split('/')[4], round((end-start)/60, 2)))

Average number of segments in CNBC_2013.txt -- split by ♪: 0.0
Time taken to find commercials in CNBC_2013.txt: 1.55 minutes

Average number of segments in MSNBC_2013.txt -- split by ♪: 0.0
Time taken to find commercials in MSNBC_2013.txt: 2.31 minutes

Average number of segments in Bloomberg_2013.txt -- split by ♪: 0.0
Time taken to find commercials in Bloomberg_2013.txt: 0.16 minutes

Average number of segments in CNN_2013.txt -- split by ♪: 0.0
Time taken to find commercials in CNN_2013.txt: 2.82 minutes

Average number of segments in FBC_2013.txt -- split by ♪: 0.0
Time taken to find commercials in FBC_2013.txt: 1.71 minutes

Average number of segments in FOXNEWS_2013.txt -- split by ♪: 0.0
Time taken to find commercials in FOXNEWS_2013.txt: 3.57 minutes

Average number of segments in FBC_2014.txt -- split by ♪: 0.0
Time taken to find commercials in FBC_2014.txt: 1.59 minutes

Average number of segments in CNN_2014.txt -- split by ♪: 0.0
Time taken to find commercials in CNN_2014.t

**Condense Commercials (reduce overhead when searching)**

In [None]:
def condense_lines(lines):
    comm_list = []
    for index, line in enumerate(lines):
        if line == '':
            continue
        running_line = line.split()
        i, j = 0, 1
        while index+j < len(lines) and running_line[1+i:] == lines[index+j].split()[:19]:
            running_line.append(lines[index+j].split()[-1])
            lines[index+j] = ''
            i += 1
            j += 1
        comm_list.append(' '.join(running_line))
        count = 0
        for c in comm_list:
            if lines[index] in c:
                count += 1
        if count > 0:
            lines[index] = ''
    return comm_list

In [None]:
for file in files:
    with open(file[0], 'r') as f:
        lines = f.readlines()
    if len(lines) == 500:
        ret = condense_lines(lines)
        with open(file[0], 'w') as wf:
            wf.write('\n'.join(ret))

**Check If Time Stamps are Consecutive**

In [None]:
files = []
for folder in os.listdir(path):
    if folder != '.DS_Store' and folder != '.ipynb_checkpoints':
        for file in os.listdir(os.path.join(path, folder)):
            if file != '.ipynb_checkpoints':
                files.append(os.path.join(path, os.path.join(folder, file)))

In [None]:
networks = ['FOXNEWS', 'FBC', 'MSNBC', 'CNN', 'CNBC', 'Bloomberg']
files_by_network = {}
for n in networks:
    files_by_network[n] = []
    
for file in files:
    files_by_network[file.split('/')[4].split('.')[0]].append(file)

In [None]:
for key, value in files_by_network.items():
    for file in value:
        df = pd.read_csv(file)
        df.columns = ['URL', 'Title', 'Text']
        df = df.drop_duplicates(subset=['Title'])
        df.reset_index(inplace=True, drop=True)
        df['Text'] = df['Text'].apply(lambda text: str(text))
        df['Text'] = df['Text'].apply(lambda text: str(text).split('TOPICS: TOPIC FREQUENCY ')[0])
        df['Text'] = df['Text'].apply(lambda text: str(text)[str(text).find('[[TITLE.END]] ') + len('[[TITLE.END]] '):])
        df['Text'] = df['Text'].apply(lambda text: re.sub('<[^>]+>', '', text))
        
        list_of_times = {}
        for i in range(len(df.index)):
            list_of_times[df.loc[i, 'Title']] = []

        for i in range(len(df.index)):
            text = df.loc[i, 'Text'].split('[[TIME.START]]')
            for segment in text:
                if segment.split(' [[TIME.END]]')[0].split():
                    list_of_times[df.loc[i, 'Title']].append(segment.split(' [[TIME.END]]')[0].split())
                    
        for k,v in list_of_times.items():
            list_of_times[k] = [l[0] for l in v]

In [None]:
shows_of_interest = []
for key, value in list_of_times.items():
    for index, elem in enumerate(value):
        hour = elem.split(':')[0]
        minute = elem.split(':')[1]
        if index < len(value) - 1:
            next_hour = value[index + 1].split(':')[0]
            next_minute = value[index + 1].split(':')[1]
            if int(hour) != 12:
                if hour == next_hour:
                    if int(next_minute) != int(minute) + 1:
                        shows_of_interest.append((key, elem, value[index+1], 'minute skipped'))
                elif int(next_hour) == int(hour)+1:
                    if int(next_minute) != 0:
                        shows_of_interest.append((key, elem, value[index+1], 'minute skipped'))
                else:
                    shows_of_interest.append((key, elem, value[index+1], 'hour skipped'))
            else:
                if hour == next_hour:
                    if int(next_minute) != int(minute) + 1:
                        shows_of_interest.append((key, elem, value[index+1], 'minute skipped'))
                elif int(next_hour) == 1:
                    if int(next_minute) != 0:
                        shows_of_interest.append((key, elem, value[index+1], 'minute skipped'))
                else:
                    shows_of_interest.append((key, elem, value[index+1], 'hour skipped'))

In [None]:
shows_of_interest

In [None]:
ind = df.loc[df['Title'] == 'Squawk Box : CNBC : October 30; 2013 6:00am-9:01am EDT'].index[0]

In [None]:
faulty = df.loc[ind, 'Text'].split('[[TIME.START]]')

In [None]:
# len(faulty)

In [None]:
# faulty[170:]