## Exploratory Analysis of Transcripts

<b> Author:</b> Miraya Gupta \
<b> Date: </b> 04/05

In [5]:
import pandas as pd
import csv
import json
import numpy as np
import os
#from sklearn.feature_extraction.text import CountVectorizer

# Load Data

In [6]:
def get_df_for_year(year):
    '''
    Take in the year and return a df with data from that year
    '''
    directory = 'data'
    filename = f'final_result_{year}.csv'
    path = os.path.join(directory, filename)
    df = pd.read_csv(path)
    return df

In [7]:
#calling function to get csv for each year
all_dfs = []
for yr in ['2020', '2021', '2022', '2023', '2024']:
    df = get_df_for_year(yr)
    all_dfs.append(df)

In [8]:
all_dfs[0].columns

Index(['Unnamed: 0', 'video_id', 'video_timestamp', 'video_duration',
       'video_locationcreated', 'suggested_words', 'video_diggcount',
       'video_sharecount', 'video_commentcount', 'video_playcount',
       'video_description', 'video_is_ad', 'video_stickers', 'author_username',
       'author_name', 'author_followercount', 'author_followingcount',
       'author_heartcount', 'author_videocount', 'author_diggcount',
       'author_verified', 'search_term', 'year', 'File Name', 'Content',
       'Subjectivity/Objectivity'],
      dtype='object')

In [9]:
#concatenate all
allDocs = pd.concat(all_dfs)

In [16]:
allDocs.dropna(subset=['Content'], inplace=True)

In [17]:
allDocs.shape

(11167, 30)

In [19]:
allDocs.set_index('video_id', inplace=True)

KeyError: "None of ['video_id'] are in the columns"

## Calculating Metrics

In [20]:
# finding average length of the transcripts
# video duration
# suggested words? 
allDocs['Content'].apply(len)

video_id
7104707494194269482      26
7167954365922151726     432
7277689387939417386     913
7176634327122365738     325
7212690611852578094     722
                       ... 
7239030289022979371    1493
7327480752940059950     543
7293650230640168238     852
7293103212528241966     991
7358119678851468586      21
Name: Content, Length: 11167, dtype: int64

In [21]:
def average_length(row):
    return sum(len(word) for word in row.split())

allDocs['avg_length'] = allDocs['Content'].apply(average_length)

In [22]:
allDocs.columns

Index(['Unnamed: 0', 'video_timestamp', 'video_duration',
       'video_locationcreated', 'suggested_words', 'video_diggcount',
       'video_sharecount', 'video_commentcount', 'video_playcount',
       'video_description', 'video_is_ad', 'video_stickers', 'author_username',
       'author_name', 'author_followercount', 'author_followingcount',
       'author_heartcount', 'author_videocount', 'author_diggcount',
       'author_verified', 'search_term', 'year', 'File Name', 'Content',
       'Subjectivity/Objectivity', 'transcript_File Name',
       'transcript_Content', 'transcript_Unnamed: 0',
       'transcript_Subjectivity/Objectivity', 'transcript_video_id',
       'avg_length'],
      dtype='object')

In [23]:
allDocs['avg_length'].mean()

901.5038954061073

In [29]:
allDocs['avg_length'].min()

1

In [28]:
allDocs['avg_length'].max()

11272