# Scratch Pad for Group EDA Case Study

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Read the two raw inputs: the video records (CSV) and the category lookup (JSON).
# Both paths are relative to the directory the notebook is run from.
videos_csv_path = 'ddi_youtube_casestudy/data/USvideos.csv'
categories_json_path = 'ddi_youtube_casestudy/data/US_category_id.json'

df = pd.read_csv(videos_csv_path)
df_cat = pd.read_json(categories_json_path)

In [4]:
# Each entry of the JSON's `items` column is a dictionary describing one category.
# Build a lookup table pairing the category's own id with its `snippet.title`.
#
# The positional row index (0, 1, 2, ...) must NOT be used as the key: it is only
# the item's position in the file, so merging on it attaches the wrong titles.
# NOTE(review): assumes every item carries an `id` field holding the numeric
# category id as a string (as in the YouTube Data API videoCategories response)
# and that the .csv stores category_id as an integer - confirm against the data.
#
# The id column is deliberately named 'index' and the title column 'items' so the
# frame keeps the same column layout the later rename/merge cells expect.
df_two = pd.DataFrame({
    'index': [int(item['id']) for item in df_cat['items']],
    'items': [item['snippet']['title'] for item in df_cat['items']],
})

In [5]:
# Prep the .json data to be merged into the .csv dataframe.
# rename() returns a new frame, so re-running this cell is harmless.
df_two = df_two.rename(columns={'index': 'category_id'})

# Join explicitly on category_id rather than on "whatever columns happen to match".
#  - how='left' keeps every video even when its category has no entry in the lookup
#    (the title is then NaN) instead of silently dropping those rows.
#  - validate='many_to_one' raises if the lookup ever contains a duplicated
#    category_id, which would otherwise silently duplicate video rows.
df = pd.merge(df, df_two, on='category_id', how='left', validate='many_to_one')

In [6]:
# Tidy the merged frame in a single chain:
#   1. drop the columns that are not needed,
#   2. give the merged-in 'items' column the clearer name 'category',
#   3. select the remaining columns in a logical reading order.
column_order = [
    'title', 'channel_title', 'category_id', 'category', 'tags', 'description',
    'publish_time', 'trending_date', 'views', 'likes', 'dislikes', 'comment_count',
    'comments_disabled', 'ratings_disabled', 'video_error_or_removed',
]
df = (
    df
    .drop(['video_id', 'thumbnail_link'], axis=1)
    .rename(columns={'items': 'category'})
    [column_order]
)

In [7]:
# Bring the date and time fields into a uniform shape.
# Parse the raw publish timestamp once; both derived columns come from it.
publish_stamp = pd.to_datetime(df['publish_time'])

# Publish day only, inserted at column position 6. `.dt.date` produces plain
# Python date objects (object dtype), so the result is passed through
# pd.to_datetime again to end up with a proper datetime column.
df.insert(loc=6, column='publish_date', value=pd.to_datetime(publish_stamp.dt.date))

# Replace the full timestamp with just its time-of-day component.
df['publish_time'] = publish_stamp.dt.time

# trending_date is parsed as two-digit year, then day, then month ('%y.%d.%m').
df['trending_date'] = pd.to_datetime(df['trending_date'], format='%y.%d.%m')

In [None]:
# The description column contains null values, so we will annotate where no description was available
df['description'] = df['description'].fillna('No description available')

# Number of whole days between publication and trending.
# df.insert() modifies the frame in place and returns None, so the conversion must be
# applied to the value being inserted, not chained onto the insert() call.
# `.dt.days` pulls the day count out of the timedelta; casting a timedelta to int
# would not yield days.
days_to_trend = (df['trending_date'] - df['publish_date']).dt.days
df.insert(loc=9, column='days_before_viral', value=days_to_trend)

In [None]:
# Engagement metrics:
#   engagements     = likes + dislikes + comment_count
#   engagement_rate = engagements / views
total_engagements = df['likes'].add(df['dislikes']).add(df['comment_count'])
df.insert(loc=14, column='engagements', value=total_engagements)

rate_per_view = df['engagements'].div(df['views'])
df.insert(loc=15, column='engagement_rate', value=rate_per_view)