## Basic Channel Info EDA

In [None]:
import pandas as pd
# Load the per-channel summary table; the first CSV column is used as the index.
channel_data = pd.read_csv(
    '../top_channel_data/top_1_25_Channel_info.csv',
    index_col=0,
)

In [None]:
# Preview the first rows of the channel table.
channel_data.head()

In [None]:
# Discard the rows whose totalVideos value is missing.
channel_data = channel_data.dropna(subset=['totalVideos'])

In [None]:
# Write the filtered table back to disk.
# NOTE: this is the same path the table was read from, so the source file is overwritten.
channel_data.to_csv(
    path_or_buf='../top_channel_data/top_1_25_Channel_info.csv',
    index=True,
)

In [None]:
import sweetviz as sv
# Sweetviz report for the channel table, with `views` as the target feature.
# playlistId and channelName are forced to be treated as text features.
feature_config = sv.FeatureConfig(
    force_text=['playlistId', 'channelName'],
)
channel_info_report = sv.analyze(
    channel_data,
    target_feat='views',
    feat_cfg=feature_config,
    pairwise_analysis='on',
)
channel_info_report.show_notebook()

### Video Data EDA

In [None]:
import ast
import datetime

import isodate
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from dateutil import parser


In [None]:
# Read the per-video details (first CSV column is the index) and work on a copy,
# so the frame as loaded stays available in `vid_data`.
vid_data = pd.read_csv(
    '../top_channel_data/top_1_25_vid_details.csv',
    index_col=0,
)
viddf = vid_data.copy()

In [None]:
# Preview the first rows of the video table.
viddf.head()

In [None]:
# Visualise where values are missing: one heatmap cell per (row, column) entry,
# coloured by whether that entry is null.
missing_mask = viddf.isnull()
sns.heatmap(missing_mask, cmap='viridis', cbar=False, yticklabels=False)
plt.show()

In [None]:
# dislikeCount has many missing values, so the column is removed altogether.
viddf = viddf.drop(columns=['dislikeCount'])

In [None]:
# Parse the publish timestamps once, then derive every calendar feature from
# that single datetime Series.
published = pd.to_datetime(viddf['publishedAt'].map(parser.parse))

# Day of the week, as a name
viddf['publishDayName'] = published.dt.strftime("%A")

# Year, month number and time of day
viddf['publishingYear'] = published.dt.year
viddf['publishingMonth'] = published.dt.month
viddf['publishingTime'] = published.dt.time

# Month, as a name
viddf['publishingMonthName'] = published.dt.strftime("%B")

# The raw timestamp column is no longer needed once the parts are extracted.
viddf = viddf.drop(columns=['publishedAt'])

In [None]:
# Coerce the engagement counters to numeric dtypes; values that cannot be parsed
# become NaN (errors='coerce').
# The conversion runs column by column (the default axis). Applying it row by row
# (axis=1) calls pd.to_numeric once per row, which is far slower, and a NaN in one
# column would upcast the other columns' values in that row as well.
cols = ['viewCount', 'likeCount', 'commentCount']
viddf[cols] = viddf[cols].apply(pd.to_numeric, errors='coerce')

In [None]:
import numpy as np

In [None]:
# viewCount is the report's target feature and must not contain NaN values for the
# report to be generated successfully; rows without a likeCount are removed too.
viddf = viddf.dropna(axis=0, subset=['viewCount', 'likeCount'])

#### Enriching data

I want to enrich the data with derived features for further analysis, for example:
- convert the video duration from its original string format to seconds
- calculate the number of tags for each video
- calculate the number of comments and likes per 1,000 views
- calculate the title length in characters

In [None]:
# Convert the duration strings (parsed with isodate.parse_duration) to a plain
# numeric number of seconds.
# `.dt.total_seconds()` is used rather than `.astype('timedelta64[s]')`: from
# pandas 2.0 on, that cast only changes the timedelta resolution and no longer
# yields numbers, so `durationSecs` would not be numeric.
# Missing durations are skipped (na_action='ignore') and end up as NaN.
parsed_durations = viddf['duration'].map(isodate.parse_duration, na_action='ignore')
viddf['durationSecs'] = pd.to_timedelta(parsed_durations).dt.total_seconds()
viddf = viddf.drop(columns=['duration'])  # Remove duration since we have it in seconds now

In [None]:
# Add number of tags
def _count_tags(tag_text):
    """Return the number of tags encoded in `tag_text`.

    `tag_text` is a value from the `tagsstr` column: either one of the
    missing-value markers (0 or the string 'nan', which count as zero tags)
    or the text of a Python literal -- presumably a list of tag strings --
    whose length is returned.
    """
    if tag_text == 0 or tag_text == 'nan':
        return 0
    # ast.literal_eval only accepts Python literals, so text coming from the CSV
    # cannot execute arbitrary code the way eval() would.
    return len(ast.literal_eval(tag_text))


# Tags were not in a proper format, so keep their string form in `tagsstr`
# (a missing value read as NaN becomes the string 'nan').
viddf['tagsstr'] = viddf['tags'].apply(lambda x: 0 if x is None else str(x))
viddf['tagsCount'] = viddf['tagsstr'].apply(_count_tags)
viddf = viddf.drop(columns=['tags'])  # Remove tags since we have tagsstr now

In [None]:
# Comments and likes per 1000 views.
# Rows without a positive view count are masked to NaN first, so the ratios come
# out as NaN there instead of inf (division by zero), which would distort the
# summary statistics computed from these columns.
positive_views = viddf['viewCount'].where(viddf['viewCount'] > 0)
viddf['likeRatio'] = viddf['likeCount'] / positive_views * 1000
viddf['commentRatio'] = viddf['commentCount'] / positive_views * 1000

In [None]:
# Title character length.
# The vectorised .str.len() yields NaN for a missing title, whereas calling
# len() on each value raises a TypeError as soon as one title is NaN.
viddf['titleLength'] = viddf['title'].str.len()

In [None]:
# List the columns present after the transformations above.
viddf.columns

In [None]:
# Summary statistics for the video table.
viddf.describe()

In [None]:
import sweetviz as sv
# Sweetviz report for the video table, with `viewCount` as the target feature.
# likeRatio is skipped; identifier-like and free-text columns are forced to text.
feature_config = sv.FeatureConfig(
    skip='likeRatio',
    force_text=[
        'video_id',
        'channelTitle',
        'title',
        'description',
        'definition',
        'publishDayName',
        'publishingMonthName',
        'tagsstr',
    ],
)
channel_info_report = sv.analyze(
    viddf,
    target_feat='viewCount',
    feat_cfg=feature_config,
)
channel_info_report.show_notebook()

### Comment Data EDA