<h1><center> Data Scraping </center></h1>

<img src="https://miro.medium.com/max/1190/1*2dK2BfTULH9Sjk30A61Siw.jpeg" alt="Header image" style="width: 600px;"/>

In [47]:
import json
import re
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
import pickle
import pandas as pd

import google.oauth2.credentials
 
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from google_auth_oauthlib.flow import InstalledAppFlow

In [48]:
# --- YouTube Data API configuration ---

# Name and version of the Google API service the client is built for.
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

# File containing the OAuth 2.0 information (client_id and client_secret)
# of the youtube app.
CLIENT_SECRETS_FILE = "client_secret.json"

# OAuth 2.0 scope: read/write access to the authenticated user's account;
# requests must use an SSL connection.
SCOPES = ['https://www.googleapis.com/auth/youtube.force-ssl']

In [49]:
#Setup

# Read the API key from the YOUTUBE_API_KEY environment variable so the secret
# never has to be typed into (or committed with) the notebook. When the
# variable is unset this falls back to the empty string that was hard-coded
# here before.
DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", "")

# API-key client shared by the helper functions defined in this notebook.
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=DEVELOPER_KEY)

In [50]:
# FUNCTION 1:
# Performs youtube search query. Stores urls in a file
# Returns videoIds of Urls as a list

def get_videolinks_youtube_search(textToSearch,filename):
    """Scrape the YouTube results page for a query and save the video links.

    Args:
        textToSearch: Free-text search query; it is URL-quoted before use.
        filename: Output path without extension. ".csv" is appended and the
            file is overwritten with one https://youtu.be/<id> URL per line.

    Returns:
        list[str]: The video ids, in the order they appear in the page.

    Note:
        Only anchors carrying the CSS class 'yt-uix-tile-link' are collected,
        so an empty list means the fetched HTML contained no such anchors.
    """
    query = urllib.parse.quote(textToSearch)
    url = "https://www.youtube.com/results?search_query=" + query

    # Context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    videoIds = []
    for vid in soup.find_all(attrs={'class': 'yt-uix-tile-link'}):
        # '/watch?v=<id>&list=...' -> '<id>'
        video_id = vid['href'].replace('/watch?v=', '').partition('&')[0]
        videoIds.append(video_id)

    # 'with' guarantees the CSV is flushed and closed on every exit path.
    with open(filename + ".csv", "w") as f:
        for video_id in videoIds:
            f.write('https://youtu.be/' + video_id + '\n')

    print(videoIds)
    return videoIds

In [51]:
# FUNCTION 2:
# Reads list of video urls from a file
# Returns videoIds of the urls

def get_videoIds(filename):
    """Read video URLs from a text/CSV file and return their video ids.

    Each non-empty line is expected to hold one URL such as
    https://youtu.be/<id>, optionally followed by a trailing comma. The id is
    whatever follows the last "/" of the line.

    Args:
        filename: Path of the file to read (one URL per line).

    Returns:
        list[str]: One id per non-empty line, in file order. Blank lines are
        skipped, and a line containing no "/" is returned unchanged (treated
        as a bare id) instead of raising IndexError.
    """
    videoIds = []
    with open(filename, "r") as f:
        for line in f:
            # Drop surrounding whitespace, stray CSV commas, and any
            # whitespace that sat between the URL and the comma.
            youtube_url = line.strip().strip(',').strip()
            if not youtube_url:
                continue
            videoIds.append(youtube_url.rsplit("/", 1)[-1])
    return videoIds

In [52]:
# FUNCTION 3:
# Returns general description of each videos

def get_video_info(videoIds):
    """Fetch the 'snippet' part for the given video id(s).

    Args:
        videoIds: Value forwarded as the `id` argument of videos().list.

    Returns:
        The response produced by executing the videos().list request.
    """
    request = youtube.videos().list(part='snippet', id=videoIds)
    return request.execute()


# FUNCTION 4:
# Returns statistics of each videos

def get_video_stat(videoId):
    """Fetch the statistics and snippet parts for a video.

    Args:
        videoId: Value forwarded as the `id` argument of videos().list.

    Returns:
        The response produced by executing the videos().list request.
    """
    request = youtube.videos().list(id=videoId, part='statistics, snippet')
    return request.execute()

In [53]:
# FUNCTION 5:
# Returns comment json of each videos

def get_comments(video_id):
    """Fetch the first page of comment threads for a video.

    Args:
        video_id: Id of the video whose comment threads are requested.

    Returns:
        The response of commentThreads().list, requested with the 'snippet'
        part, plain-text formatting and maxResults=100.
    """
    request = youtube.commentThreads().list(
        videoId=video_id,
        part="snippet",
        maxResults=100,
        textFormat="plainText",
    )
    return request.execute()

# FUNCTION 6:
# Returns reply json of each comment

def get_replies(comment_id):
    """Fetch the replies posted under a top-level comment.

    Args:
        comment_id: Id of the parent (top-level) comment.

    Returns:
        The response of comments().list for that parent, requested with the
        'snippet' part and plain-text formatting.
    """
    replies_json = youtube.comments().list(
        part = "snippet",
        parentId = comment_id,
        textFormat = "plainText",
        # Ask for the same page size as get_comments; without maxResults the
        # API falls back to its smaller default page, dropping replies.
        maxResults = 100,
    ).execute()
    return replies_json

In [54]:
# FUNCTION 7:
# Reads comment json (100 items)
# Returns information of comments n replies

def _comment_row(snippet):
    """Flatten one comment/reply snippet into [channelId, author, text, likes]."""
    # Tolerate a missing 'authorChannelId': direct indexing raised KeyError on
    # such items and aborted the whole scrape. The row is kept with None.
    channel_id = snippet.get('authorChannelId', {}).get('value')
    return [
        channel_id,
        snippet['authorDisplayName'],
        snippet['textDisplay'],
        snippet['likeCount'],
    ]


def get_comment_reply_info(commentsJson):
    """Turn one page of comment threads into a table of comments and replies.

    Args:
        commentsJson: A commentThreads().list response (dict with an "items"
            list). A missing "items" key is treated as an empty page.

    Returns:
        pandas.DataFrame: One row per top-level comment, each immediately
        followed by the rows of its replies (fetched through get_replies when
        totalReplyCount > 0). Columns are positional (0..3): author channel
        id, author display name, text, like count. The frame has no columns
        when the page holds no comments.
    """
    rowList = []
    for thread in commentsJson.get("items", []):
        top_comment = thread['snippet']['topLevelComment']
        rowList.append(_comment_row(top_comment['snippet']))

        if thread['snippet']['totalReplyCount'] > 0:  # Getting replies
            repliesJson = get_replies(top_comment['id'])
            for reply in repliesJson['items']:
                rowList.append(_comment_row(reply['snippet']))

    return pd.DataFrame(rowList)

In [55]:
# FUNCTION 8:
# Reads video Id
# Returns all information of comments n replies in dataframe format

def get_comment_reply_info_from_all_pages(video_id, max_pages=3):
    """Collect comments and replies of a video across several result pages.

    Args:
        video_id: Id of the video to scrape.
        max_pages: Maximum number of comment-thread pages to fetch, including
            the first one. The default of 3 matches the previously hard-coded
            limit (first page plus two follow-up pages).

    Returns:
        pandas.DataFrame: Columns 'CommentorChannelId', 'Commentor',
        'Comments', 'CommentLikes', with a fresh 0..n-1 index. An empty frame
        with these columns is returned when no comments were found.
    """
    header = ['CommentorChannelId', 'Commentor', 'Comments', 'CommentLikes']

    # Get first page comment reply info
    page = get_comments(video_id)
    frames = [get_comment_reply_info(page)]

    # Keep getting comments from the following pages
    pages_fetched = 1
    while 'nextPageToken' in page and pages_fetched < max_pages:
        page = youtube.commentThreads().list(
            part="snippet",
            maxResults=100,
            # Same text format as the first page, so 'Comments' is plain text
            # throughout instead of switching format on later pages.
            textFormat="plainText",
            pageToken=page['nextPageToken'],
            videoId=video_id).execute()
        frames.append(get_comment_reply_info(page))
        pages_fetched += 1

    # Pages without comments yield column-less frames; drop them so the header
    # assignment below cannot fail with a length mismatch.
    frames = [frame for frame in frames if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=header)

    # DataFrame.append no longer exists in pandas 2.x; concat once instead.
    df_all3 = pd.concat(frames, ignore_index=True, sort=False)
    df_all3.columns = header
    return df_all3

In [56]:
# FUNCTION 9:
# Reads video Id
# Returns all comments n replies in a single text format

def get_comments_replies_as_string(video_id):
    """Join every scraped comment and reply of a video into one string.

    Args:
        video_id: Id of the video to scrape.

    Returns:
        str: The 'Comments' column of get_comment_reply_info_from_all_pages,
        in row order, separated by single spaces.
    """
    comment_texts = get_comment_reply_info_from_all_pages(video_id)['Comments']
    return ' '.join(comment_texts.tolist())

In [57]:
# FUNCTION 10:
# Read videoIds
# display video info, stats, comment as one string in dataframe

def get_video_info_stats_comments(videoIds):
    """Build one summary row per video: metadata, statistics and comments.

    Args:
        videoIds: Sequence of video id strings.

    Returns:
        pandas.DataFrame: Columns 'Uploader', 'VideoId', 'VideoTitle',
        'Description', 'View Count', 'Likes', 'Dislikes', 'Comment Count',
        'Comments'. A statistic the API does not return is stored as None.
        An empty input yields an empty frame with these columns and issues no
        API request.
    """
    header = ['Uploader', 'VideoId', 'VideoTitle',
              'Description','View Count', 'Likes',
              'Dislikes', 'Comment Count','Comments']

    # Request ids in batches rather than all at once.
    # NOTE(review): 50 is assumed to be the per-request id limit of
    # videos.list — confirm against the API reference.
    batch_size = 50

    rowList = []
    for start in range(0, len(videoIds), batch_size):
        batch = videoIds[start:start + batch_size]

        #Getting basic video information
        videoInfo = get_video_info(','.join(batch))

        for item in videoInfo['items']:
            uploader = item['snippet']['channelTitle']
            videoId = item['id']
            title = item['snippet']['title']
            description = item['snippet']['description']

            #Getting video statistics
            statistics = get_video_stat(videoId)['items'][0]['statistics']

            # .get() because counters can be absent from the response (for
            # example dislikeCount, or likes/comments hidden by the uploader);
            # direct indexing raised KeyError and aborted the whole run.
            viewCount = statistics.get('viewCount')
            likeCount = statistics.get('likeCount')
            dislikeCount = statistics.get('dislikeCount')
            commentCount = statistics.get('commentCount')

            #Getting all comments and replies from a video
            commentText = get_comments_replies_as_string(videoId)

            rowList.append([uploader, videoId, title, description, viewCount,
                            likeCount, dislikeCount, commentCount, commentText])

    return pd.DataFrame(rowList, columns = header)

**Getting Videos**

Let us read URLs from our video collection and get videoIds.

In [58]:
# Load the collected video URLs and turn them into video ids
link_file = "ppl_bullied.csv"
videoIds = get_videoIds(filename=link_file)

# Preview the first five ids, space-separated on one line
print(" ".join(videoIds[:5]))

2mg3sFuiwRw Wlub9KOJBt4 D2iCOMoOkyI _Uxw2X0hNGg -9BfaW69LSk


**Getting Information of Videos**

For this project I would like to collect, for each video: the uploader (or video subject), the title and description of the video, and its view count, likes, dislikes, and total number of comments.

I would also like to retrieve ~~all~~ three pages of comments per video as a single text, so that later on I can analyze what kind of comments each uploader is getting. (I chose three pages of comments because there is a limit on API data retrieval from YouTube.)

In [65]:
# Collect metadata, statistics and the joined comment text for every video id
vidInfo = get_video_info_stats_comments(videoIds=videoIds)

# Show the first five rows
vidInfo.head(n=5)

Unnamed: 0,Uploader,VideoId,VideoTitle,Description,View Count,Likes,Dislikes,Comment Count,Comments
0,Emily Ann Shaheen,2mg3sFuiwRw,HOW TO BE AN ARAB GIRL!,Heyyy guys! Thank you so much for watching! Th...,80408,1971,147,640,Im Arab from iraq but my golden name is in eng...
1,Emily Ann Shaheen,Wlub9KOJBt4,ARAB GIRL STEREOTYPES!,"Thanks for watching babes! xx, Emily Ann Shahe...",27925,608,46,240,Being an Arab is great and proud also true we ...
2,nowthisisliving,D2iCOMoOkyI,LESBIAN INTERVIEWS EX BOYFRIEND,"please be kind in the comments, this boy is th...",2098024,51987,1644,2718,My ex broke up with me because she wanted to b...
3,nowthisisliving,_Uxw2X0hNGg,why we broke up,I know this is a tough video for everyone. We ...,3081184,74616,1766,9406,still high key want them to get back together ...
4,Madison Beer,-9BfaW69LSk,Madison Beer- Catch Me Cover,HEY YOUTUBE!!!!!!!!! LONG TIME NO VIDEO! so so...,920561,14418,2408,2747,Be strong That's my fav song of demi lovato I'...


In [63]:
## Persist the video information and comments so later notebooks can reload them
vidInfo.to_pickle(path="videoInfo.pkl")

**Getting Comment and Replies:**

I would also like to analyze commenters and their comments in the next iteration of this project. So, I would like to get comment and reply data such as the commenter, comment likes, comment text, etc.

For each video, I would like to save the comment/reply data in a separate file.

In [64]:
#Getting comment/reply video
filepath = "Comments"

# to_pickle does not create missing directories, so make sure the output
# folder exists before the (slow) download loop starts.
os.makedirs(filepath, exist_ok=True)

for item in videoIds:
    commentInfo = get_comment_reply_info_from_all_pages(item)

    ##pickle each of comment info from a video into its own individual file
    filename = str(item) + "_commentInfo.pkl"
    commentInfo.to_pickle(os.path.join(filepath, filename))

In [45]:
# commentInfo still holds the frame of the last video processed by the loop;
# show its first five rows as a sanity check
commentInfo.head(n=5)

Unnamed: 0,CommentorChannelId,Commentor,Comments,CommentLikes
0,UCXkTts-5OgoyUgY4sOuWZOg,Phoenix Masami,"Liar I knew it hey no he said and I quote ""Go ...",0
1,UCjpS5M1bpmS6MnCkswYpZ7Q,who knows?,Too bad this rat tailed product of trailer tra...,0
2,UCsflmw50EadncHYtbpGY1xA,Samuelrey 1213,The worst thing you can do to someone:say some...,1
3,UCseM3rY7y7Iv5ieaqZfTHjQ,RazorbladeRomance,"""He abused me first"" \n\nOh shut up...we all k...",0
4,UCK2JiKKdNrIkD7qituzCqsw,ツ IᑕᕼᗷIᑎᕮIᑎᕼᗩᑎS,He earned another bodyslam for lying,0
