# YouTube experiment - Data preparation

### Import the libraries

In [3]:
# This Source Code Form is subject to the terms of the MIT
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/akhilpandey95/reproducibility/blob/master/LICENSE.

import urllib3
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import  matplotlib.pyplot as plt
from bs4 import BeautifulSoup as BS
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

%matplotlib inline

### Function for downloading the transcripts

In [4]:
# download the transcripts for a video
def download_transcripts(video_id, http=None):
    """Fetch the English timed-text transcript for one video.

    Parameters
    ----------
    video_id : str
        Video identifier; it is converted with ``str()`` and sent as the
        ``v`` query parameter.
    http : optional
        Object exposing ``request(method, url, fields=...)`` such as a
        ``urllib3.PoolManager``. Pass one in to reuse connections across
        many calls; when omitted a fresh ``PoolManager`` is created.

    Returns
    -------
    str
        The response body decoded as UTF-8, or ``''`` when the server
        answers with a status other than 200. The notebook later drops
        rows whose transcript is ``''``, so a failed request is treated
        like a missing transcript instead of storing an error page.

    Raises
    ------
    UnicodeDecodeError
        If a 200 response body is not valid UTF-8.
    """
    if http is None:
        http = urllib3.PoolManager()

    # Passing the query through ``fields`` lets urllib3 URL-encode the values
    # instead of concatenating the raw id into the URL.
    res = http.request(
        'GET',
        'http://video.google.com/timedtext',
        fields={'lang': 'en', 'v': str(video_id)},
    )

    # NOTE(review): presumably a video without an English track answers 200
    # with an empty body -- confirm against the endpoint.
    if res.status != 200:
        return ''

    return res.data.decode('utf-8')

### Data preparation

In [None]:
# read the dataset
rd = pd.read_csv('USvideos.csv')

# change booleans to binary
# eq(True) keeps the original "== True" test (anything that is not True,
# NaN included, becomes 0) but runs vectorised instead of one label lookup
# per row
flag_columns = ['comments_disabled', 'ratings_disabled', 'video_error_or_removed']
rd = rd.assign(**{col: rd[col].eq(True).astype('int64') for col in flag_columns})

# add the transcript
# iterate over the ids themselves so the loop does not depend on the index
# labels being 0..n-1; tqdm still reads the total from the Series length
transcripts = [download_transcripts(video_id) for video_id in tqdm(rd['video_id'])]
rd = rd.assign(video_transcript_en = transcripts)

rd.head()

  1%|          | 167/21965 [03:19<7:15:03,  1.20s/it]

### Drop the rows which don't have transcripts

In [20]:
# remove every row that has no transcript: either an empty string, or NaN
# (which is what pandas yields for an empty cell when the frame has been
# reloaded from a CSV file)
has_transcript = rd.video_transcript_en.notna() & (rd.video_transcript_en != '')
rd = rd[has_transcript]

# reset the index
rd = rd.reset_index(drop=True)

# print few rows
rd.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,video_transcript_en
0,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,0,0,0,"One year after the presidential election, John...","<?xml version=""1.0"" encoding=""utf-8"" ?><transc..."
1,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,0,0,0,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,"<?xml version=""1.0"" encoding=""utf-8"" ?><transc..."
2,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,0,0,0,I know it's been a while since we did this sho...,"<?xml version=""1.0"" encoding=""utf-8"" ?><transc..."
3,gHZ1Qz0KiKM,17.14.11,2 Weeks with iPhone X,iJustine,28,2017-11-13T19:07:23.000Z,"ijustine|""week with iPhone X""|""iphone x""|""appl...",119180,9763,511,1434,https://i.ytimg.com/vi/gHZ1Qz0KiKM/default.jpg,0,0,0,Using the iPhone for the past two weeks -- her...,"<?xml version=""1.0"" encoding=""utf-8"" ?><transc..."
4,nc99ccSXST0,17.14.11,5 Ice Cream Gadgets put to the Test,CrazyRussianHacker,28,2017-11-12T21:50:37.000Z,"5 Ice Cream Gadgets|""Ice Cream""|""Cream Sandwic...",817732,23663,778,3432,https://i.ytimg.com/vi/nc99ccSXST0/default.jpg,0,0,0,Ice Cream Pint Combination Lock - http://amzn....,"<?xml version=""1.0"" encoding=""utf-8"" ?><transc..."


### Write to CSV

In [23]:
# index=False keeps the row index out of the file; otherwise it comes back
# as an extra 'Unnamed: 0' column the next time the CSV is read
rd.to_csv('USvideos-with-transcripts.csv', encoding='utf-8', index=False)