# Pre-procesing 

In [988]:
from random import choice
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pandas as pd
import json
import dateutil.parser
import os

In [989]:
#Datos de kaggle
RAW_DATA = Path("raw_dataset").absolute()

#Output de datos (datos limpios)

CLEAN_DATA = Path("clean_dataset").absolute()

if  CLEAN_DATA.exists():
    rmtree(CLEAN_DATA)
    os.mkdir(CLEAN_DATA)
else:
    os.mkdir(CLEAN_DATA)

FRAC = 0.1

## Loading the data

In [990]:

countries = ["CA","GB","US"] #Lista de paiess.
videos = {} #Lista con los dataframes de archivos csv
categories = {} # Lista con los diccionarios de los archivos json.
print("Abriendo archivos del dataset-->")
for i,country in enumerate(countries):
    print(f"\tPais: {country}  {i+1}/{len(countries)} ",end = "\r")
    file_csv = RAW_DATA.joinpath(f"{country}videos.csv")
    videos[country] = pd.read_csv(file_csv,encoding="ISO-8859-1",lineterminator="\n")
    videos[country].columns = [x.strip() for x in videos[country].columns]


    with open(RAW_DATA.joinpath(f"{country}_category_id.json")) as file:
        items = json.load(file)["items"]
        temp = {int(x["id"]): x for x in items} #La llave de cada categoria es su id.
        val = defaultdict(lambda: "NULL")
        val["snippet"] = defaultdict(lambda:"NULL")
        val["snippet"]["title"] = 'No categoria'
        categories[country] = defaultdict(lambda : val,temp)
print("\nDatos Cargados!")

Abriendo archivos del dataset-->
	Pais: US  3/3 
Datos Cargados!


In [991]:
# veamos el numero de nulos sin contar la descripcion de cada DF
print("Numero de filas con algun nulo (sin contar descripcion) en el DF de ")
for key,df_clean in videos.items():
    df = df_clean.copy().drop(["description"],axis = 1)
    df1 = df[df.isna().any(axis=1)]
    print(f"\tPais --> {key}: {len(df1)}")


Numero de filas con algun nulo (sin contar descripcion) en el DF de 
	Pais --> CA: 0
	Pais --> GB: 0
	Pais --> US: 0


In [992]:
f = '%Y-%m-%dT%H:%M:%S'

def get_category_name(series,key):
    return series.apply(lambda x: categories[key][x]["snippet"]["title"])

def get_publish_timestamp(series):
    return series.apply(lambda x: dateutil.parser.isoparse(x).strftime(f))

def get_trending_timestamp(series):
    return series.apply(lambda x: datetime.strptime(x,"%y.%d.%m").strftime(f))

In [993]:
for key,df in videos.items():
    df["category"] = get_category_name(df["category_id"],key) # Agregamos el nombre de la categoria
    # Parseamos las fechas para llegar y comparar como timestamps
    df["publish_timestamp"] = get_publish_timestamp(df["publish_time"]) 
    df["trending_timestamp"] = get_trending_timestamp(df["trending_date"]) 
    df.drop(["video_id","category_id","thumbnail_link"],axis =1,inplace=True)

In [994]:
print("Escribiendo nuevo dataset-->")
df_all = pd.DataFrame(columns = ["country"]+ list(videos["CA"].columns))
for i,(key,df) in enumerate(videos.items()):
    df["country"] = key
    print(f"Escribiendo {key}videos.csv   {i+1}/{len(videos.items())}",end = "\r")
    # get 10% sample of df
    # remove rows with no tags
    df = df[df["tags"] != "[none]"]
    df = df.sample(frac=0.1)
    df_all = pd.concat([df_all,df],ignore_index = True)
# Add numeric id to each row called video_id
df_all["video_id"] = df_all.index

print("\n Dataset listo!")

Escribiendo nuevo dataset-->
Escribiendo USvideos.csv   3/3
 Dataset listo!


In [995]:
df_all.head()

Unnamed: 0,country,trending_date,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,category,publish_timestamp,trending_timestamp,video_id
0,CA,17.26.11,KUCKIAN COSMETICS LAUNCH!!!!,John Kuckian,2017-11-25T21:00:00.000Z,"john|""kuckian""|""british""|""youtuber""|""drama""|""j...",76935,5798,1267,3527,False,False,False,http://kuckian.co\n\nâ¶ï¸ Snapchat: JohnKuck...,News & Politics,2017-11-25T21:00:00,2017-11-26T00:00:00,0
1,CA,18.17.04,Michael Cohenâs Mystery Client Is Sean Hanni...,MSNBC,2018-04-17T01:15:26.000Z,"Hardball|""Hardball with Chris Matthews""|""Chris...",117294,1116,84,651,False,False,False,Hannity never disclosed his relationship with ...,News & Politics,2018-04-17T01:15:26,2018-04-17T00:00:00,1
2,CA,18.14.02,what working tech support is REALLY like,Alex Meyers,2018-02-13T12:00:06.000Z,"alex meyers|""jaiden""|""animations""|""jaidenanima...",109050,10963,99,708,False,False,False,what working tech support is really like anima...,Entertainment,2018-02-13T12:00:06,2018-02-14T00:00:00,2
3,CA,18.11.05,GUESS THAT FOOD CHALLENGE! #2 | People Vs. Food,REACT,2018-05-10T19:00:02.000Z,"Guess That Food|""Cauliflower""|""Cabbage""|""GUESS...",149259,5164,95,834,False,False,False,Staff try to Guess that food! Click to get Ama...,Entertainment,2018-05-10T19:00:02,2018-05-11T00:00:00,3
4,CA,18.08.01,BuzzFeed Men Read Thirsty Comments,BuzzFeedVideo,2018-01-06T16:00:22.000Z,"Buzzfeed|""Hot""|""Sexy""|""Hot Guys""|""BuzzFeedVide...",931841,31652,479,2911,False,False,False,Our audience isn't properly hydrated.\n\nCheck...,People & Blogs,2018-01-06T16:00:22,2018-01-08T00:00:00,4


# New Tables for tags

In [996]:
tags_series = df_all["tags"].apply(lambda x: x.split("|"))
tags_series = tags_series.apply(lambda x: [y.strip() for y in x])
tags = [item for sublist in tags_series for item in sublist] # Lista con todas las tags.
new_tags = []
for tag in tags:
    # remove " from tags
    tag = tag.replace('"',"")
    # Replace spaces with _
    tag = tag.replace(" ","_")
    new_tags.append(tag)

tags = new_tags


# Count frequency of each tag
freq_dict = {}
for tag in tags:
    if tag in freq_dict:
        freq_dict[tag] += 1
    else:
        freq_dict[tag] = 1

# Remove tags that appear less than 100 times

thresh = 100

freq_dict = {key:value for key,value in freq_dict.items() if value >= thresh}
tag_set = freq_dict.keys()


# Remove from every elemnt in tag series values that are not in final_set
def remove_tags(x):
    res = []
    for elem in x:
        elem  = elem.replace('"',"")
        # Replace spaces with _
        elem = elem.replace(" ","_")
        if elem in tag_set:
            res.append(elem)

    return res

tags_series = tags_series.apply(remove_tags)



# print info

print("\n------------BEFORE------------\n")
print(f"Number of unique tags : {len(set(new_tags))}")
print(f"Total number of tags: {len(new_tags)}")
print("\n------------AFTER------------\n")

print(f"Number of unique tags : {len(tag_set)}")
print("Total number of tags.",sum(freq_dict.values()))
print("Rows without tags {}".format(len(tags_series[tags_series.apply(lambda x: len(x) == 0)])))



------------BEFORE------------

Number of unique tags : 239173
Total number of tags: 648040

------------AFTER------------

Number of unique tags : 207
Total number of tags. 42565
Rows without tags 18052


In [997]:
# create new dataframe with 2 columns tag_id and tag_name
tag_df = pd.DataFrame(columns = ["tag_id","tag_name"],index=  range(len(tag_set)))
# create enw dataframe with 2 columns video_id and tag_id
video_tag_df = pd.DataFrame(columns = ["video_id","tag_id"],index = range(sum(freq_dict.values())))

In [998]:
id_dict = {}
for i,elem in enumerate(tag_set):
    tag_df.loc[i] = [i,elem]
    id_dict[elem] = i
counter = 0
for elem in enumerate(tags_series):
    L = elem[1]
    for tag in L:
        video_tag_df.loc[counter] = [elem[0],id_dict[tag]]
        counter += 1


In [999]:
df_all = df_all.drop(["tags","description"],axis = 1)

In [1000]:
video_tag_df

Unnamed: 0,video_id,tag_name
0,0,youtuber
1,0,drama
2,0,fun
3,0,beauty
4,1,Donald_Trump
...,...,...
42560,33819,technology
42561,33820,Funny
42562,33822,BuzzFeed
42563,33824,review


In [1001]:
# show uniqute values of category
print("Unique values of category")



# result: http://www.oschina.net/search?scope=bbs&q=C%E8%AF%AD%E8%A8%80
quote('http://www.oschina.net/search?scope=bbs&q=C语言',safe='/:?=&')
# replace " " with "_"
df_all["category"] = df_all["category"].apply(lambda x: x.replace(" ","_"))
# remove control caracters from channel_tittle
df_all["channel_title"] = df_all["channel_title"].apply(lambda x: x.replace(" ","_"))
df_all["channel_title"] = df_all["channel_title"].apply(lambda x: quote(x,safe='/:?=&'))

print(df_all["category"].unique())

Unique values of category
['News_&_Politics' 'Entertainment' 'People_&_Blogs' 'Comedy'
 'Travel_&_Events' 'Music' 'Sports' 'Education' 'Film_&_Animation'
 'Howto_&_Style' 'Gaming' 'Science_&_Technology' 'Pets_&_Animals'
 'Autos_&_Vehicles' 'Shows' 'No_categoria' 'Movies'
 'Nonprofits_&_Activism']


## Saving the new dataset

In [1002]:
# Save dataframes to csv
df_all.to_csv(CLEAN_DATA /"videos.csv",index = False)
tag_df.to_csv(CLEAN_DATA /"tags.csv",index = False)
video_tag_df.to_csv(CLEAN_DATA /"video_tags.csv",index = False)