# Pre-procesing 

In [419]:
from random import choice
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pandas as pd
import json
import dateutil.parser
import os

In [420]:
#Datos de kaggle
RAW_DATA = Path("raw_dataset").absolute()

#Output de datos (datos limpios)

CLEAN_DATA = Path("clean_dataset").absolute()

if  CLEAN_DATA.exists():
    rmtree(CLEAN_DATA)
    os.mkdir(CLEAN_DATA)
else:
    os.mkdir(CLEAN_DATA)

## Loading the data

In [421]:

countries = ["CA","GB","US"] #Lista de paiess.
videos = {} #Lista con los dataframes de archivos csv
categories = {} # Lista con los diccionarios de los archivos json.
print("Abriendo archivos del dataset-->")
for i,country in enumerate(countries):
    print(f"\tPais: {country}  {i+1}/{len(countries)} ",end = "\r")
    file_csv = RAW_DATA.joinpath(f"{country}videos.csv")
    videos[country] = pd.read_csv(file_csv,encoding="ISO-8859-1",lineterminator="\n")
    videos[country].columns = [x.strip() for x in videos[country].columns]


    with open(RAW_DATA.joinpath(f"{country}_category_id.json")) as file:
        items = json.load(file)["items"]
        temp = {int(x["id"]): x for x in items} #La llave de cada categoria es su id.
        val = defaultdict(lambda: "NULL")
        val["snippet"] = defaultdict(lambda:"NULL")
        val["snippet"]["title"] = 'No categoria'
        categories[country] = defaultdict(lambda : val,temp)
print("\nDatos Cargados!")

Abriendo archivos del dataset-->
	Pais: US  3/3 
Datos Cargados!


In [422]:
# veamos el numero de nulos sin contar la descripcion de cada DF
print("Numero de filas con algun nulo (sin contar descripcion) en el DF de ")
for key,df_clean in videos.items():
    df = df_clean.copy().drop(["description"],axis = 1)
    df1 = df[df.isna().any(axis=1)]
    print(f"\tPais --> {key}: {len(df1)}")


Numero de filas con algun nulo (sin contar descripcion) en el DF de 
	Pais --> CA: 0
	Pais --> GB: 0
	Pais --> US: 0


In [423]:
def show_random_head(): #Funcion para mostrar el head de alguno de los df's
    pick = choice(countries)
    print("Mostrando head de {}".format(pick))
    return videos[pick].head

In [424]:
show_random_head()() #Mostramos uno de los df

Mostrando head de GB


Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,Jw1Y-zhQURU,17.14.11,John Lewis Christmas Ad 2017 - #MozTheMonster,John Lewis,26,2017-11-10T07:38:29.000Z,"christmas|""john lewis christmas""|""john lewis""|...",7224515,55681,10247,9479,https://i.ytimg.com/vi/Jw1Y-zhQURU/default.jpg,False,False,False,Click here to continue the story and make your...
1,3s1rvMFUweQ,17.14.11,Taylor Swift: â¦Ready for It? (Live) - SNL,Saturday Night Live,24,2017-11-12T06:24:44.000Z,"SNL|""Saturday Night Live""|""SNL Season 43""|""Epi...",1053632,25561,2294,2757,https://i.ytimg.com/vi/3s1rvMFUweQ/default.jpg,False,False,False,Musical guest Taylor Swift performs â¦Ready f...
2,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. BeyoncÃ©,EminemVEVO,10,2017-11-10T17:00:03.000Z,"Eminem|""Walk""|""On""|""Water""|""Aftermath/Shady/In...",17158579,787420,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. BeyoncÃ© ...
3,PUTEiSjKwJU,17.14.11,Goals from Salford City vs Class of 92 and Fri...,Salford City Football Club,17,2017-11-13T02:30:38.000Z,"Salford City FC|""Salford City""|""Salford""|""Clas...",27833,193,12,37,https://i.ytimg.com/vi/PUTEiSjKwJU/default.jpg,False,False,False,Salford drew 4-4 against the Class of 92 and F...
4,rHwDegptbI4,17.14.11,Dashcam captures truck's near miss with child ...,Cute Girl Videos,25,2017-11-13T01:45:13.000Z,[none],9815,30,2,30,https://i.ytimg.com/vi/rHwDegptbI4/default.jpg,False,False,False,Dashcam captures truck's near miss with child ...


In [425]:
def get_category_name(series,key):
    return series.apply(lambda x: categories[key][x]["snippet"]["title"])

def get_publish_timestamp(series):
    return series.apply(lambda x: dateutil.parser.isoparse(x).timestamp())

def get_trending_timestamp(series):
    return series.apply(lambda x: datetime.strptime(x,"%y.%d.%m").timestamp())

In [426]:
for key,df in videos.items():
    df["category"] = get_category_name(df["category_id"],key) # Agregamos el nombre de la categoria
    # Parseamos las fechas para llegar y comparar como timestamps
    df["publish_timestamp"] = get_publish_timestamp(df["publish_time"]) 
    df["trending_timestamp"] = get_trending_timestamp(df["trending_date"]) 
    df.drop(["video_id","category_id","thumbnail_link"],axis =1,inplace=True)

In [427]:
print("Escribiendo nuevo dataset-->")
df_all = pd.DataFrame(columns = ["country"]+ list(videos["CA"].columns))
for i,(key,df) in enumerate(videos.items()):
    df["country"] = key
    print(f"Escribiendo {key}videos.csv   {i+1}/{len(videos.items())}",end = "\r")
    # get 10% sample of df
    # remove rows with no tags
    df = df[df["tags"] != "[none]"]
    df = df.sample(frac=0.1)
    df_all = pd.concat([df_all,df],ignore_index = True)
# Add numeric id to each row called video_id
df_all["video_id"] = df_all.index

print("\n Dataset listo!")

Escribiendo nuevo dataset-->
Escribiendo USvideos.csv   3/3
 Dataset listo!


In [428]:
df_all.head()

Unnamed: 0,country,trending_date,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,category,publish_timestamp,trending_timestamp,video_id
0,CA,18.28.04,Testing Stove In A Can,CrazyRussianHacker,2018-04-27T23:25:19.000Z,"Stove In A Can|""Stove""|""survival stove""|""survi...",220819,8398,278,869,False,False,False,Ultimate Breakfast Machine - https://youtu.be/...,Science & Technology,1524872000.0,1524884000.0,0
1,CA,18.26.04,INTERROGATING ZUCKERBERG â A Bad Lip Reading,Bad Lip Reading,2018-04-24T16:04:41.000Z,"mark zuckerberg|""facebook""|""hearing""|""congress...",2310814,138338,1437,6331,False,False,False,Tension mounted during Mark Zuckerberg's congr...,Comedy,1524586000.0,1524712000.0,1
2,CA,18.12.06,My Mom's Morning Routine,IISuperwomanII,2018-06-11T22:05:25.000Z,"iisuperwomanii|""superwoman""|""team super""|""come...",386445,55482,247,5721,False,False,False,"Like any good aunty, my mom Paramjeet hasn't c...",Comedy,1528755000.0,1528776000.0,2
3,CA,18.28.03,Comment Awards v97,Comment Awards,2018-03-26T21:10:22.000Z,"comment awards|""comment""|""awards""|""dank meme c...",566854,15452,298,3935,False,False,False,"Welcome to Comment Awards, your #1 source for ...",Comedy,1522099000.0,1522206000.0,3
4,CA,18.17.04,10 ANIMAUX QUI ONT ÃTÃ CRÃÃS PAR LE DIABLE...,Lama FachÃ©,2018-04-15T15:00:06.000Z,"10 animaux crÃ©es par le diable|""10 animaux cr...",275545,10531,2553,2287,False,False,False,Le monde animal est trÃ¨s diversifiÃ© et mÃªme...,Entertainment,1523804000.0,1523934000.0,4


# New Tables for tags

In [429]:
tags_series = df_all["tags"].apply(lambda x: x.split("|"))
tags_series = tags_series.apply(lambda x: [y.strip() for y in x])
tags = [item for sublist in tags_series for item in sublist] # Lista con todas las tags.
new_tags = []
for tag in tags:
    # remove " from tags
    tag = tag.replace('"',"")
    new_tags.append(tag)

tags = new_tags


# Count frequency of each tag
freq_dict = {}
for tag in tags:
    if tag in freq_dict:
        freq_dict[tag] += 1
    else:
        freq_dict[tag] = 1

# Remove tags that appear less than 100 times

thresh = 100

freq_dict = {key:value for key,value in freq_dict.items() if value >= thresh}
tag_set = freq_dict.keys()


# Remove from every elemnt in tag series values that are not in final_set
def remove_tags(x):
    res = []
    for elem in x:
        elem  = elem.replace('"',"")
        if elem in tag_set:
            res.append(elem)

    return res

tags_series = tags_series.apply(remove_tags)



# print info

print("\n------------BEFORE------------\n")
print(f"Number of unique tags : {len(set(new_tags))}")
print(f"Total number of tags: {len(new_tags)}")
print("\n------------AFTER------------\n")

print(f"Number of unique tags : {len(tag_set)}")
print("Total number of tags.",sum(freq_dict.values()))
print("Rows without tags {}".format(len(tags_series[tags_series.apply(lambda x: len(x) == 0)])))



------------BEFORE------------

Number of unique tags : 71146
Total number of tags: 232395

------------AFTER------------

Number of unique tags : 94
Total number of tags. 17373
Rows without tags 5391


In [430]:
# create new dataframe with 2 columns tag_id and tag_name
tag_df = pd.DataFrame(columns = ["tag_id","tag_name"],index=  range(len(tag_set)))
# create enw dataframe with 2 columns video_id and tag_id
video_tag_df = pd.DataFrame(columns = ["video_id","tag_id"],index = range(sum(freq_dict.values())))

In [431]:
id_dict = {}
for i,elem in enumerate(tag_set):
    tag_df.loc[i] = [i,elem]
    id_dict[elem] = i
counter = 0
for elem in enumerate(tags_series):
    L = elem[1]
    for tag in L:
        video_tag_df.loc[counter] = [elem[0],id_dict[tag]]
        counter += 1


In [432]:
video_tag_df

Unnamed: 0,video_id,tag_id
0,1,0
1,1,1
2,2,2
3,3,0
4,6,3
...,...,...
17368,11477,68
17369,11477,50
17370,11477,0
17371,11477,46


In [433]:
tag_df

Unnamed: 0,tag_id,tag_name
0,0,funny
1,1,parody
2,2,comedy
3,3,2018
4,4,family
...,...,...
89,89,dance
90,90,education
91,91,christmas
92,92,Pop


In [434]:
df_all = df_all.drop(["tags"],axis = 1)


Unnamed: 0,country,trending_date,title,channel_title,publish_time,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,category,publish_timestamp,trending_timestamp,video_id
0,CA,18.28.04,Testing Stove In A Can,CrazyRussianHacker,2018-04-27T23:25:19.000Z,220819,8398,278,869,False,False,False,Ultimate Breakfast Machine - https://youtu.be/...,Science & Technology,1.524872e+09,1.524884e+09,0
1,CA,18.26.04,INTERROGATING ZUCKERBERG â A Bad Lip Reading,Bad Lip Reading,2018-04-24T16:04:41.000Z,2310814,138338,1437,6331,False,False,False,Tension mounted during Mark Zuckerberg's congr...,Comedy,1.524586e+09,1.524712e+09,1
2,CA,18.12.06,My Mom's Morning Routine,IISuperwomanII,2018-06-11T22:05:25.000Z,386445,55482,247,5721,False,False,False,"Like any good aunty, my mom Paramjeet hasn't c...",Comedy,1.528755e+09,1.528776e+09,2
3,CA,18.28.03,Comment Awards v97,Comment Awards,2018-03-26T21:10:22.000Z,566854,15452,298,3935,False,False,False,"Welcome to Comment Awards, your #1 source for ...",Comedy,1.522099e+09,1.522206e+09,3
4,CA,18.17.04,10 ANIMAUX QUI ONT ÃTÃ CRÃÃS PAR LE DIABLE...,Lama FachÃ©,2018-04-15T15:00:06.000Z,275545,10531,2553,2287,False,False,False,Le monde animal est trÃ¨s diversifiÃ© et mÃªme...,Entertainment,1.523804e+09,1.523934e+09,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11477,US,18.06.02,"Melissa McCarthy's New Dogs Are Pretty, But No...",TheEllenShow,2018-02-05T14:01:30.000Z,590868,12092,145,293,False,False,False,The always hilarious Melissa McCarthy talked w...,Entertainment,1.517839e+09,1.517886e+09,11477
11478,US,18.10.06,Joey Graceffa's Enchanted Gaming Room Makeover...,Mr. Kate,2018-05-24T22:00:05.000Z,1427286,73320,661,6029,False,False,False,Big thanks to DazzlePro for sponsoring and get...,Howto & Style,1.527199e+09,1.528603e+09,11478
11479,US,18.16.03,The Logistics of Living in Antarctica,Wendover Productions,2018-03-06T15:30:00.000Z,803414,25205,446,2091,False,False,False,Get 20% off Brilliant premium by being one of ...,Education,1.520350e+09,1.521169e+09,11479
11480,US,17.03.12,Rolled Ice Cream DIY How to make rolled ice cr...,How To Cook That,2017-12-01T08:30:32.000Z,200192,7659,179,963,False,False,False,"Rolled Ice Cream, the secrets to making ice cr...",Howto & Style,1.512117e+09,1.512270e+09,11480


## Saving the new dataset

In [435]:
# Save dataframes to csv
df_all.to_csv(CLEAN_DATA /"videos.csv",index = False)
tag_df.to_csv(CLEAN_DATA /"tags.csv",index = False)
video_tag_df.to_csv(CLEAN_DATA /"video_tags.csv",index = False)