# Pre-processing

In [31]:
from random import choice
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from shutil import rmtree
import pandas as pd
import json
import dateutil.parser
import os
from urllib.parse import quote

In [32]:
# Raw Kaggle dataset (input directory).
RAW_DATA = Path("raw_dataset").absolute()

# Cleaned dataset (output directory).
CLEAN_DATA = Path("clean_dataset").absolute()

# Always start from an empty output directory: wipe the previous one, if any,
# and create it again.
if CLEAN_DATA.exists():
    rmtree(CLEAN_DATA)
os.mkdir(CLEAN_DATA)

# Fraction of each country's rows kept by the sampling step further down.
FRAC = 0.05

## Loading the data

In [33]:

# Country codes of the regional files to load.
countries = ["CA","DE","FR","GB","JP","KR","IN","MX","RU","US"]
videos = {}      # country code -> DataFrame read from "<country>videos.csv"
categories = {}  # country code -> {category id (int): category item from the JSON file}

# Placeholder record returned for category ids that are missing from a country's
# JSON file: its snippet title is 'No categoria' and every other field is "NULL".
# It is built once and bound to the default factory through a default argument,
# so the lookup does not depend on a global name that another cell could rebind.
missing_category = defaultdict(lambda: "NULL")
missing_category["snippet"] = defaultdict(lambda: "NULL")
missing_category["snippet"]["title"] = 'No categoria'

print("Abriendo archivos del dataset-->")
for i, country in enumerate(countries):
    print(f"\tPais: {country}  {i+1}/{len(countries)} ", end="\r")
    file_csv = RAW_DATA.joinpath(f"{country}videos.csv")
    videos[country] = pd.read_csv(file_csv, encoding="ISO-8859-1", lineterminator="\n")
    # Header names can carry stray whitespace (presumably a trailing "\r", since
    # only "\n" is treated as the line terminator), so strip them.
    videos[country].columns = [x.strip() for x in videos[country].columns]

    # JSON is UTF-8 by specification; state it explicitly instead of relying on
    # the platform's locale-dependent default encoding.
    with open(RAW_DATA.joinpath(f"{country}_category_id.json"), encoding="utf-8") as file:
        items = json.load(file)["items"]
    # Each category is keyed by its numeric id.
    temp = {int(x["id"]): x for x in items}
    categories[country] = defaultdict(lambda fallback=missing_category: fallback, temp)
print("\nDatos Cargados!")

Abriendo archivos del dataset-->
	Pais: US  10/10 
Datos Cargados!


In [34]:
# Report, per country, how many rows contain at least one null value when the
# "description" column is left out of the check.
print("Numero de filas con algun nulo (sin contar descripcion) en el DF de ")
for country_code, frame in videos.items():
    without_description = frame.drop(columns=["description"])
    has_null = without_description.isna().any(axis=1)
    print(f"\tPais --> {country_code}: {has_null.sum()}")


Numero de filas con algun nulo (sin contar descripcion) en el DF de 
	Pais --> CA: 0
	Pais --> DE: 0
	Pais --> FR: 0
	Pais --> GB: 0
	Pais --> JP: 0
	Pais --> KR: 0
	Pais --> IN: 0
	Pais --> MX: 0
	Pais --> RU: 0
	Pais --> US: 0


In [35]:
# Output layout shared by every timestamp column produced below.
f = '%Y-%m-%dT%H:%M:%S'

def get_category_name(series,key):
    """Map a Series of category ids to category titles.

    Each id is looked up in ``categories[key]`` and replaced by the
    ``["snippet"]["title"]`` entry of the record found there.

    Args:
        series: Series of category ids.
        key: Key selecting the mapping inside the module-level ``categories``.

    Returns:
        A Series with one title per input element.
    """
    def title_of(category_id):
        record = categories[key][category_id]
        return record["snippet"]["title"]

    return series.apply(title_of)

def get_publish_timestamp(series):
    """Re-format ISO-8601 strings into the layout given by ``f``.

    Args:
        series: Series of strings accepted by ``dateutil.parser.isoparse``.

    Returns:
        A Series of strings formatted with ``f`` (which has no offset field,
        so any UTC offset in the input is not written out).
    """
    def reformat(raw):
        parsed = dateutil.parser.isoparse(raw)
        return parsed.strftime(f)

    return series.apply(reformat)

def get_trending_timestamp(series):
    """Re-format ``yy.dd.mm`` date strings into the layout given by ``f``.

    Args:
        series: Series of strings in two-digit-year.day.month order.

    Returns:
        A Series of strings formatted with ``f``; the time part is midnight
        because the input carries no time of day.
    """
    def reformat(raw):
        parsed = datetime.strptime(raw, "%y.%d.%m")
        return parsed.strftime(f)

    return series.apply(reformat)

In [36]:
for country_code, frame in videos.items():
    # Add the category title, looked up from the numeric category id.
    frame["category"] = get_category_name(frame["category_id"], country_code)
    # Bring both dates to one common string layout so they can be compared as timestamps.
    frame["publish_timestamp"] = get_publish_timestamp(frame["publish_time"])
    frame["trending_timestamp"] = get_trending_timestamp(frame["trending_date"])
    # Drop the columns that are not carried into the clean dataset; done in
    # place so the frames stored in `videos` themselves are updated.
    frame.drop(columns=["video_id", "category_id", "thumbnail_link"], inplace=True)

In [37]:
print("Escribiendo nuevo dataset-->")
# Fixed seed so that Restart & Run All always draws the same sample.
SAMPLE_SEED = 42
samples = []
for i, (key, df) in enumerate(videos.items()):
    print(f"Escribiendo {key}videos.csv   {i+1}/{len(videos)}", end="\r")
    # Keep only rows that actually carry tags, then draw a FRAC-sized random sample.
    tagged = df[df["tags"] != "[none]"]
    sample = tagged.sample(frac=FRAC, random_state=SAMPLE_SEED)
    # assign() returns a new frame, so the per-country frames in `videos` are
    # not modified and this cell can be re-run safely.
    samples.append(sample.assign(country=key))

# Concatenate once instead of growing a frame inside the loop: this is linear
# instead of quadratic and keeps the original column dtypes.
df_all = pd.concat(samples, ignore_index=True)
# Put "country" first, followed by the remaining columns in their original order.
df_all.insert(0, "country", df_all.pop("country"))
# Numeric id of each row; equals the fresh 0..n-1 index created by the concat.
df_all["video_id"] = df_all.index

print("\n Dataset listo!")

Escribiendo nuevo dataset-->
Escribiendo USvideos.csv   10/10
 Dataset listo!


In [38]:
# Preview the first rows of the combined, sampled dataset.
df_all.head()

Unnamed: 0,country,trending_date,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed,description,category,publish_timestamp,trending_timestamp,video_id
0,CA,18.14.02,Black Panther - Movie Review,Jeremy Jahns,2018-02-14T04:06:58.000Z,"Black Panther|""marvel""|""t'challa""|""hero""|""aven...",114780,11245,236,1700,False,False,False,King T'Challa must adjust with the responsibil...,Entertainment,2018-02-14T04:06:58,2018-02-14T00:00:00,0
1,CA,17.12.12,"Deivamagal Episode 1411, 11/12/17",VikatanTV,2017-12-11T15:00:04.000Z,"deivamagal|""deiva magal""|""deiva magal serial""|...",796126,4342,984,334,False,False,False,Deivamagal Episode 1411\nSubscribe: https://go...,Shows,2017-12-11T15:00:04,2017-12-12T00:00:00,1
2,CA,18.04.02,Jenna Fischer Discusses Her 'Office' Chemistry...,theoffcamerashow,2018-02-03T08:43:12.000Z,"Off Camera|""Sam Jones""|""Jenna Fischer""|""interv...",92394,2174,26,201,False,False,False,Actress and author Jenna Fischer talks to Sam ...,Entertainment,2018-02-03T08:43:12,2018-02-04T00:00:00,2
3,CA,18.12.03,YouTuber Back for Revenge on Dr. Phil,Triggered Tro,2018-03-07T21:00:00.000Z,"dr|""phil""|""dr phil""|""youtuber""|""tv""|""danielle""...",1207639,53862,1241,6850,False,False,False,Ex-YouTuber Returns to Dr. Phil\n\nPart 1:\n\n...,Entertainment,2018-03-07T21:00:00,2018-03-12T00:00:00,3
4,CA,18.21.03,Jenny Slate Wants Her Cat Back from Kumail Nan...,The Late Late Show with James Corden,2018-03-20T08:35:00.000Z,"The Late Late Show|""Late Late Show""|""James Cor...",73229,1024,16,32,False,False,False,James welcomes back old friends Jenny Slate an...,Entertainment,2018-03-20T08:35:00,2018-03-21T00:00:00,4


# New Tables for tags

In [39]:
def _normalize_tag(raw_tag):
    """Return the canonical form of one tag.

    Double quotes are removed, spaces become underscores and the result is
    percent-encoded, leaving the characters ``/:?=&`` untouched.
    """
    cleaned = raw_tag.replace('"', "").replace(" ", "_")
    return quote(cleaned, safe='/:?=&')


# One list of whitespace-trimmed raw tags per video ("|" is the separator).
tags_series = df_all["tags"].apply(lambda x: [piece.strip() for piece in x.split("|")])

# Flat list with every normalized tag occurrence of every video.
new_tags = [_normalize_tag(piece) for pieces in tags_series for piece in pieces]
tags = new_tags

# Count how often each tag occurs.
freq_dict = {}
for tag in tags:
    freq_dict[tag] = freq_dict.get(tag, 0) + 1

# Keep only the tags that occur at least `thresh` times.
thresh = 100
freq_dict = {tag: count for tag, count in freq_dict.items() if count >= thresh}
tag_set = freq_dict.keys()


def remove_tags(x):
    """Normalize the raw tags in ``x`` and keep those present in ``tag_set``.

    Args:
        x: Iterable of raw (not yet normalized) tag strings.

    Returns:
        List of normalized tags, in input order, restricted to ``tag_set``.
    """
    return [tag for tag in map(_normalize_tag, x) if tag in tag_set]


tags_series = tags_series.apply(remove_tags)

# Summary of the effect of the frequency filter.
rows_without_tags = sum(1 for kept in tags_series if not kept)

print("\n------------BEFORE------------\n")
print(f"Number of unique tags : {len(set(new_tags))}")
print(f"Total number of tags: {len(new_tags)}")
print("\n------------AFTER------------\n")

print(f"Number of unique tags : {len(tag_set)}")
print("Total number of tags.",sum(freq_dict.values()))
print("Rows without tags {}".format(rows_without_tags))



------------BEFORE------------

Number of unique tags : 150647
Total number of tags: 323359

------------AFTER------------

Number of unique tags : 68
Total number of tags. 11678
Rows without tags 11353


In [40]:

# Pre-allocate the video/tag link table: two empty columns, "video_id" and
# "tag_name", with one row per retained tag occurrence.
n_tag_rows = sum(freq_dict.values())
video_tag_df = pd.DataFrame(index=range(n_tag_rows), columns=["video_id", "tag_name"])

In [41]:
# Build the (video_id, tag_name) link table in one vectorized pass instead of
# assigning rows one at a time with .loc, which is very slow on frames this size.
# explode() emits one row per list element and keeps the index of the row it
# came from; that index is the same value df_all stores as "video_id".
# Videos whose tag list is empty explode to a single NaN, which is dropped.
video_tag_df = (
    tags_series
    .explode()
    .dropna()
    .rename("tag_name")
    .rename_axis("video_id")
    .reset_index()
)


In [42]:
# The raw "tags" and "description" columns are not part of the exported videos table.
df_all = df_all.drop(columns=["tags", "description"])

In [43]:
# Display the video-to-tag link table.
video_tag_df

Unnamed: 0,video_id,tag_name
0,0,movie
1,0,review
2,2,interview
3,2,love
4,3,tv
...,...,...
11673,16903,Trailer
11674,16904,Funny
11675,16904,Comedy
11676,16908,Pop


In [44]:
# Normalize category names and channel titles, then list the category values.
print("Unique values of category")

# Replace spaces with underscores in the category names.
df_all["category"] = df_all["category"].apply(lambda x: x.replace(" ", "_"))

# Channel titles: replace spaces with underscores, then percent-encode every
# character outside the safe set "/:?=&".
# NOTE(review): "%" is not in the safe set, so running this cell a second time
# on the same df_all would encode the already-encoded titles again.
df_all["channel_title"] = df_all["channel_title"].apply(
    lambda x: quote(x.replace(" ", "_"), safe='/:?=&')
)

print(df_all["category"].unique())

Unique values of category
['Entertainment' 'Shows' 'People_&_Blogs' 'Music' 'Comedy'
 'Science_&_Technology' 'Film_&_Animation' 'Howto_&_Style'
 'News_&_Politics' 'Education' 'Autos_&_Vehicles' 'Sports' 'Gaming'
 'Travel_&_Events' 'Pets_&_Animals' 'No_categoria' 'Movies']


## Saving the new dataset

In [45]:
# Save dataframes to csv
df_all.to_csv(CLEAN_DATA /"videos.csv",index = False)
video_tag_df.to_csv(CLEAN_DATA /"video_tags.csv",index = False)