In [3]:
import pandas as pd
from nltk.tokenize import word_tokenize
import numpy as np

In [4]:
# Load the two input files from the notebook's working directory:
# the category-id JSON and the US trending-videos CSV.
categories = pd.read_json("US_category_id.json")
# Uncomment to preview the first rows: print(categories.head())
usdata = pd.read_csv("USvideos.csv")
# Uncomment to preview the first rows: print(usdata.head())

In [5]:
# Column dtypes and non-null counts of the frame as loaded, before any cleaning.
usdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 16 columns):
video_id                  40949 non-null object
trending_date             40949 non-null object
title                     40949 non-null object
channel_title             40949 non-null object
category_id               40949 non-null int64
publish_time              40949 non-null object
tags                      40949 non-null object
views                     40949 non-null int64
likes                     40949 non-null int64
dislikes                  40949 non-null int64
comment_count             40949 non-null int64
thumbnail_link            40949 non-null object
comments_disabled         40949 non-null bool
ratings_disabled          40949 non-null bool
video_error_or_removed    40949 non-null bool
description               40379 non-null object
dtypes: bool(3), int64(5), object(8)
memory usage: 4.2+ MB


In [6]:
# (rows, columns) of the frame, followed by the number of distinct values per column.
print(usdata.shape)
print(usdata.nunique())

(40949, 16)
video_id                   6282
trending_date               205
title                      6455
channel_title              2207
category_id                  16
publish_time               6269
tags                       6055
views                     40478
likes                     29850
dislikes                   8516
comment_count             13773
thumbnail_link             6352
comments_disabled             2
ratings_disabled              2
video_error_or_removed        2
description                6901
dtype: int64


In [7]:
# Number of missing values in each column.
usdata.isna().sum()

video_id                    0
trending_date               0
title                       0
channel_title               0
category_id                 0
publish_time                0
tags                        0
views                       0
likes                       0
dislikes                    0
comment_count               0
thumbnail_link              0
comments_disabled           0
ratings_disabled            0
video_error_or_removed      0
description               570
dtype: int64

### Data cleaning

In [8]:
#Convert predictors to their correct type

#Time predictors
def convertTrendingDate(df, var, dateformat='%y.%d.%m'):
    """Parse a text date column into pandas datetimes.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the column to parse.
    dateformat : strptime-style format of the stored text. The default
        reads two-digit year, then day, then month, separated by dots.

    Returns
    -------
    A datetime Series; ``df`` itself is not modified.
    """
    raw_dates = df[var]
    parsed = pd.to_datetime(raw_dates, format=dateformat)
    return parsed

def convertPublishTime(df,var):
    """Split a timestamp column into a date Series and a time-of-day Series.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the column holding timestamp text (or datetimes).

    Returns
    -------
    (pub_date, pub_time) : two object-dtype Series holding ``datetime.date``
        and ``datetime.time`` values respectively. ``df`` is not modified.
    """
    # Parse once and derive both parts from the same result; parsing the
    # whole column a second time is redundant work.
    stamps = pd.to_datetime(df[var])
    return stamps.dt.date, stamps.dt.time

def convertCategoryField(df, var, predtype="category"):
    """Cast a single column to another dtype.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the column to cast.
    predtype : target dtype passed to ``Series.astype``; defaults to
        ``"category"``.

    Returns
    -------
    The converted Series; ``df`` itself is not modified.
    """
    column = df[var]
    return column.astype(predtype)

def removePattern(df, var, regpattern, replacement, regex=True):
    """Substitute ``regpattern`` with ``replacement`` in one column of ``df``.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the column to process.
    regpattern : pattern (or value) to look for.
    replacement : text substituted for each match.
    regex : forwarded to ``Series.replace``; True treats ``regpattern`` as a
        regular expression.

    Returns
    -------
    A new Series with the substitutions applied; ``df`` is not modified.
    """
    column = df[var]
    return column.replace(to_replace=regpattern, value=replacement, regex=regex)

def removePatternSeries(series, regpattern, replacement, regex=True):
    """Substitute ``regpattern`` with ``replacement`` in a Series.

    Same operation as the DataFrame-based variant, but takes the Series
    directly instead of a frame plus a column name.

    Parameters
    ----------
    series : Series to process.
    regpattern : pattern (or value) to look for.
    replacement : text substituted for each match.
    regex : forwarded to ``Series.replace``; True treats ``regpattern`` as a
        regular expression.

    Returns
    -------
    A new Series with the substitutions applied.
    """
    substituted = series.replace(to_replace=regpattern, value=replacement, regex=regex)
    return substituted


def getTags(df, var, regpattern, replacement, sep="|"):
    """Turn a delimited tag string column into lists of tags.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the tag column.
    regpattern : regex of characters to substitute before splitting.
    replacement : text substituted for each ``regpattern`` match.
    sep : delimiter the cleaned string is split on; defaults to ``"|"``.

    Returns
    -------
    A Series whose values are lists of tag strings.
    """
    cleaned = removePattern(df, var, regpattern, replacement)
    tag_lists = cleaned.str.split(sep)
    return tag_lists

def wordTokenize(df, var, method=word_tokenize):
    """Lowercase a text column and tokenize every entry.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the text column.
    method : callable applied to each lowercased value; defaults to the
        ``word_tokenize`` imported from ``nltk.tokenize``.

    Returns
    -------
    A Series holding whatever ``method`` returns for each row.
    """
    # NOTE(review): assumes the column holds only strings -- missing values
    # would be handed to ``method`` unchanged; confirm NAs are filled first.
    lowered = df[var].str.lower()
    tokens = lowered.apply(method)
    return tokens

# def isCapitalized(df, var, regpattern):
    
#     if usdata["title"].str.match('[A-Z]{2,}'):
#         df[isCapital] = 1
#     else:
#         df[isCapital] = 0
#     return df

def dropCol(df, var,inplace=True,axis=1):
    """Drop the given labels from ``df`` and return the resulting frame.

    Parameters
    ----------
    df : DataFrame to drop from.
    var : label or list of labels to remove.
    inplace : when True (default) ``df`` itself is modified and returned;
        when False ``df`` is left untouched and a new frame is returned.
    axis : axis the labels live on; 1 (default) drops columns.

    Returns
    -------
    The frame without the dropped labels.
    """
    # DataFrame.drop returns None when it works in place and a new frame
    # otherwise. Returning ``df`` unconditionally would hand back the
    # unmodified input whenever inplace=False, silently dropping nothing.
    dropped = df.drop(var, inplace=inplace, axis=axis)
    return df if dropped is None else dropped

def selectUniqueRows(df, var):
    """Return ``df`` without rows that repeat on the given column(s).

    Parameters
    ----------
    df : DataFrame to de-duplicate.
    var : column name or list of column names that together identify a
        duplicate; passed as ``subset`` to ``DataFrame.drop_duplicates``.

    Returns
    -------
    A new frame with duplicate rows removed; the original index labels of
    the surviving rows are preserved.
    """
    return df.drop_duplicates(subset=var)



In [9]:
# NOTE(review): this cell overwrites `usdata` and several of its columns, so it
# is meant to run once, directly after the load cell. To redo it, re-run from
# the load cell rather than re-executing this cell on its own.

#select unique rows
# A row counts as a duplicate only when video id, title and description all repeat.
rows = ["video_id","title","description"]
usdata = selectUniqueRows(usdata, rows)

#Fix time
# trending_date is stored as yy.dd.mm text; publish_time is split into a
# date column and a time-of-day column.
usdata["trending_date"]= convertTrendingDate(usdata,"trending_date")
usdata["publish_date"], usdata["publish_time"] = convertPublishTime(usdata, "publish_time")
# The split yields plain date objects; cast them back to a datetime64 column.
usdata["publish_date"] = convertCategoryField(usdata,"publish_date",predtype="datetime64[ns]")

#convert category_id to categorical data
usdata["category_id"] = convertCategoryField(usdata, "category_id")

#process tags
# Strip the double quotes around tags, then split on the default "|" separator.
usdata["tags"] = getTags(usdata, "tags", '"', '')

#process title
# Raw string: "\w" and "\s" are regex escapes, not Python string escapes, and
# are rejected with a warning in non-raw literals on recent Python versions.
# NOTE(review): the "+" sits inside the character class, so literal plus signs
# are kept in titles -- confirm that is intended rather than `[^\w\s]+`.
usdata["title"]=removePatternSeries(usdata["title"], r'[^\w+\s]', '')
usdata["token_title"] = wordTokenize(usdata, "title")


In [10]:
def formatDescription(df, var):
    """Clean a free-text description column.

    The patterns are hardcoded for this dataset. Two passes are applied:

    1. Delete everything from an ``@`` symbol to the end of that line, and
       delete URLs starting with ``http``.
    2. Replace each run of newline characters with a single space.

    Parameters
    ----------
    df : DataFrame containing the column.
    var : name of the description column.

    Returns
    -------
    A new Series with the cleaned text; ``df`` is not modified.
    """
    # Pass 1 must leave newlines in place. If they are deleted here, pass 2
    # has nothing left to match and the last word of one line gets glued to
    # the first word of the next instead of being separated by a space.
    # "." does not match a newline, so the "@" rule stops at the line end.
    ser = df[var].replace(r'(@.*)|(http\S+)', "", regex=True)

    # Pass 2: collapse every run of newlines into one blank space.
    # NOTE(review): this matches real newline characters only. If the CSV
    # stores line breaks as the two-character text backslash + "n", neither
    # pass touches them -- confirm against the data.
    ser = ser.replace(r'\n+', " ", regex=True)

    return ser


In [11]:
#process description 

#fill in NAs
# Missing descriptions become empty strings so the steps below receive text for every row.
usdata["description"] = usdata["description"].fillna('')
# Apply the regex cleanup defined in formatDescription.
usdata["description"] = formatDescription(usdata, "description")
# Store the tokenized form of the cleaned text in its own column.
usdata["token_description"] = wordTokenize(usdata, "description")

In [12]:
# Drop the video id, the thumbnail link, and the raw title/description text;
# the token_title and token_description columns created earlier are kept.
cols = ["video_id","title", "thumbnail_link","description"]
usdata = dropCol(usdata, cols)

In [13]:
# Re-check dtypes and non-null counts after the cleaning steps and column drops.
usdata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7139 entries, 0 to 40846
Data columns (total 15 columns):
trending_date             7139 non-null datetime64[ns]
channel_title             7139 non-null object
category_id               7139 non-null category
publish_time              7139 non-null object
tags                      7139 non-null object
views                     7139 non-null int64
likes                     7139 non-null int64
dislikes                  7139 non-null int64
comment_count             7139 non-null int64
comments_disabled         7139 non-null bool
ratings_disabled          7139 non-null bool
video_error_or_removed    7139 non-null bool
publish_date              7139 non-null datetime64[ns]
token_title               7139 non-null object
token_description         7139 non-null object
dtypes: bool(3), category(1), datetime64[ns](2), int64(4), object(5)
memory usage: 697.9+ KB


In [14]:
# Persist the cleaned frame as a pickle file in the working directory.
usdata.to_pickle("clean_us.pkl")