In [2]:
import pandas as pd  
import numpy as np
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords

#  Data Preparation

In [3]:
data = pd.read_csv("USvideos.csv")

In [4]:
data = data[data.tags !='[none]']

In [5]:
data.columns.values

array(['video_id', 'title', 'channel_title', 'category_id', 'tags',
       'views', 'likes', 'dislikes', 'comment_total', 'date'], dtype=object)

In [6]:
data.shape

(7507, 10)

In [16]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size = 0.2, random_state=0)

train, val = train_test_split(train, test_size = 0.25, random_state=0)

In [17]:
train.shape

(4503, 10)

In [18]:
val.shape

(1502, 10)

In [19]:
test.shape

(1502, 10)

In [20]:
def tags_to_phrases( raw_tags ):
    # Function to convert a raw tags to a string of phrases
    # The input is a single string (a raw video tags), and 
    # the output is a single string (a preprocessed video tags)
     
    phrases = raw_tags.lower().split('|')                           
    
    # In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set
    # to improve execution time this conversion should be done once
    stops = set(stopwords.words("english"))                  
     
    # Remove stop words
    meaningful_words = [w for w in phrases if not w in stops]   
    
    # Join the words back into one string separated by space, 
    # and return the result.
    return( ",".join( meaningful_words ))  

In [21]:
# Initialize an empty list to hold the clean tags
clean_train_tags = []

In [22]:
for key in train['tags']:
    clean_train_tags.append( tags_to_phrases(key))

In [23]:
clean_train_tags[1]

'google lens,chromebook,chrome os,google store,google home,pixel,pixel xl,daydream,vr,ar,android,made by google,october 4th,oct. 4th,oct 4th,launch,new,hardware,phone,phones,devices,device,living room,home,chromecast,accessories,google hardware,tech annoucement,10/4,#madebygoogle,google phone,launch,headset,google,max,mini,clips,pixelbook,buds,google store,project fi,ml,machine learning'

In [24]:
len(clean_train_tags)

4503

In [29]:
clean_val_tags = []

print ("Cleaning and parsing the validation set tags...\n")
for key in val['tags']:
    clean_val_tags.append(tags_to_phrases(key))

Cleaning and parsing the validation set tags...



In [26]:
clean_test_tags = []

print ("Cleaning and parsing the test set tags...\n")
for key in test['tags']:
    clean_test_tags.append(tags_to_phrases(key))

Cleaning and parsing the test set tags...



# Data Visualization

In [27]:
clean_data_tags = []

for key in data['tags']:
    clean_data_tags.append(tags_to_phrases(key))

In [28]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt


text = ','.join(clean_data_tags)

wordcloud = WordCloud(background_color='white',
                          width=800,
                          height=400
                         ).generate(text)

plt.figure(figsize=(24,14), dpi=1200)
plt.imshow(wordcloud)
plt.axis('off')
plt.savefig('cloud.png')

# Modeling

## 1. First attempt: using words in tags to predict 'views' 
Model: random forest + neuralnets


In [30]:
print ("Creating the bag of words...\n")
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the "CountVectorizer" object, which is scikit-learn's
# bag of words tool.  
vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000) 
# Note that CountVectorizer comes with its own options to automatically do preprocessing, tokenization, and stop word removal -- for each of these, instead of specifying "None", we could have used a built-in method or specified our own function to use.

# fit_transform() does two functions: First, it fits the model
# and learns the vocabulary; second, it transforms our training data
# into feature vectors. The input to fit_transform should be a list of 
# strings.
train_data_features = vectorizer.fit_transform(clean_train_tags)

# Numpy arrays are easy to work with, so convert the result to an 
# array
train_data_features = train_data_features.toarray()

Creating the bag of words...



In [31]:
# Take a look at the words in the vocabulary
vocab = vectorizer.get_feature_names()
print (vocab)



In [32]:
# Sum up the counts of each vocabulary word
dist = np.sum(train_data_features, axis=0)

# For each, print the vocabulary word and the number of times it 
# appears in the training set
for tag, count in zip(vocab, dist):
    print (count, tag)

(83, u'00')
(9, u'00z')
(30, u'04')
(20, u'08282016ntflxuscan')
(21, u'09')
(6, u'09t20')
(180, u'10')
(31, u'100')
(10, u'1000')
(10, u'10000')
(16, u'101')
(5, u'1017')
(6, u'103')
(5, u'1080')
(9, u'10m')
(37, u'11')
(8, u'116')
(53, u'12')
(17, u'13')
(8, u'13th')
(5, u'14')
(5, u'15')
(12, u'17')
(6, u'172')
(5, u'1727')
(7, u'1728')
(6, u'1922')
(12, u'1987')
(7, u'1d')
(5, u'2015')
(31, u'2016')
(615, u'2017')
(26, u'2018')
(21, u'2049')
(6, u'2050')
(9, u'20th')
(5, u'216')
(8, u'217')
(21, u'24')
(33, u'25')
(5, u'25th')
(22, u'29')
(15, u'2d')
(20, u'30')
(27, u'360')
(8, u'365')
(6, u'3am')
(12, u'3d')
(18, u'40')
(26, u'43')
(8, u'45m')
(9, u'49ers')
(15, u'4k')
(19, u'4th')
(15, u'50')
(8, u'500')
(9, u'50mm')
(5, u'55')
(9, u'60')
(10, u'64')
(7, u'69th')
(36, u'73')
(9, u'797')
(16, u'80s')
(9, u'85')
(20, u'90s')
(8, u'91')
(9, u'93')
(6, u'98')
(28, u'a24')
(6, u'a321')
(26, u'aaron')
(12, u'aarp')
(8, u'ab')
(13, u'abandoned')
(54, u'abc')
(5, u'abdul')
(33, u'about')

(8, u'drug')
(15, u'drugs')
(48, u'drugstore')
(9, u'drunk')
(9, u'dry')
(42, u'dt')
(6, u'dub')
(11, u'duck')
(44, u'dude')
(7, u'dumbledore')
(30, u'dunkin')
(12, u'dunn')
(12, u'durability')
(8, u'duran')
(18, u'durant')
(23, u'dusk')
(16, u'dustin')
(30, u'dwayne')
(33, u'dwts')
(15, u'dwyane')
(9, u'dxo')
(15, u'dxomark')
(12, u'dylan')
(6, u'dynasty')
(7, u'e30')
(97, u'earth')
(38, u'earthquake')
(22, u'east')
(159, u'easy')
(55, u'eat')
(6, u'eater')
(80, u'eating')
(18, u'eats')
(42, u'ebay')
(11, u'ebe')
(11, u'ebro')
(24, u'echosmith')
(10, u'economy')
(56, u'ed')
(6, u'eden')
(14, u'edge')
(6, u'edgy')
(10, u'edible')
(47, u'edition')
(6, u'editor')
(29, u'edm')
(94, u'education')
(18, u'educational')
(6, u'edward')
(15, u'effect')
(27, u'effects')
(41, u'efron')
(8, u'egg')
(12, u'eiffel')
(8, u'eilish')
(7, u'einstein')
(5, u'eire')
(5, u'eisner')
(5, u'elba')
(15, u'elders')
(16, u'eldredge')
(6, u'eldridge')
(17, u'election')
(7, u'elections')
(39, u'electric')
(5, u'el

(9, u'remove')
(8, u'remover')
(22, u'ren')
(6, u'renewable')
(9, u'repeal')
(5, u'replace')
(8, u'replica')
(7, u'replicant')
(18, u'replicas')
(9, u'report')
(9, u'reporter')
(14, u'reporting')
(10, u'republic')
(12, u'republicans')
(9, u'reputation')
(16, u'request')
(57, u'rescue')
(7, u'rescuers')
(11, u'research')
(5, u'reserve')
(5, u'respond')
(14, u'responds')
(20, u'restaurant')
(6, u'restaurants')
(7, u'retail')
(6, u'retina')
(6, u'retired')
(5, u'retriever')
(15, u'retro')
(34, u'reunion')
(7, u'reveal')
(5, u'reverse')
(405, u'review')
(43, u'reviews')
(10, u'rexha')
(16, u'rey')
(10, u'reynolds')
(8, u'rhec')
(160, u'rhett')
(18, u'rice')
(21, u'rich')
(13, u'richard')
(174, u'rick')
(5, u'rickandmorty')
(67, u'rico')
(9, u'ride')
(5, u'rider')
(7, u'rides')
(11, u'ridiculous')
(51, u'ridley')
(5, u'rifle')
(13, u'right')
(19, u'rights')
(145, u'rihanna')
(13, u'riihimaki')
(6, u'riker')
(8, u'ring')
(8, u'rings')
(11, u'rip')
(5, u'ripped')
(8, u'rita')
(17, u'river')
(

Random forest with n_estimators = 18

In [33]:
print ("Training the random forest...")
from sklearn.ensemble import RandomForestRegressor

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestRegressor(n_estimators = 18) 

# Fit the forest to the training set, using the bag of words as 
# features and the views labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit( train_data_features, train["views"] )

Training the random forest...


In [34]:
# Get a bag of words for the test set, and convert to a numpy array
val_data_features = vectorizer.transform(clean_val_tags)
np.asarray(val_data_features)

# Use the random forest to make sentiment label predictions
print ("Predicting validation labels...\n")
result = forest.predict(val_data_features)

# Copy the results to a pandas dataframe with an "id" column and
# a "views" column
output = pd.DataFrame( data={"video_id":val["video_id"], "views":val["views"], "views_preds":result} )


Predicting validation labels...



In [35]:
mse = np.mean((output['views'] - output['views_preds'])**2)

In [36]:
print (mse)

2.81221310464e+12


Neural net

In [37]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,),max_iter=500)
neuralnets = mlp.fit(train_data_features, train["views"] )

In [38]:
result_n = neuralnets.predict(val_data_features)
output_n = pd.DataFrame( data={"video_id":val["video_id"], "views":val["views"], "views_preds":result_n} )

In [39]:
output_n

Unnamed: 0,video_id,views,views_preds
3633,xGuGjvIfof8,168468,294117
2554,b-znn2eQL08,380526,338457
6241,pSMCldcVGgA,16268,33098
6486,abeF5zQbQEM,393046,440886
1153,zcqZHYo7ONs,994795,1077745
6745,qEp3rwe3KmI,18419,6104
5361,D6oaRAgdslE,907723,681981
98,OudFElslbh4,1617247,6104
5832,Qg7YzqCRoZ0,186067,233063
3986,yAZwhN-WKGg,21837,12854


In [40]:
mse_n = np.mean((output_n['views'] - output_n['views_preds'])**2)

In [41]:
mse_n

3356153597270.9614

Ensemble method: averaging

In [42]:
ensem = (output_n['views_preds']  + output['views_preds']) /2

In [43]:
mse_two = np.mean((output_n['views'] - ensem)**2)

In [44]:
mse_two

2754967252219.076

## Second attempt: using phrases in tags to predict 'views'
Model: random forest + neuralnets


In [45]:
k = ','.join(clean_train_tags)
k = k.split(",")

In [46]:
from collections import Counter
import re
word_count = Counter(k)

In [47]:
top5000 = word_count.most_common(5000)
top5000

[('funny', 488),
 ('comedy', 423),
 ('2017', 202),
 ('music', 198),
 ('news', 184),
 ('how to', 179),
 ('trailer', 176),
 ('makeup', 170),
 ('food', 165),
 ('celebrity', 165),
 ('vlog', 163),
 ('humor', 156),
 ('tutorial', 154),
 ('science', 153),
 ('review', 150),
 ('pop', 149),
 ('beauty', 146),
 ('interview', 145),
 ('video', 137),
 ('halloween', 129),
 ('hollywood', 125),
 ('diy', 120),
 ('fashion', 108),
 ('television', 104),
 ('recipe', 100),
 ('sketch', 99),
 ('late night', 99),
 ('live', 98),
 ('comedian', 96),
 ('movie', 93),
 ('celebrities', 92),
 ('film', 92),
 ('entertainment', 90),
 ('best', 89),
 ('youtube', 88),
 ('nbc', 87),
 ('cooking', 86),
 ('official', 86),
 ('talk show', 86),
 ('records', 84),
 ('music video', 84),
 ('funny video', 84),
 ('donald trump', 83),
 ('sports', 83),
 ('buzzfeed', 81),
 ('famous', 79),
 ('new', 78),
 ('apple', 77),
 ('lifestyle', 76),
 ('fun', 75),
 ('nfl', 75),
 ('official trailer', 73),
 ('tv', 71),
 ('new york', 71),
 ('alternative', 70

In [48]:
lst = []
for value, count in top5000:
    lst.append(value)

In [49]:
freq_train = []
for word in lst:
    l = []
    for i in clean_train_tags:
        splits = i.split(",")
        if word in splits:
            l.append(1)
        else:
            l.append(0)
    l = np.asarray(l)
    freq_train.append(l)

In [50]:
freq__train_table = pd.DataFrame(freq_train)
freq__train_table = pd.DataFrame.transpose(freq__train_table)
freq__train_array = freq__train_table.values

Random Forest

In [104]:
print ("Training the random forest...")
from sklearn.ensemble import RandomForestRegressor

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestRegressor(n_estimators = 5) 

# Fit the forest to the training set, using the bag of words as 
# features and the views labels as the response variable
#
# This may take a few minutes to run
forest = forest.fit(freq__train_array, train["views"] )

Training the random forest...


In [105]:
freq = []
for word in lst:
    l = []
    for i in clean_val_tags:
        splits = i.split(",")
        if word in splits:
            l.append(1)
        else:
            l.append(0)
    l = np.asarray(l)
    freq.append(l)
    
freq_array = np.asarray(freq)
freq_table = pd.DataFrame(freq)
freq_table = pd.DataFrame.transpose(freq_table)
freq_array = freq_table.values

In [106]:
result = forest.predict(freq_array)

# Copy the results to a pandas dataframe with an "id" column and
# a "views" column
output = pd.DataFrame( data={"video_id":val["video_id"], "views":val["views"], "views_preds":result} )

In [107]:
mse = np.mean((output['views'] - output['views_preds'])**2)

In [108]:
print (mse)

3.09146239283e+12


Neural network

In [95]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100,),max_iter=500)
neuralnets = mlp.fit(freq__train_array, train["views"] )

In [96]:
result_n = neuralnets.predict(freq_array)

output_n = pd.DataFrame( data={"video_id":val["video_id"], "views":val["views"], "views_preds":result} )

In [97]:
mse_n = np.mean((output_n['views'] - output_n['views_preds'])**2)

In [98]:
mse_n

3195527762453.9126

Ensemble : averaging

In [109]:
ensem = (output_n['views_preds']  + output['views_preds']) /2

In [110]:
mse_two = np.mean((output_n['views'] - ensem)**2)

In [111]:
mse_two

3017388949132.5747

Since the model using words in our first attempt has better performance on the validation data, we choose this model to make our prediction.

# Use the test data to evaluate the performance of the chosen model

In [112]:
test_data_features = vectorizer.transform(clean_test_tags)
np.asarray(test_data_features)


print ("Predicting test labels...\n")
result = forest.predict(test_data_features)
output = pd.DataFrame( data={"video_id":test["video_id"], "views":test["views"], "views_preds":result} )
result_n = neuralnets.predict(test_data_features)
output_n = pd.DataFrame( data={"video_id":test["video_id"], "views":test["views"], "views_preds":result_n} )

ensem = (output_n['views_preds']  + output['views_preds']) /2

mse_two = np.mean((output_n['views'] - ensem)**2)
mse_two

Predicting test labels...



6184319423547.551