In [1]:
import pandas as pd
import numpy as np
import time

In [2]:
#read in the list of videos contained in the csv
df = pd.read_csv('vidz.csv')

In [3]:
#drop columns with redundant information
df = df.drop(columns=['comments_disabled', 'ratings_disabled' ,'thumbnail_link'])

In [43]:
df.categoryId = df.categoryId.astype('object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   video_id       200 non-null    object 
 1   title          200 non-null    object 
 2   publishedAt    200 non-null    object 
 3   channelTitle   200 non-null    object 
 4   categoryId     200 non-null    int64  
 5   trending_date  200 non-null    object 
 6   tags           200 non-null    object 
 7   view_count     200 non-null    int64  
 8   likes          200 non-null    int64  
 9   comment_count  199 non-null    float64
 10  description    200 non-null    object 
 11  duration       200 non-null    object 
dtypes: float64(1), int64(3), object(8)
memory usage: 18.9+ KB


## Convert to datetime --> ppublised at, trending date, duration (timedelta)

In [4]:
#convert publishedAT to datetime column
df.publishedAt = pd.to_datetime(df.publishedAt, utc=True)
df.publishedAt

0     2022-10-26 22:00:09+00:00
1     2022-10-26 15:58:27+00:00
2     2022-10-27 00:18:46+00:00
3     2022-10-27 00:00:09+00:00
4     2022-10-26 05:06:26+00:00
                 ...           
195   2022-10-21 20:48:29+00:00
196   2022-10-20 20:00:01+00:00
197   2022-10-21 04:01:22+00:00
198   2022-10-23 16:00:08+00:00
199   2022-10-20 09:43:11+00:00
Name: publishedAt, Length: 200, dtype: datetime64[ns, UTC]

In [7]:
#set date to next day in order to capture videos released at different times in different time zones
df.trending_date = '22.28.10'

In [8]:
#convert trending to datetime column
df.trending_date = pd.to_datetime(df.trending_date, format='%y.%d.%m', utc=True)
df.trending_date

0     2022-10-28 00:00:00+00:00
1     2022-10-28 00:00:00+00:00
2     2022-10-28 00:00:00+00:00
3     2022-10-28 00:00:00+00:00
4     2022-10-28 00:00:00+00:00
                 ...           
195   2022-10-28 00:00:00+00:00
196   2022-10-28 00:00:00+00:00
197   2022-10-28 00:00:00+00:00
198   2022-10-28 00:00:00+00:00
199   2022-10-28 00:00:00+00:00
Name: trending_date, Length: 200, dtype: datetime64[ns, UTC]

In [11]:
#import tzinfo inorder to strip time zone information from published at. 
#this makes it a "naive" datetime object. may want to change this approach
# from datetime import tzinfo

# df.loc[1,'publishedAt'].replace(tzinfo=None)
#strips the timezone from each row
# for n in range(0,200):
#     df.loc[n,'publishedAt']= df.loc[n,'publishedAt'].replace(tzinfo=None)
# df.loc[10, ['publishedAt']]

In [12]:
#create age column. may want to give timezone info to trending date instead of removing it from pblishedAt
df['age']=(df.trending_date - df.publishedAt)

In [13]:
df.age.sort_values().head(10)

10   0 days 19:55:54
18   0 days 19:59:53
15   0 days 21:59:50
11   0 days 23:40:21
2    0 days 23:41:14
3    0 days 23:59:51
21   0 days 23:59:52
6    1 days 01:48:57
0    1 days 01:59:51
8    1 days 02:59:52
Name: age, dtype: timedelta64[ns]

# View:Like ratio that can score the video | view:comment ratio

Have them all as a weighted ratio

df['engagement'] = (df.likes + df.comment_count * 4 )/df.view_count

In [14]:
#creates engagement metric. not sure how to do weightd columns really
#df['engagement'] = (df.view_count - df.likes) + (df.likes * 2) + (df.comment_count * 4) 
df['engagement'] = (df.likes + df.comment_count * 4 )/df.view_count

In [15]:
df.engagement

0      0.137897
1      0.040702
2      0.100914
3      0.099933
4      0.016842
         ...   
195    0.085803
196    0.041752
197    0.094759
198    0.081997
199    0.040904
Name: engagement, Length: 200, dtype: float64

## Is the video sponsored?


In [44]:
#adds sponsored column based on appearance of word sponsored in the description
df['sponsored'] = np.where(df.description.str.contains('sponsor'), 1, 0)
df[df.description.str.contains('sponsored')].shape

(6, 20)

## Video language


Percent of capital letters in title 


Age restricted


When we talk about subscribers we can take a look at age of channel


How to drive up subscribers is a slightly separate question but we can ask it


At what point does the video view count pass the subscriber view count


We can target placing videos in/out of the top 25 


## Let’s think about tags and how many words they have in common with the descriptions


In [17]:
#countes number of tags given to video BEFORE stripping out extraneous things
df['num_of_tags'] = df.tags.str.split('|').str.len()

In [18]:
#gets rid of separator
df.tags = df.tags.str.replace('|'," ")

  df.tags = df.tags.str.replace('|'," ")


In [19]:
#Imports spacy
import spacy

In [20]:
#creates the nlp object that is going to do the heavy lifting
nlp = spacy.load("en_core_web_sm")

In [21]:
#uses the nlp object to convert the input text into a doc
doc = nlp(df.loc[0].tags)

In [22]:
#goes through tokens (words) in each doc
for token in doc:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

# Iterate over the predicted entities
for ent in doc.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

lil         PROPN     nmod      
durk        PROPN     amod      
lil         PROPN     nmod      
durk        PROPN     compound  
music       PROPN     nmod      
lil         PROPN     nummod    
durk        PROPN     compound  
music       NOUN      compound  
video       NOUN      nsubj     
just        ADV       advmod    
cause       VERB      mark      
y           PRON      nsubj     
all         PRON      appos     
waited      VERB      ROOT      
2           NUM       nummod    
lil         NOUN      compound  
durk        PROPN     dobj      
2020        NUM       nummod    
just        ADV       advmod    
cause       VERB      mark      
y'          PRON      nsubj     
all         PRON      appos     
waited      VERB      advcl     
2           NUM       nummod    
durkio      NOUN      dobj      
smurkio     VERB      conj      
lil         PROPN     compound  
durk        PROPN     amod      
official    ADJ       amod      
drill       NOUN      dobj      
drill     

In [23]:
#creates set of unique words in doc 
tokens = set()
for token in doc:
    tokens.add(token.text)
print(tokens)

{'music', 'drill', 'y', 'durkio', 'hip', '2', 'video', 'family', 'smurkio', 'the', 'OTF', 'only', 'waited', 'lil', 'cause', 'all', 'just', "y'", 'durk', 'official', 'hop', 'chicago', '2020'}


In [25]:
docb = nlp(df.loc[4].description)

In [26]:
for token in docb:
    # Get the token text, part-of-speech tag and dependency label
    token_text = token.text
    token_pos = token.pos_
    token_dep = token.dep_
    # This is for formatting only
    print(f"{token_text:<12}{token_pos:<10}{token_dep:<10}")

# Iterate over the predicted entities
for ent in docb.ents:
    # Print the entity text and its label
    print(ent.text, ent.label_)

The         DET       det       
Inside      PROPN     compound  
fellas      NOUN      nsubj     
break       VERB      ROOT      
down        ADP       prt       
the         DET       det       
Klay        PROPN     compound  
-           PUNCT     punct     
Dbook       NOUN      compound  
interaction NOUN      dobj      
and         CCONJ     cc        
recap       VERB      conj      
Warriors    PROPN     compound  
-           PUNCT     punct     
Suns        PROPN     dobj      
on          ADP       prep      
TNT         PROPN     pobj      
.           PUNCT     punct     
Watch       VERB      ROOT      
highlights  NOUN      dobj      
from        ADP       prep      
Inside      ADP       prep      
the         DET       det       
NBA         PROPN     pobj      
with        ADP       prep      
Shaq        PROPN     pobj      
,           PUNCT     punct     
Charles     PROPN     compound  
Barkley     PROPN     conj      
,           PUNCT     punct     
Kenny     

In [27]:
tokens = set()
for token in docb:
    if token.pos_ not in ['SYM', 'PUNCT', 'DET']:
        print(token.pos_)
        tokens.add(token.text)

for ent in docb.ents:
    print(ent.text, ent.label_)

PROPN
NOUN
VERB
ADP
PROPN
NOUN
NOUN
CCONJ
VERB
PROPN
PROPN
ADP
PROPN
VERB
NOUN
ADP
ADP
PROPN
ADP
PROPN
PROPN
PROPN
PROPN
PROPN
CCONJ
PROPN
PROPN
CCONJ
ADJ
VERB
ADV
PART
AUX
VERB
ADP
ADJ
NOUN
X
SPACE
PROPN
ADP
PROPN
ADP
PROPN
SPACE
VERB
PROPN
ADP
PROPN
ADP
PROPN
NOUN
SPACE
ADP
PROPN
ADP
PROPN
ADP
PROPN
NOUN
SPACE
VERB
PROPN
ADP
PROPN
ADP
PROPN
X
Warriors-Suns ORG
TNT ORG
NBA ORG
Charles Barkley PERSON
Kenny Smith PERSON
Ernie Johnson PERSON
NBA ORG
TNT ORG
TNT ORG
NBA ORG
TNT ORG
TNT ORG
Instagram ORG
{'Follow', 'now', 'Like', 'and', 'NBA', 'Barkley', 'Dbook', 'Shaq', 'Facebook', 'on', 'Kenny', 'Suns', 'TNT', 'https://www.instagram.com/nbaontnt/?hl=en', '\r', 'Subscribe', 'be', 'Connect', 'latest', 'highlights', 'to', 'Ernie', '\r\r', 'more', 'updated', 'Watch', 'videos', 'down', 'Klay', 'https://www.youtube.com/nbaontnt?sub_confirmation=1', 'https://twitter.com/NBAonTNT', 'break', 'recap', 'Johnson', 'interaction', 'Inside', 'from', 'with', 'Instagram', 'https://www.facebook.com/NBAONT

In [28]:
for ent in docb.ents:
    print(ent.text, ent.label_)

Warriors-Suns ORG
TNT ORG
NBA ORG
Charles Barkley PERSON
Kenny Smith PERSON
Ernie Johnson PERSON
NBA ORG
TNT ORG
TNT ORG
NBA ORG
TNT ORG
TNT ORG
Instagram ORG


In [24]:
def clean_text(text):
    """ 
    Purpose:
        to clean text input into function by removing duplicate words, punctuations, and other things
    ---
    Parameters:
        text: a string
    ---
    Returns:
        tokens: a set of words found in the input text
    """

    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    tokens = set()
    
    for token in doc:
        if token.pos_ not in ['SYM', 'PUNCT', 'DET']:
            tokens.add(token.text)

    for ent in doc.ents:
        tokens.add(ent.text)

    return tokens
    
    

In [31]:
df['cleaned_tags'] =  df['description'].apply(lambda x: clean_text(x))

In [29]:
df['cleaned_desc'] = df['description'].apply(lambda x: clean_text(x))

## Think about combining the countries top 25 lists and control for duplicates. 
* This way we can classify what videos have been a top 25 video 


In [None]:
#create rank and top 25 categories

In [38]:
df['rank'] = df.index + 1

In [41]:
df['top_25'] = np.where(df['rank'] < 26, 1, 0)

In [42]:
df.top_25

0      1
1      1
2      1
3      1
4      1
      ..
195    0
196    0
197    0
198    0
199    0
Name: top_25, Length: 200, dtype: int64