### Import our dependencies

In [1]:
import pandas as pd
import matplotlib as plt
from sklearn.datasets import make_blobs
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker
import matplotlib.colors as mc
import statsmodels.formula.api as sm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

### Import and read csv files

In [2]:
model_df = pd.read_csv('model_df.csv')

### Create a discrete variable into a bucket

In [3]:
# Create discrete variable to bucket videos with than 1 million views that includes the top 100 tags
model_df['More_than_1M_views'] = np.where(model_df['viewers'] > 100000, 1, 0)
model_df

Unnamed: 0.1,Unnamed: 0,tags,viewers,has_funny_tag,has_comedy_tag,has_how to_tag,has_music_tag,has_Pop_tag,has_2018_tag,has_humor_tag,...,has_cosmetics_tag,has_NBC TV_tag,has_cartoon_tag,has_snl_tag,has_highlight_tag,has_politics_tag,has_lol_tag,has_Music_tag,has_laugh_tag,More_than_1M_views
0,0,SHANtell martin,748374,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,last week tonight trump presidency|last week t...,2418783,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,racist superman|rudy|mancuso|king|bach|racist|...,3191434,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3,rhett and link|gmm|good mythical morning|rhett...,343168,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,4,ryan|higa|higatv|nigahiga|i dare you|idy|rhpc|...,2095731,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,40944,aarons animals|aarons|animals|cat|cats|kitten|...,1685609,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40945,40945,[none],1064798,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40946,40946,I gave safiya nygaard a perfect hair makeover ...,1066451,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40947,40947,Black Panther|HISHE|Marvel|Infinity War|How It...,5660813,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Split the data into X and y
y = model_df['More_than_1M_views']
X = model_df.drop(['viewers', 'tags', 'More_than_1M_views', 'Unnamed: 0'], axis = 1)
X = sm.add_constant(X)

In [5]:
X.sum()

const                40949.0
has_funny_tag         4711.0
has_comedy_tag        3310.0
has_how to_tag        3149.0
has_music_tag         3172.0
                      ...   
has_highlight_tag     1064.0
has_politics_tag       423.0
has_lol_tag            429.0
has_Music_tag         1419.0
has_laugh_tag          585.0
Length: 101, dtype: float64

In [6]:
y.sum()

35619

In [7]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

### Print Logistic Regression Results

In [8]:
logit_model = sm.Logit(y, X)
logit_res = logit_model.fit()
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.356508
         Iterations 9


0,1,2,3
Dep. Variable:,More_than_1M_views,No. Observations:,40949.0
Model:,Logit,Df Residuals:,40848.0
Method:,MLE,Df Model:,100.0
Date:,"Thu, 21 Jan 2021",Pseudo R-squ.:,0.07806
Time:,18:35:41,Log-Likelihood:,-14599.0
converged:,True,LL-Null:,-15835.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6341,0.022,72.703,0.000,1.590,1.678
has_funny_tag,0.6048,0.144,4.198,0.000,0.322,0.887
has_comedy_tag,-0.0455,0.092,-0.495,0.621,-0.226,0.135
has_how to_tag,-0.0778,0.073,-1.071,0.284,-0.220,0.065
has_music_tag,0.2599,0.080,3.243,0.001,0.103,0.417
has_Pop_tag,0.6326,0.111,5.716,0.000,0.416,0.849
has_2018_tag,0.5821,0.064,9.140,0.000,0.457,0.707
has_humor_tag,0.4068,0.197,2.069,0.039,0.022,0.792
has_food_tag,0.7367,0.115,6.388,0.000,0.511,0.963


### Find tags that has a significant p value

In [9]:
significant_tags = []
for tag, pvalue in dict(logit_res.params).items():
    if pvalue < 0.05:
        print(tag)
        significant_tags.append(tag)

has_comedy_tag
has_how to_tag
has_news_tag
has_celebrity_tag
has_video_tag
has_tutorial_tag
has_live_tag
has_interview_tag
has_cooking_tag
has_celebrities_tag
has_fun_tag
has_movie_tag
has_cute_tag
has_Comedy_tag
has_2017_tag
has_NBC_tag
has_hollywood_tag
has_sports_tag
has_jokes_tag
has_recipe_tag
has_sketch_tag
has_documentary_tag
has_reaction_tag
has_technology_tag
has_family_tag
has_show_tag
has_funny videos_tag
has_christmas_tag
has_games_tag
has_DIY_tag
has_drama_tag
has_kitchen_tag
has_film_tag
has_love_tag
has_joke_tag
has_vlogger_tag
has_cook_tag
has_CBS_tag
has_comedic_tag
has_host_tag
has_parody_tag
has_NBC TV_tag
has_cartoon_tag
has_highlight_tag
has_politics_tag
has_lol_tag
has_laugh_tag


In [10]:
# Find number of significant tags
len(significant_tags)

47

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# # Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Define the logistic regression model
log_classifier = LogisticRegression()

In [13]:
# Train the model
log_classifier.fit(X_train_scaled,y_train)

LogisticRegression()

In [14]:
# Returns a NumPy Array
# Predict for One Observation
log_classifier.predict(X_test_scaled)

array([1, 1, 0, ..., 1, 1, 1])

In [15]:
# Use score method to get accuracy of model
score = log_classifier.score(X_test_scaled, y_test)
print(f"logistic regression model accuracy: {score}")

logistic regression model accuracy: 0.8759462759462759


In [16]:
# Define the random forest model
rfc = RandomForestClassifier(max_depth=2, random_state=0)

In [17]:
# Train the model
rfc.fit(X_train_scaled,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [18]:
# Predict for one observation
y_predict = rfc.predict(X_test_scaled)
print(f" random forest model accuracy: {accuracy_score(y_test,y_predict):.3f}")

 random forest model accuracy: 0.876


In [19]:
# Create discrete variable to bucket videos with than 1 million views that includes the top 100 tags
model_df['More_than_5M_views'] = np.where(model_df['viewers'] > 500000, 1, 0)
model_df

Unnamed: 0.1,Unnamed: 0,tags,viewers,has_funny_tag,has_comedy_tag,has_how to_tag,has_music_tag,has_Pop_tag,has_2018_tag,has_humor_tag,...,has_NBC TV_tag,has_cartoon_tag,has_snl_tag,has_highlight_tag,has_politics_tag,has_lol_tag,has_Music_tag,has_laugh_tag,More_than_1M_views,More_than_5M_views
0,0,SHANtell martin,748374,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
1,1,last week tonight trump presidency|last week t...,2418783,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
2,2,racist superman|rudy|mancuso|king|bach|racist|...,3191434,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
3,3,rhett and link|gmm|good mythical morning|rhett...,343168,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,4,ryan|higa|higatv|nigahiga|i dare you|idy|rhpc|...,2095731,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,40944,aarons animals|aarons|animals|cat|cats|kitten|...,1685609,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
40945,40945,[none],1064798,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
40946,40946,I gave safiya nygaard a perfect hair makeover ...,1066451,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1
40947,40947,Black Panther|HISHE|Marvel|Infinity War|How It...,5660813,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [20]:
# Split the data into X and y
y = model_df['More_than_5M_views']
X = model_df.drop(['viewers', 'tags','More_than_1M_views', 'More_than_5M_views', 'Unnamed: 0'], axis = 1)
X = sm.add_constant(X)

In [21]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [22]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# # Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Define the logistic regression model
log_classifier = LogisticRegression()

In [24]:
# Train the model
log_classifier.fit(X_train_scaled,y_train)

LogisticRegression()

In [25]:
# Returns a NumPy Array
# Predict for One Observation
log_classifier.predict(X_test_scaled)

array([1, 1, 0, ..., 0, 1, 1])

In [26]:
# Use score method to get accuracy of model
score = log_classifier.score(X_test_scaled, y_test)
print(f"logistic regression model accuracy: {score}")

logistic regression model accuracy: 0.6192918192918193


In [27]:
# Define the random forest model
rfc = RandomForestClassifier(max_depth=2, random_state=0)

In [28]:
# Train the model
rfc.fit(X_train_scaled,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [29]:
# Predict for one observation
y_predict = rfc.predict(X_test_scaled)
print(f" random forest model accuracy: {accuracy_score(y_test,y_predict):.3f}")

 random forest model accuracy: 0.580


In [30]:
# Create discrete variable to bucket videos with than 1 million views that includes the top 100 tags
model_df['More_than_10M_views'] = np.where(model_df['viewers'] > 1000000, 1, 0)
model_df

Unnamed: 0.1,Unnamed: 0,tags,viewers,has_funny_tag,has_comedy_tag,has_how to_tag,has_music_tag,has_Pop_tag,has_2018_tag,has_humor_tag,...,has_cartoon_tag,has_snl_tag,has_highlight_tag,has_politics_tag,has_lol_tag,has_Music_tag,has_laugh_tag,More_than_1M_views,More_than_5M_views,More_than_10M_views
0,0,SHANtell martin,748374,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
1,1,last week tonight trump presidency|last week t...,2418783,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
2,2,racist superman|rudy|mancuso|king|bach|racist|...,3191434,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,1,1
3,3,rhett and link|gmm|good mythical morning|rhett...,343168,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,4,ryan|higa|higatv|nigahiga|i dare you|idy|rhpc|...,2095731,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,40944,aarons animals|aarons|animals|cat|cats|kitten|...,1685609,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
40945,40945,[none],1064798,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
40946,40946,I gave safiya nygaard a perfect hair makeover ...,1066451,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
40947,40947,Black Panther|HISHE|Marvel|Infinity War|How It...,5660813,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [31]:
# Split the data into X and y
y = model_df['More_than_10M_views']
X = model_df.drop(['viewers', 'tags', 'More_than_1M_views', 'More_than_5M_views','More_than_10M_views','Unnamed: 0'], axis = 1)
X = sm.add_constant(X)

In [32]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

In [33]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# # Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [34]:
# Define the logistic regression model
log_classifier = LogisticRegression()

In [35]:
# Train the model
log_classifier.fit(X_train_scaled,y_train)

LogisticRegression()

In [36]:
# Returns a NumPy Array
# Predict for One Observation
log_classifier.predict(X_test_scaled)

array([1, 0, 0, ..., 0, 0, 1])

In [37]:
# Use score method to get accuracy of model
score = log_classifier.score(X_test_scaled, y_test)
print(f"logistic regression model accuracy: {score}")

logistic regression model accuracy: 0.6649572649572649


In [38]:
# Define the random forest model
rfc = RandomForestClassifier(max_depth=2, random_state=0)

In [39]:
# Train the model
rfc.fit(X_train_scaled,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [40]:
# Predict for one observation
y_predict = rfc.predict(X_test_scaled)
print(f" random forest model accuracy: {accuracy_score(y_test,y_predict):.3f}")

 random forest model accuracy: 0.608
