### Import our dependencies

In [1]:
# Import our dependencies
import pandas as pd
import matplotlib as plt
from sklearn.datasets import make_blobs
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import numpy as np
import os
import seaborn as sns
from matplotlib import pyplot as plt
from matplotlib import ticker
import matplotlib.colors as mc
import statsmodels.formula.api as sm
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm

### Import and read csv files

In [2]:
model_df = pd.read_csv('model_df.csv')

### Create a discrete variable into a bucket

In [3]:
# Create discrete variable to bucket videos with than 1 million views that includes the top 100 tags
model_df['More_than_1M_views'] = np.where(model_df['views'] > 100000, 1, 0)
model_df

Unnamed: 0.1,Unnamed: 0,tags,views,has_funny_tag,has_comedy_tag,has_how to_tag,has_Pop_tag,has_2018_tag,has_music_tag,has_humor_tag,...,has_laugh_tag,has_pop_tag,has_cook_tag,has_song_tag,has_unboxing_tag,has_pets_tag,has_dance_tag,has_talk_tag,has_Music_tag,More_than_1M_views
0,0,SHANtell martin,748374,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,"last week tonight trump presidency|""last week ...",2418783,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,2,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
3,3,"rhett and link|""gmm""|""good mythical morning""|""...",343168,1,1,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
4,4,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40944,40944,"aarons animals|""aarons""|""animals""|""cat""|""cats""...",1685609,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40945,40945,[none],1064798,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40946,40946,I gave safiya nygaard a perfect hair makeover ...,1066451,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
40947,40947,"Black Panther|""HISHE""|""Marvel""|""Infinity War""|...",5660813,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Split the data into X and y
y = model_df['More_than_1M_views']
X = model_df.drop(['views', 'tags', 'More_than_1M_views', 'Unnamed: 0'], axis = 1)
X = sm.add_constant(X)

In [5]:
X.sum()

const               40949.0
has_funny_tag        4711.0
has_comedy_tag       3310.0
has_how to_tag       3149.0
has_Pop_tag          1447.0
                     ...   
has_unboxing_tag      411.0
has_pets_tag          433.0
has_dance_tag         748.0
has_talk_tag         1263.0
has_Music_tag        1419.0
Length: 101, dtype: float64

In [6]:
y.sum()

35619

In [7]:
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

### Print Logistic Regression Results

In [8]:
logit_model = sm.Logit(y, X)
logit_res = logit_model.fit()
logit_res.summary()

Optimization terminated successfully.
         Current function value: 0.356736
         Iterations 9


0,1,2,3
Dep. Variable:,More_than_1M_views,No. Observations:,40949.0
Model:,Logit,Df Residuals:,40848.0
Method:,MLE,Df Model:,100.0
Date:,"Sat, 16 Jan 2021",Pseudo R-squ.:,0.07747
Time:,19:55:10,Log-Likelihood:,-14608.0
converged:,True,LL-Null:,-15835.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,1.6530,0.022,73.934,0.000,1.609,1.697
has_funny_tag,0.4597,0.147,3.118,0.002,0.171,0.749
has_comedy_tag,-0.0135,0.093,-0.144,0.885,-0.196,0.169
has_how to_tag,-0.0646,0.072,-0.894,0.371,-0.206,0.077
has_Pop_tag,0.6162,0.111,5.560,0.000,0.399,0.833
has_2018_tag,0.5567,0.064,8.752,0.000,0.432,0.681
has_music_tag,0.2143,0.085,2.528,0.011,0.048,0.380
has_humor_tag,0.3348,0.202,1.660,0.097,-0.060,0.730
has_food_tag,0.8081,0.115,7.051,0.000,0.583,1.033


### Find tags that has a significant p value

In [9]:
significant_tags = []
for tag, pvalue in dict(logit_res.params).items():
    if pvalue < 0.05:
        print(tag)
        significant_tags.append(tag)

has_comedy_tag
has_how to_tag
has_news_tag
has_celebrity_tag
has_tutorial_tag
has_live_tag
has_interview_tag
has_video_tag
has_cooking_tag
has_celebrities_tag
has_cute_tag
has_Comedy_tag
has_hollywood_tag
has_NBC_tag
has_2017_tag
has_sports_tag
has_jokes_tag
has_recipe_tag
has_sketch_tag
has_documentary_tag
has_technology_tag
has_show_tag
has_family_tag
has_hilarious_tag
has_funny videos_tag
has_games_tag
has_drama_tag
has_film_tag
has_christmas_tag
has_kitchen_tag
has_joke_tag
has_love_tag
has_vlogger_tag
has_CBS_tag
has_host_tag
has_parody_tag
has_highlight_tag
has_politics_tag
has_lol_tag
has_laugh_tag
has_pop_tag
has_pets_tag
has_talk_tag


In [10]:
# Find number of significant tags
len(significant_tags)

43

In [11]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# # Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
#Define the logistic regression model
log_classifier = LogisticRegression()

In [13]:
# Train the model
log_classifier.fit(X_train_scaled,y_train)

LogisticRegression()

In [14]:
# Returns a NumPy Array
# Predict for One Observation
log_classifier.predict(X_test_scaled)

array([1, 1, 1, ..., 1, 1, 1])

In [15]:
# Use score method to get accuracy of model
score = log_classifier.score(X_test_scaled, y_test)
print(f"logistic regression model accuracy: {score}")

logistic regression model accuracy: 0.873015873015873


In [16]:
#Define the random forest model
rfc = RandomForestClassifier(max_depth=2, random_state=0)

In [17]:
# Train the model
rfc.fit(X_train_scaled,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [18]:
# Predict for one observation
y_predict = rfc.predict(X_test_scaled)
print(f" random forest model accuracy: {accuracy_score(y_test,y_predict):.3f}") 

 random forest model accuracy: 0.872
