# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import spacy
import re
import model
import env
import os

from importlib import reload
from itertools import product
from math import sqrt
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


## Acquire data

In [2]:
# Load the prepared DataFrame from the local pickle file 'prepared.pkl'.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
df = pd.read_pickle('prepared.pkl')

## Model

After analyzing the Top 200 trending YouTube videos, the exploration phase identified a range of possible drivers of Top 25 trending YouTube videos. In this section, we build machine learning models to predict whether a video reaches the Top 25, and we use our takeaways and recommendations with an eye towards enabling smaller creators to produce content in the mode of the Top 25.

Top_25 is our target variable

We will fit our models on the training data and then tune them on the validate data.

We will pick our best model on accuracy.

Four supervised machine learning classification models were created in this project:

 - Decision Tree
 - Random Forest
 - K-Nearest Neighbor
 - Logistic Regression 

## Split Data
 - Split data into 3 samples of train (60%), validate(20%) and test(20%)
 - Our target variable is Top_25

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   video_id                 2019 non-null   object             
 1   title                    2019 non-null   object             
 2   publishedAt              2019 non-null   datetime64[ns, UTC]
 3   channelTitle             2019 non-null   object             
 4   categoryId               2019 non-null   object             
 5   trending_date            2019 non-null   datetime64[ns, UTC]
 6   tags                     2019 non-null   object             
 7   view_count               2019 non-null   int64              
 8   likes                    2019 non-null   int64              
 9   comment_count            2019 non-null   int64              
 10  thumbnail_link           2019 non-null   object             
 11  comments_disabled        2019 

In [4]:
# One-hot encode the categorical columns 'categoryId' and 'region';
# drop_first=True omits the first level of each column from the dummies.
dummy_df = pd.get_dummies(df[['categoryId','region']], drop_first=True)

In [5]:
# Append the dummy columns to the original frame (column-wise concat).
# NOTE(review): this cell is not idempotent; re-running it appends the
# dummy columns a second time.
df = pd.concat([df, dummy_df], axis=1)

In [6]:
# Snapshot of df (including the dummy columns) taken before the unused
# columns are dropped.
# NOTE(review): df_copy is not referenced again in the visible notebook.
df_copy = df.copy()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 61 columns):
 #   Column                          Non-Null Count  Dtype              
---  ------                          --------------  -----              
 0   video_id                        2019 non-null   object             
 1   title                           2019 non-null   object             
 2   publishedAt                     2019 non-null   datetime64[ns, UTC]
 3   channelTitle                    2019 non-null   object             
 4   categoryId                      2019 non-null   object             
 5   trending_date                   2019 non-null   datetime64[ns, UTC]
 6   tags                            2019 non-null   object             
 7   view_count                      2019 non-null   int64              
 8   likes                           2019 non-null   int64              
 9   comment_count                   2019 non-null   int64              
 10  thumbnail_li

In [8]:
# Drop every column that is not used for modeling, leaving the numeric
# features, the dummy columns and the target (top_25).
# The list names each column exactly once, and the result is reassigned
# rather than mutated with inplace=True.
columns_to_drop = [
    'video_id', 'title', 'publishedAt', 'categoryId', 'trending_date', 'tags',
    'channelTitle', 'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'description', 'captions', 'region', 'channel_age', 'rank',
    'subscribers', 'video_count', 'age', 'engagement', 'sponsored', 'word_bank',
    'cleaned_tags', 'cleaned_desc', 'title_in_description', 'title_in_tags',
]
df = df.drop(columns=columns_to_drop)

df

Unnamed: 0,view_count,likes,comment_count,duration,top_25,num_of_tags,pct_tags_in_description,title_lengths,desc_lengths,tags_length,...,region_CA,region_DE,region_FR,region_GB,region_IND,region_JP,region_KR,region_MX,region_RU,region_US
0,1098919,19090,861,1281,1,67,0.013889,79,200,325,...,0,0,0,0,0,0,1,0,0,0
1,2217807,182434,7282,4327,1,15,0.117647,81,2613,96,...,1,0,0,0,0,0,0,0,0,0
2,2258144,44366,938,24001,1,28,0.289474,62,1874,444,...,0,1,0,0,0,0,0,0,0,0
3,3262953,312903,14437,1862,1,32,0.447368,63,1132,446,...,0,0,0,0,0,0,0,0,1,0
4,313545,4899,596,965,1,9,0.000000,63,187,65,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,1902762,78690,1712,1099,0,27,0.068966,86,1313,327,...,0,0,0,0,0,0,0,1,0,0
2015,1976426,48816,3060,1869,0,1,1.000000,28,1517,6,...,0,0,0,0,0,1,0,0,0,0
2016,1529830,114260,11625,166,0,1,0.000000,64,352,0,...,0,0,0,0,0,0,0,0,0,0
2017,1835677,12011,1015,1191,0,1,0.000000,29,54,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   view_count                      2019 non-null   int64  
 1   likes                           2019 non-null   int64  
 2   comment_count                   2019 non-null   int64  
 3   duration                        2019 non-null   int64  
 4   top_25                          2019 non-null   int64  
 5   num_of_tags                     2019 non-null   int64  
 6   pct_tags_in_description         2019 non-null   float64
 7   title_lengths                   2019 non-null   int64  
 8   desc_lengths                    2019 non-null   int64  
 9   tags_length                     2019 non-null   int64  
 10  content_rate                    2019 non-null   float64
 11  views_per_sub                   2019 non-null   float64
 12  categoryId_Comedy               20

In [10]:
# Split df into train / validate / test with the project helper in model.py;
# 'top_25' is the target column.
# NOTE(review): presumably the target is used to stratify the split; confirm in model.py.
train, validate, test = model.my_train_test_split(df, 'top_25')

In [11]:
train.shape,validate.shape,test.shape

((1211, 36), (404, 36), (404, 36))

## Scale Data

In [12]:
#dummy_df = pd.get_dummies(df[['categoryId','region']],drop_first=True)
#dummy_df.head()

In [13]:
#pd.get_dummies(df, prefix=['col1', 'col2'])

In [14]:
#pd.get_dummies(df, prefix=['categoryId', 'region'])

In [41]:
# Column overview of the train split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1211 entries, 231 to 544
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   view_count                      1211 non-null   int64  
 1   likes                           1211 non-null   int64  
 2   comment_count                   1211 non-null   int64  
 3   duration                        1211 non-null   int64  
 4   top_25                          1211 non-null   int64  
 5   num_of_tags                     1211 non-null   int64  
 6   pct_tags_in_description         1211 non-null   float64
 7   title_lengths                   1211 non-null   int64  
 8   desc_lengths                    1211 non-null   int64  
 9   tags_length                     1211 non-null   int64  
 10  content_rate                    1211 non-null   float64
 11  views_per_sub                   1211 non-null   float64
 12  categoryId_Comedy               1

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2019 entries, 0 to 2018
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   view_count                      2019 non-null   int64  
 1   likes                           2019 non-null   int64  
 2   comment_count                   2019 non-null   int64  
 3   duration                        2019 non-null   int64  
 4   top_25                          2019 non-null   int64  
 5   num_of_tags                     2019 non-null   int64  
 6   pct_tags_in_description         2019 non-null   float64
 7   title_lengths                   2019 non-null   int64  
 8   desc_lengths                    2019 non-null   int64  
 9   tags_length                     2019 non-null   int64  
 10  content_rate                    2019 non-null   float64
 11  views_per_sub                   2019 non-null   float64
 12  categoryId_Comedy               20

In [16]:
from sklearn.preprocessing import MinMaxScaler

In [17]:
# Continuous features to be scaled in the train, validate & test samples
features_to_scale = [
    'view_count',
    'likes',
    'comment_count',
    'duration',
    'num_of_tags',
    'pct_tags_in_description',
    'title_lengths',
    'desc_lengths',
    'tags_length',
    'content_rate',
    'views_per_sub',
]

In [18]:
# Scale the columns listed in features_to_scale in all three samples via the
# project helper in model.py.
# NOTE(review): the scaler should be fit on train only and then applied to
# validate/test; confirm in model.py.
train_scaled, validate_scaled, test_scaled = model.scale_data(train, validate, test, features_to_scale)

## Feature Selection
- Selects the 16 best features.
- Uses statistical tests to determine each feature's usefulness in predicting the target variable.
- Ranks the features by their correlation with engagement.

In [19]:
# Split each scaled sample into X (feature columns) and y (the target, top_25).
# The feature columns are chosen inside model.getting_ (see model.py).
# NOTE(review): the selected features are said to correlate highly with
# top_25; confirm against model.py.
X_train, y_train, X_validate, y_validate, X_test, y_test = model.getting_(train_scaled,validate_scaled,test_scaled)

## Baseline Prediction and Accuracy
- The baseline prediction is a benchmark: it always predicts the most prevalent class in the train data. We want our models to be more accurate than this baseline.

In [20]:
# look at values of target variable top_25
# baseline prediction: the most prevalent class in training dataset(the mode)
y_train.value_counts()

0    987
1    224
Name: top_25, dtype: int64

In [21]:
# Baseline accuracy: the share of training rows that belong to the most
# frequent class. The class is derived from y_train so that the cell stays
# correct if the class balance changes.
baseline_class = y_train.mode().iloc[0]
baseline_accuracy = (y_train == baseline_class).mean()
print('Top_25 videos baseline accuracy is:', baseline_accuracy)

Top_25 videos baseline accuracy is: 0.815028901734104


## Model on Train

## Decision Tree Classifier

In [22]:
# Fit decision trees via the project helper and collect train/validate
# accuracy (and their difference) for each setting i; the table is shown in
# the next cell.
# NOTE(review): i is presumably max_depth; confirm in model.py.
dtc_scores = model.run_decision_tree_models(X_train, y_train, X_validate, y_validate)

In [23]:
dtc_scores

Unnamed: 0,i,accuracy_train,accuracy_validate,difference
0,1,0.815029,0.814356,0.000672
1,2,0.815855,0.811881,0.003973
2,3,0.819158,0.809406,0.009752
3,4,0.830718,0.794554,0.036164
4,5,0.842279,0.80198,0.040299
5,6,0.857143,0.787129,0.070014
6,7,0.8654,0.774752,0.090648
7,8,0.888522,0.740099,0.148423
8,9,0.905863,0.75495,0.150912
9,10,0.919901,0.727723,0.192178


In [24]:
#Selected the decision tree at i=6 (max_depth=6 is used for the test run below): ~86% accuracy on train, ~79% on validate

## Random Forest

In [25]:
from importlib import reload

In [26]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [27]:
# Fit random forests via the project helper and collect train/validate
# accuracy (and their difference) per max_depth; the table is shown in the
# next cell.
rf_scores = model.run_random_forest_models(X_train, y_train, X_validate, y_validate)

In [28]:
rf_scores

Unnamed: 0,max_depth,accuracy_train,accuracy_validate,difference
0,5,0.819983,0.816832,0.003152
1,6,0.820809,0.816832,0.003978
2,7,0.827415,0.816832,0.010584
3,8,0.827415,0.816832,0.010584
4,9,0.849711,0.814356,0.035355
5,10,0.867878,0.816832,0.051046
6,11,0.890999,0.816832,0.074167
7,12,0.904211,0.816832,0.08738
8,13,0.925681,0.816832,0.10885
9,14,0.938068,0.816832,0.121236


In [29]:
#Random forest with max_depth 10: ~87% accuracy on train, ~82% on validate (validate accuracy is ~81-82% at every depth tried)

## KNeighborsClassifier

In [30]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [31]:
# Fit K-nearest-neighbor classifiers via the project helper and collect
# train/validate accuracy (and their difference) per k; the table is shown
# in the next cell.
knn_scores = model.run_kneighbors_models(X_train, y_train, X_validate, y_validate)

In [32]:
knn_scores

Unnamed: 0,k,accuracy_train,accuracy_validate,difference
0,1,1.0,0.695545,0.304455
1,2,0.853014,0.789604,0.06341
2,3,0.852188,0.769802,0.082386
3,4,0.831544,0.799505,0.032039
4,5,0.830718,0.79703,0.033689
5,6,0.818332,0.806931,0.011401
6,7,0.81668,0.804455,0.012225
7,8,0.818332,0.811881,0.006451
8,9,0.819983,0.804455,0.015528


In [33]:
#KNN model with k=9 neighbors: ~82% accuracy on train, ~80% on validate (k=8 is slightly higher on validate at ~81%)

## Logistic Regression

In [34]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [35]:
# Fit a logistic regression via the project helper; the result shown in the
# next cell is a dict with 'accuracy_train', 'accuracy_validate' and
# 'difference'.
lr_scores = model.run_logistic_reg_models(X_train, y_train, X_validate, y_validate)

In [36]:
lr_scores

{'accuracy_train': 0.8142031379025598,
 'accuracy_validate': 0.8118811881188119,
 'difference': 0.0023219497837478897}

In [37]:
#Logistic Regression has 81% accuracy on validate

## Test Model

In [38]:
def run__on_test(X_train, y_train, X_test, y_test, max_depth=6, random_state=123):
    """Fit a decision tree on the train split and return its accuracy on test.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the classifier.
    X_test, y_test
        Features and target used to score the fitted classifier.
    max_depth : int, default 6
        Maximum tree depth passed to DecisionTreeClassifier.
    random_state : int, default 123
        Seed passed to DecisionTreeClassifier.

    Returns
    -------
    float
        Value of ``score(X_test, y_test)`` rounded to 3 decimal places.
    """
    # Named `clf` so the imported `model` module is not shadowed inside the function.
    clf = DecisionTreeClassifier(max_depth=max_depth, random_state=random_state)
    clf.fit(X_train, y_train)
    # The built-in round() works whether score() returns a numpy float or a
    # plain Python float (which has no .round method); which one is returned
    # may depend on the scikit-learn version.
    return round(clf.score(X_test, y_test), 3)

In [39]:
run__on_test(X_train, y_train, X_test, y_test)

0.772

In [40]:
model.run__on_test(X_train, y_train, X_test, y_test)

0.772

## Test Takeaways

- The Decision Tree Classifier (max_depth=6) was the model selected to run on test data; it scored 77.2% accuracy on test, compared with a baseline accuracy of 81.5%.

## Modeling Takeaways

- At their least-overfit settings, all models had roughly the same accuracy on the train and validate sets (about 81-82%, essentially the baseline accuracy of 81.5%).
- Logistic Regression scored 81.2% on validate, close to the best validate score of 81.7% (Random Forest).
- The model selected for the test run was the Decision Tree Classifier.
- However, on test data the Decision Tree reached 77.2% accuracy, about 4 percentage points below the baseline, so it does not yet outperform the baseline.

- There is still considerable room for improvement in future iterations