# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import spacy
import re
import model
import env
import os

from importlib import reload
from itertools import product
from math import sqrt
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


## Acquire data

In [2]:
# Load the prepared dataset from the local pickle file 'prepared.pkl'.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('prepared.pkl')

## Model

After analyzing the Top 200 trending YouTube videos, the exploration phase identified an array of possible drivers of a video reaching the Top 25. In this section, we build machine learning models that predict Top 25 trending YouTube videos more accurately than the baseline, and we use our takeaways and recommendations with an eye towards enabling smaller creators to produce videos in the style of the Top 25.

Top_25 is our target variable

We will use our training data to train/fit to our model and then tune the model on our validate data.

We will pick our best model on accuracy.

Four supervised machine learning classification models were created in this project:

 - Decision Tree
 - Random Forest
 - K-Nearest Neighbor
 - Logistic Regression 

## Split Data
 - Split data into 3 samples of train (60%), validate(20%) and test(20%)
 - Our target variable is Top_25

In [3]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each encoded column.
# NOTE(review): by default get_dummies only encodes object/string/category columns, so categoryId
# is encoded only if it is stored with one of those dtypes — confirm.
dummy_df = pd.get_dummies(df[['categoryId','region']], drop_first=True)

In [4]:
# Append the dummy columns to the original frame column-wise (axis=1).
# NOTE(review): this reassigns df, so re-running this cell on its own appends the dummy columns a second time.
df = pd.concat([df, dummy_df], axis=1)

In [5]:
# Keep an independent snapshot of the full frame (original columns plus dummies)
# before columns are dropped from df.
df_copy = df.copy()

In [6]:
# Columns left out of the modeling frame (each label listed once).
columns_to_drop = [
    'categoryId', 'video_id', 'title', 'publishedAt', 'region', 'trending_date',
    'tags', 'channelTitle', 'thumbnail_link', 'comments_disabled', 'ratings_disabled',
    'description', 'captions', 'channel_age', 'rank', 'word_bank',
    'cleaned_tags', 'cleaned_desc', 'title_in_description', 'title_in_tags',
]
# `columns=` already targets the column axis, so no separate axis argument is needed.
df = df.drop(columns=columns_to_drop)

# Show the remaining modeling frame.
df

Unnamed: 0,view_count,likes,comment_count,duration,top_25,subscribers,video_count,age,engagement,sponsored,...,region_CA,region_DE,region_FR,region_GB,region_IND,region_JP,region_KR,region_MX,region_RU,region_US
0,1098919,19090,861,1281,1,2210000,3168,82.554444,0.020506,0,...,0,0,0,0,0,0,1,0,0,0
1,2217807,182434,7282,4327,1,7180000,311,29.206667,0.095392,0,...,1,0,0,0,0,0,0,0,0,0
2,2258144,44366,938,24001,1,19200000,603,15.998889,0.021309,0,...,0,1,0,0,0,0,0,0,0,0
3,3262953,312903,14437,1862,1,15000000,408,27.088611,0.113594,0,...,0,0,0,0,0,0,0,0,1,0
4,313545,4899,596,965,1,795000,2211,3.805000,0.023228,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2014,1902762,78690,1712,1099,0,12400000,1270,270.941944,0.044955,0,...,0,0,0,0,0,0,0,1,0,0
2015,1976426,48816,3060,1869,0,4630000,1829,228.996944,0.030892,0,...,0,0,0,0,0,1,0,0,0,0
2016,1529830,114260,11625,166,0,7720000,206,285.015833,0.105084,0,...,0,0,0,0,0,0,0,0,0,0
2017,1835677,12011,1015,1191,0,321000,81,566.429167,0.008755,0,...,0,0,0,0,0,0,1,0,0,0


In [7]:
# Split the modeling frame into train / validate / test samples using the project helper,
# with 'top_25' passed as the target column.
# NOTE(review): presumably the helper stratifies on the target and fixes a random seed — confirm in model.py.
train, validate, test = model.my_train_test_split(df, 'top_25')

In [8]:
# Sanity check: (rows, columns) of each sample produced by the split.
train.shape,validate.shape,test.shape

((1211, 41), (404, 41), (404, 41))

## Scale Data

In [9]:
# Feature columns that are scaled in the train, validate and test samples.
features_to_scale = [
    'view_count',
    'likes',
    'comment_count',
    'duration',
    'num_of_tags',
    'pct_tags_in_description',
    'title_lengths',
    'desc_lengths',
    'tags_length',
    'content_rate',
    'views_per_sub',
    'subscribers',
    'video_count',
    'age',
    'engagement',
    'sponsored',
]

In [10]:
# NOTE(review): redundant — MinMaxScaler is already imported in the imports cell at the top of the notebook.
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Scale the columns listed in features_to_scale in each sample using the project helper.
# NOTE(review): presumably the scaler is fit on train only and then applied to validate/test — confirm in model.py.
train_scaled, validate_scaled, test_scaled = model.scale_data(train, validate, test, features_to_scale)

## Feature Selection
- Select best
- Uses statistical tests to determine each feature's usefulness in predicting the target variable.
- Ranks the features based on highest correlation to engagement.

In [12]:
# Separate each scaled sample into its features (X_*) and its target variable (y_*).
# Per the original author's note, the features kept are those with high correlation to top_25;
# the selection itself happens inside the project helper — confirm in model.py.
X_train, y_train, X_validate, y_validate, X_test, y_test = model.getting_(train_scaled,validate_scaled,test_scaled)

## Baseline Prediction and Accuracy
- Baseline prediction is a benchmark. It predicts the most prevalent class in the train data. We compare our models against it and want them to be better than the baseline prediction.

In [13]:
# Class counts of the target variable top_25 in the training sample.
# Baseline prediction: the most prevalent class in the training dataset (the mode).
y_train.value_counts()

0    987
1    224
Name: top_25, dtype: int64

In [14]:
# Baseline model: always predict the most frequent class observed in the training target.
# The majority class is derived from y_train instead of being hard-coded as 0, so the
# baseline stays correct if the class balance or the label encoding changes.
baseline_class = y_train.mode()[0]  # mode() returns a Series; take its first (smallest) modal value
# Share of training rows whose label equals the majority class.
baseline_accuracy = (y_train == baseline_class).mean()
print('Top_25 videos baseline accuracy is:', baseline_accuracy)

Top_25 videos baseline accuracy is: 0.815028901734104


## Model on Train

## Decision Tree Classifier

In [15]:
# Run the project's decision-tree helper on the train and validate samples.
# It returns a score table with train accuracy, validate accuracy and their difference per value of `i`.
dtc_scores = model.run_decision_tree_models(X_train, y_train, X_validate, y_validate)

In [16]:
# Display the decision-tree score table.
dtc_scores

Unnamed: 0,i,accuracy_train,accuracy_validate,difference
0,5,0.922378,0.876238,0.046141
1,6,0.943848,0.876238,0.06761
2,7,0.958712,0.873762,0.084949
3,8,0.971098,0.878713,0.092385
4,9,0.986788,0.861386,0.125402
5,10,0.995045,0.871287,0.123758
6,11,0.997523,0.856436,0.141087
7,12,1.0,0.861386,0.138614
8,13,1.0,0.861386,0.138614
9,14,1.0,0.861386,0.138614


In [17]:
# Save the decision-tree score table as a CSV in the working directory
# (the index is written as the first column because index=False is not passed).
dtc_scores.to_csv('dtc_scores_tab.csv')

In [18]:
# Best decision tree on validate: i = 8, with about 88% validate accuracy (0.878713).
# NOTE(review): `i` is presumably the tree's max_depth — confirm in model.py.

## Random Forest

In [19]:
# NOTE(review): redundant — reload is already imported in the imports cell at the top of the notebook.
from importlib import reload

In [20]:
# Re-import model.py so that edits made to it since the first import are picked up.
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [21]:
# Run the project's random-forest helper on the train and validate samples.
# It returns a score table with train accuracy, validate accuracy and their difference per max_depth.
# NOTE(review): results are only reproducible if the helper fixes random_state — confirm in model.py.
rf_scores = model.run_random_forest_models(X_train, y_train, X_validate, y_validate)

In [22]:
# Display the random-forest score table.
rf_scores

Unnamed: 0,max_depth,accuracy_train,accuracy_validate,difference
0,10,0.975227,0.876238,0.098989
1,11,0.98431,0.876238,0.108073
2,12,0.988439,0.876238,0.112202
3,13,0.988439,0.873762,0.114677
4,14,0.988439,0.883663,0.104776
5,15,0.991742,0.886139,0.105604
6,16,0.990091,0.881188,0.108903
7,17,0.992568,0.881188,0.11138
8,18,0.990091,0.881188,0.108903
9,19,0.990091,0.881188,0.108903


In [23]:
# Save the random-forest score table as a CSV in the working directory
# (the index is written as the first column because index=False is not passed).
rf_scores.to_csv('rf_scores_tab.csv')

In [24]:
# Best random forest on validate: max_depth = 15, with a validate accuracy of 0.886139 (about 88%).

## KNeighborsClassifier

In [25]:
# Re-import model.py so that edits made to it since the last import are picked up.
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [26]:
# Run the project's k-nearest-neighbors helper on the train and validate samples.
# It returns a score table with train accuracy, validate accuracy and their difference per value of k.
knn_scores = model.run_kneighbors_models(X_train, y_train, X_validate, y_validate)

In [27]:
# Display the KNN score table.
knn_scores

Unnamed: 0,k,accuracy_train,accuracy_validate,difference
0,1,1.0,0.690594,0.309406
1,2,0.85384,0.811881,0.041959
2,3,0.853014,0.782178,0.070836
3,4,0.830718,0.809406,0.021312
4,5,0.831544,0.806931,0.024613
5,6,0.82659,0.806931,0.019659
6,7,0.824112,0.804455,0.019657
7,8,0.820809,0.809406,0.011403
8,9,0.819158,0.80198,0.017178


In [28]:
# Save the KNN score table as a CSV in the working directory
# (the index is written as the first column because index=False is not passed).
knn_scores.to_csv('knn_scores_tab.csv')

In [29]:
# Best KNN model on validate: k = 2, with about 81% validate accuracy (0.811881).

## Logistic Regression

In [30]:
# Re-import model.py so that edits made to it since the last import are picked up.
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [31]:
# Run the project's logistic-regression helper on the train and validate samples.
# It returns a one-row score table with train accuracy, validate accuracy and their difference.
lr_scores = model.run_logistic_reg_models(X_train, y_train, X_validate, y_validate)

In [32]:
# Display the logistic-regression score table.
lr_scores

Unnamed: 0,accuracy_train,accuracy_validate,difference
0,0.814203,0.821782,-0.007579


In [37]:
# Save the logistic-regression score table as a CSV in the working directory
# (the index is written as the first column because index=False is not passed).
lr_scores.to_csv('lr_scores_tab.csv')

In [33]:
# Logistic Regression reaches about 82% accuracy on validate (0.821782),
# only slightly above the 81.5% baseline accuracy.

## Test Model

In [36]:
# Final evaluation: the project helper receives the train data and the held-out test data
# and returns a single score (displayed below).
# NOTE(review): presumably it fits the selected Random Forest on train and reports test accuracy — confirm in model.py.
model.run__on_test(X_train, y_train, X_test, y_test)

0.849

## Test Takeaways

- The Random Forest model performed best on the validate data with about 88% accuracy, so we used it on the test data.
- On the test data the accuracy is about 85% (0.849) rather than 88%; some drop is expected because the test data is unseen.

## Modeling Takeaways

- Random Forest and Decision Tree models had roughly the same accuracy on the train and validate sets.
- KNeighbors Classifier and Logistic Regression model performed the worst on out-of-sample data.
- The best performing model is Random Forest Classifier.
- From the above model analysis, the highest validate accuracy is achieved by Random Forest (about 88.6%). It performs better than the baseline (81.5%) by about 7 percentage points.

- While this is an improvement over the baseline, there is still room for improvement in future iterations.