
#use multinomial naive bayes algorithm after the other models

# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import spacy
import re

from importlib import reload
from itertools import product
from math import sqrt
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


## Acquire data

In [2]:
#reading the list of prepared pickle file contained in the csv file
df = pd.read_pickle('prepared.pkl 3')

## Model

After analyzing the Top 200 trending Youtube videos, Exploration phase identified arrays of possible drivers of Top 25 trending Youtube videos. In this section, we will create a machine learning algorithm model that better and accurately predicts Top 25 trending Youtube videos and to use our takeaways and recommendations with an eye towards enabling smaller creators to produce in the mode of the top 25.

Top_25 is our target variable

We will use our training data to train/fit to our model and then tune the model on our validate data.

We will pick our best model on accuracy.

Four supervised machine learning classifications models were created in this project:

 - Decision Tree
 - Randon Forest
 - K-Nearest Neighbor
 - Logistic Regression 

## Split Data
 - Split data into 3 samples of train (60%), validate(20%) and test(20%)
 - Our target variable is Top_25

In [3]:
def my_train_test_split(df, target):
    ''' 
    This function takes in a dataframe and splits data into 3 samples of train (60%), validate(20%) and test(20%).  
    '''
    
    train, test = train_test_split(df, test_size=.2, random_state=123, stratify=df[target])
    train, validate = train_test_split(train, test_size=.25, random_state=123, stratify=train[target])
    
    return train, validate, test

train, validate, test = my_train_test_split(df, 'top_25')

train.shape,validate.shape,test.shape

((5721, 32), (1907, 32), (1908, 32))

## Scale Data

In [4]:
from sklearn.preprocessing import MinMaxScaler

In [5]:
#Write function to scale data for zillow data
def scale_data(train, validate, test, features_to_scale):
    """Scales the 3 data splits using MinMax Scaler. 
    Takes in train, validate, and test data splits as well as a list of the features to scale. 
    Returns dataframe with scaled counterparts on as columns"""
    
    
    # Make the thing to train data only
    scaler = MinMaxScaler()
    scaler.fit(train[features_to_scale])
    
    # Fit the thing with new column names with _scaled added on
    scaled_columns = [col+"_scaled" for col in features_to_scale]
    
    # Transform the separate datasets using the scaler learned from train
    scaled_train = scaler.transform(train[features_to_scale])
    scaled_validate = scaler.transform(validate[features_to_scale])
    scaled_test = scaler.transform(test[features_to_scale])
    
    train_scaled = pd.concat([train, pd.DataFrame(scaled_train,index=train.index, columns = scaled_columns)],axis=1)
    validate_scaled = pd.concat([validate, pd.DataFrame(scaled_validate,index=validate.index, columns = scaled_columns)],axis=1)
    test_scaled = pd.concat([test, pd.DataFrame(scaled_test,index=test.index, columns = scaled_columns)],axis=1)

    return train_scaled, validate_scaled, test_scaled

## Feature Selection
- Select K Best
- Uses statistical tests to determine each feature's usefulness in predicting the target variable.
- Ranks the features and then select the K best features.

In [6]:
#selected features to scale into train, validate & test
features_to_scale = ['age','num_of_tags','duration','num_of_tags','engagement','sponsored', 'title_in_description', 
        'title_in_tags','pct_tags_in_description', 'title_lengths', 'desc_lengths','tags_length']
train_scaled, validate_scaled, test_scaled = scale_data(train, validate, test, features_to_scale)

In [7]:
train_scaled.columns

Index(['video_id', 'title', 'publishedAt', 'channelTitle', 'categoryId',
       'trending_date', 'tags', 'view_count', 'likes', 'comment_count',
       'thumbnail_link', 'comments_disabled', 'ratings_disabled',
       'description', 'duration', 'captions', 'region', 'rank', 'top_25',
       'age', 'engagement', 'sponsored', 'num_of_tags', 'word_bank',
       'cleaned_tags', 'cleaned_desc', 'title_in_description', 'title_in_tags',
       'pct_tags_in_description', 'title_lengths', 'desc_lengths',
       'tags_length', 'age_scaled', 'num_of_tags_scaled', 'duration_scaled',
       'num_of_tags_scaled', 'engagement_scaled', 'sponsored_scaled',
       'title_in_description_scaled', 'title_in_tags_scaled',
       'pct_tags_in_description_scaled', 'title_lengths_scaled',
       'desc_lengths_scaled', 'tags_length_scaled'],
      dtype='object')

In [8]:
#X will be features
#y will be our target variable
#these features have high correlation to top_25 videos

scaled_features = ['age_scaled', 'num_of_tags_scaled','duration_scaled', 'num_of_tags_scaled',
       'engagement_scaled', 'sponsored_scaled', 'title_in_description', 'title_in_tags',
       'pct_tags_in_description', 'title_lengths', 'desc_lengths',
    'tags_length']
X_train = train_scaled[scaled_features]
y_train = train_scaled.top_25
X_validate = validate_scaled[scaled_features]
y_validate = validate_scaled.top_25
X_test = test_scaled[scaled_features]
y_test= test_scaled.top_25

## Baseline Prediction and Accuracy
- Baseline prediction is a benchmark. It predicts the most prevelant class in the train data. We compare our model and want it to be better than the baseline prediction.

In [9]:
# look at values of target variable top_25
# baseline prediction: the most prevalent class in training dataset(the mode)
y_train.value_counts()

0    4986
1     735
Name: top_25, dtype: int64

In [10]:
baseline_accuracy = (y_train == 0).mean()
print('Top_25 videos baseline accuracy is:', baseline_accuracy)

Top_25 videos baseline accuracy is: 0.8715259570005244


## Model on Train

## Decision Tree Classifier

In [11]:
#loop the model with changing max depth only
model_scores = []
for i in range(1,15):
    model = DecisionTreeClassifier(max_depth=i, random_state =123)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy_train = model.score(X_train,y_train)
    accuracy_validate = model.score(X_validate,y_validate)
    difference = accuracy_train-accuracy_validate
    output = {"i":i, "accuracy_train":accuracy_train,"accuracy_validate":accuracy_validate,"difference":difference}
    model_scores.append(output)
df = pd.DataFrame(model_scores)
df

Unnamed: 0,i,accuracy_train,accuracy_validate,difference
0,1,0.871526,0.871526,0.0
1,2,0.889879,0.883062,0.006817
2,3,0.891802,0.889879,0.001923
3,4,0.906485,0.896696,0.009788
4,5,0.916973,0.909806,0.007167
5,6,0.933228,0.918721,0.014508
6,7,0.946163,0.930781,0.015382
7,8,0.956301,0.930781,0.02552
8,9,0.969761,0.939696,0.030065
9,10,0.97885,0.937598,0.041252


In [12]:
#model with maximun sample leaf 14 @ 94% accuracy on validate is the best

## Random Forest

In [13]:
#model with tuning min sample leaf only
model_scores = []

for i in range(1,15):

    model = RandomForestClassifier(min_samples_leaf= i,random_state=123)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy_train = model.score(X_train,y_train)
    accuracy_validate = model.score(X_validate,y_validate)
    difference = accuracy_train-accuracy_validate
    output = {"min_sample_leaf":i, "accuracy_train":accuracy_train,"accuracy_validate":accuracy_validate,"difference":difference}
    model_scores.append(output)
df = pd.DataFrame(model_scores)
df

Unnamed: 0,min_sample_leaf,accuracy_train,accuracy_validate,difference
0,1,0.99965,0.944415,0.055235
1,2,0.985842,0.937598,0.048243
2,3,0.977451,0.931306,0.046146
3,4,0.971159,0.930257,0.040902
4,5,0.965391,0.928684,0.036707
5,6,0.959622,0.924489,0.035134
6,7,0.957001,0.921867,0.035134
7,8,0.950358,0.920818,0.02954
8,9,0.947736,0.918196,0.02954
9,10,0.945639,0.919245,0.026394


In [14]:
# Random Forest model with minimun sample leaf 1 @ 94% accuracy on validate is the best
# need a high range to get the best, so let's try max!

In [15]:
model_scores = []

for i in range(1,12):

    model = RandomForestClassifier(max_depth = i,random_state=123)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    accuracy_train = model.score(X_train,y_train)
    accuracy_validate = model.score(X_validate,y_validate)
    difference = accuracy_train-accuracy_validate
    output = {"max_depth":i, "accuracy_train":accuracy_train,"accuracy_validate":accuracy_validate,"difference":difference}
    model_scores.append(output)
df = pd.DataFrame(model_scores)
df

Unnamed: 0,max_depth,accuracy_train,accuracy_validate,difference
0,1,0.871526,0.871526,0.0
1,2,0.871526,0.871526,0.0
2,3,0.8724,0.871526,0.000874
3,4,0.877294,0.87677,0.000524
4,5,0.898969,0.889355,0.009614
5,6,0.91505,0.897745,0.017305
6,7,0.928509,0.905087,0.023422
7,8,0.938298,0.909282,0.029016
8,9,0.949135,0.915574,0.033561
9,10,0.961545,0.923964,0.037581


In [16]:
#model with maximun sample leaf 11 @ 93% accuracy on validate is the best

## KNeighborsClassifier

In [17]:
#For loop for KNN 
empty_model = []
for k in range(1,10):
    model = KNeighborsClassifier(n_neighbors = k, weights = "uniform")
    model=model.fit(X_train,y_train)
    y_pred = model.predict(X_train)
    accuracy_train = model.score(X_train,y_train)
    accuracy_validate = model.score(X_validate,y_validate)
    difference = accuracy_train-accuracy_validate
    output = {"k":k, "accuracy_train":accuracy_train,"accuracy_validate":accuracy_validate,"difference":difference}
    
    
    empty_model.append(output)

df = pd.DataFrame(empty_model)
df

Unnamed: 0,k,accuracy_train,accuracy_validate,difference
0,1,0.99965,0.930257,0.069393
1,2,0.967313,0.90194,0.065373
2,3,0.957874,0.878867,0.079007
3,4,0.931481,0.888831,0.04265
4,5,0.923615,0.879392,0.044223
5,6,0.911729,0.878343,0.033386
6,7,0.908058,0.869428,0.03863
7,8,0.897396,0.874148,0.023248
8,9,0.894948,0.86838,0.026569


In [18]:
#KNN model with minimun sample leaf 1 @ 93% accuracy on validate is the best

## Logistic Regression

In [19]:
model = LogisticRegression(C = .1, random_state=123)
model=model.fit(X_train,y_train)
y_pred = model.predict(X_train)
accuracy_train = model.score(X_train,y_train)
accuracy_validate = model.score(X_validate,y_validate)
difference = accuracy_train-accuracy_validate
output = { "accuracy_train":accuracy_train,"accuracy_validate":accuracy_validate,"difference":difference}
    
output  

{'accuracy_train': 0.8711763677678728,
 'accuracy_validate': 0.8715259570005244,
 'difference': -0.0003495892326516037}

In [20]:
#Logistic Regression has 87% accuracy on validate

## Test Model

In [21]:
#create, fit, use, model information to model_features dfram
model = DecisionTreeClassifier(max_depth=14, random_state=123)
#features to be used

scaled_features = ['age_scaled', 'num_of_tags_scaled','duration_scaled', 'num_of_tags_scaled',
       'engagement_scaled', 'sponsored_scaled', 'title_in_description', 'title_in_tags',
       'pct_tags_in_description', 'title_lengths', 'desc_lengths',
    'tags_length']
#fit model
model.fit(X_train, y_train)
#score model to add to model description dataframe
score = model.score(X_test, y_test).round(3)

In [22]:
print (score)

0.941


## Test Takeaways

- Decision Tree Classifier model performed best on validate data with 94% accuracy, so we will use it on test data

## Modeling Takeaways

- All models had roughly almost the same accuracy for train and validate sets.
- Logistic Regression model performed the worst on out-of-sample data.
- The best performing model is Decision Tree Classifier.
- However, from the above model analysis we can see that the highest accuracy is achieved by Decision Tree. It performs better than baseline by about 7%.

- While this is an improvement there is still room for improvement in future iterations