# Imports

In [1]:
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt
import nltk
import spacy
import re
import model
import env
import os

from importlib import reload
from itertools import product
from math import sqrt
from scipy import stats
from scipy.stats import pearsonr, spearmanr
from statsmodels.formula.api import ols
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, precision_recall_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE, SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


## Acquire data

In [2]:
# Load the prepared data from the local pickle file 'prepared.pkl'.
# NOTE: unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle('prepared.pkl')

## Model

After analyzing the Top 200 trending YouTube videos, the Exploration phase identified an array of possible drivers of Top 25 trending YouTube videos. In this section, we will create a machine learning model that accurately predicts Top 25 trending YouTube videos, and use our takeaways and recommendations with an eye towards enabling smaller creators to produce in the mode of the top 25.

Top_25 is our target variable

We will fit our models on the training data and then tune them on the validate data.

We will pick our best model on accuracy.

Four supervised machine learning classification models were created in this project:

 - Decision Tree
 - Random Forest
 - K-Nearest Neighbor
 - Logistic Regression 

## Split Data
 - Split data into 3 samples of train (60%), validate(20%) and test(20%)
 - Our target variable is Top_25

In [3]:
train, validate, test = model.my_train_test_split(df, 'top_25')

In [4]:
train.shape,validate.shape,test.shape

((5721, 32), (1907, 32), (1908, 32))

## Scale Data

In [5]:
from sklearn.preprocessing import MinMaxScaler

In [6]:
# Features to scale in train, validate & test.
# Each column is listed exactly once so that no feature is passed to the scaler twice.
features_to_scale = ['age', 'num_of_tags', 'duration', 'engagement', 'sponsored', 'title_in_description',
        'title_in_tags', 'pct_tags_in_description', 'title_lengths', 'desc_lengths', 'tags_length']

In [7]:
train_scaled, validate_scaled, test_scaled = model.scale_data(train, validate, test, features_to_scale)

## Feature Selection
- Select K Best
- Uses statistical tests to determine each feature's usefulness in predicting the target variable.
- Ranks the features and then select the K best features.

In [8]:
# Re-create the scaled train/validate/test splits that feed feature selection.
# NOTE(review): this repeats the "Scale Data" cells above; it is kept so the notebook's flow is unchanged.
# Each column is listed exactly once so that no feature is passed to the scaler twice.
features_to_scale = ['age', 'num_of_tags', 'duration', 'engagement', 'sponsored', 'title_in_description',
        'title_in_tags', 'pct_tags_in_description', 'title_lengths', 'desc_lengths', 'tags_length']
train_scaled, validate_scaled, test_scaled = model.scale_data(train, validate, test, features_to_scale)

In [9]:
#X will be features
#y will be our target variable
#these features have high correlation to top_25 videos
X_train, y_train, X_validate, y_validate, X_test, y_test = model.getting_(train_scaled,validate_scaled,test_scaled)

## Baseline Prediction and Accuracy
- Baseline prediction is a benchmark. It predicts the most prevalent class in the train data. We compare our models against it and want them to be better than the baseline prediction.

In [10]:
# look at values of target variable top_25
# baseline prediction: the most prevalent class in training dataset(the mode)
y_train.value_counts()

0    4986
1     735
Name: top_25, dtype: int64

In [11]:
# The baseline always predicts the most frequent class in the training target,
# so derive that class from the data instead of hard-coding it.
baseline_class = y_train.mode().iloc[0]
# Accuracy of the baseline = share of training rows that belong to that class.
baseline_accuracy = (y_train == baseline_class).mean()
print('Top_25 videos baseline accuracy is:', baseline_accuracy)

Top_25 videos baseline accuracy is: 0.8715259570005244


## Model on Train

## Decision Tree Classifier

In [12]:
dtc_scores = model.run_decision_tree_models(X_train, y_train, X_validate, y_validate)

In [13]:
dtc_scores

Unnamed: 0,i,accuracy_train,accuracy_validate,difference
0,1,0.871526,0.871526,0.0
1,2,0.889879,0.883062,0.006817
2,3,0.891802,0.889879,0.001923
3,4,0.906485,0.896696,0.009788
4,5,0.916973,0.909806,0.007167
5,6,0.933228,0.918721,0.014508
6,7,0.946163,0.930781,0.015382
7,8,0.956301,0.930781,0.02552
8,9,0.969761,0.939696,0.030065
9,10,0.97885,0.937598,0.041252


In [14]:
#Decision tree with max_depth 14 @ 94% accuracy on validate is the best (NOTE: the table above only lists values up to 10 - confirm against the full results)

## Random Forest

In [15]:
from importlib import reload

In [16]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [17]:
rf_scores = model.run_random_forest_models(X_train, y_train, X_validate, y_validate)

In [18]:
rf_scores

Unnamed: 0,max_depth,accuracy_train,accuracy_validate,difference
0,1,0.871526,0.871526,0.0
1,2,0.871526,0.871526,0.0
2,3,0.8724,0.871526,0.000874
3,4,0.877294,0.87677,0.000524
4,5,0.898969,0.889355,0.009614
5,6,0.91505,0.897745,0.017305
6,7,0.928509,0.905087,0.023422
7,8,0.938298,0.909282,0.029016
8,9,0.949135,0.915574,0.033561
9,10,0.961545,0.923964,0.037581


In [20]:
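#model with maximum sample leaf 11 @ 93% accuracy on validate is the best (NOTE: the table above only lists max_depth up to 10, peaking at 92% on validate - confirm against the full results)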

## KNeighborsClassifier

In [21]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [22]:
knn_scores = model.run_kneighbors_models(X_train, y_train, X_validate, y_validate)

In [23]:
knn_scores

Unnamed: 0,k,accuracy_train,accuracy_validate,difference
0,1,0.99965,0.930257,0.069393
1,2,0.967313,0.90194,0.065373
2,3,0.957874,0.878867,0.079007
3,4,0.931481,0.888831,0.04265
4,5,0.923615,0.879392,0.044223
5,6,0.911729,0.878343,0.033386
6,7,0.908058,0.869428,0.03863
7,8,0.897396,0.874148,0.023248
8,9,0.894948,0.86838,0.026569


In [25]:
#KNN model with k = 1 neighbor @ 93% accuracy on validate is the best

## Logistic Regression

In [26]:
reload(model)

<module 'model' from '/Users/patricknash/codeup-data-science/top_200/model.py'>

In [27]:
lr_scores = model.run_logistic_reg_models(X_train, y_train, X_validate, y_validate)

In [28]:
lr_scores

{'accuracy_train': 0.8711763677678728,
 'accuracy_validate': 0.8715259570005244,
 'difference': -0.0003495892326516037}

In [29]:
#Logistic Regression has 87% accuracy on validate

## Test Model

In [30]:
def run_on_test(X_train, y_train, X_test, y_test, max_depth=14, estimator=None):
    """Fit the selected classifier on the training split and score it on the test split.

    Parameters
    ----------
    X_train, y_train
        Training features and target used to fit the classifier.
    X_test, y_test
        Held-out features and target used only for scoring.
    max_depth : int, default 14
        Depth limit of the decision tree built when ``estimator`` is not given.
        The default keeps the value this notebook has always used.
        NOTE(review): the validate-scores table shown in the notebook only
        lists values 1-10 - confirm that 14 is the tuned value.
    estimator : object with ``fit`` and ``score`` methods, optional
        Classifier to evaluate instead of the default decision tree.

    Returns
    -------
    float
        The value returned by ``estimator.score(X_test, y_test)`` (mean
        accuracy for scikit-learn classifiers), rounded to 3 decimals.
    """
    # Use a local name that does not shadow the imported project module `model`.
    if estimator is None:
        estimator = DecisionTreeClassifier(max_depth=max_depth, random_state=123)

    estimator.fit(X_train, y_train)

    # Built-in round() works whether score() returns a NumPy scalar or a plain
    # Python float; calling `.round(3)` on a plain float raises AttributeError.
    return round(float(estimator.score(X_test, y_test)), 3)

In [31]:
run_on_test(X_train, y_train, X_test, y_test)

0.941

## Test Takeaways

- Decision Tree Classifier model performed best on validate data with 94% accuracy, so we will use it on test data

## Modeling Takeaways

- All models had roughly the same accuracy on the train and validate sets.
- Logistic Regression model performed the worst on out-of-sample data.
- The best performing model is Decision Tree Classifier.
- From the above model analysis, the highest accuracy is achieved by the Decision Tree. It beats the baseline by about 7 percentage points (94% vs. 87%).

- While this is an improvement, there is still room for improvement in future iterations.