In [28]:
import pandas as pd
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
import category_encoders as ce
from sklearn.preprocessing import QuantileTransformer,RobustScaler
import mlflow
from catboost import CatBoostRegressor
from mlflow.tracking import MlflowClient
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.model_selection import train_test_split
from lightgbm import early_stopping
from sklearn.metrics import mean_squared_error
from datetime import datetime

In [2]:
# Point MLflow at the tracking server on the loopback address and select
# the experiment that the runs in this notebook are logged under.
TRACKING_URI = "http://127.0.0.1:5000"
EXPERIMENT_NAME = "my-experiment-1"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1705288512489, experiment_id='1', last_update_time=1705288512489, lifecycle_stage='active', name='my-experiment-1', tags={}>

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
data = pd.read_csv('../data/YouTubeDataset_withChannelElapsed.csv')

In [4]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,index,totalviews/channelelapsedtime,channelId,videoCategoryId,channelViewCount,likes/subscriber,views/subscribers,videoCount,subscriberCount,videoId,...,comments/views,totvideos/videocount,elapsedtime,videoLikeCount,videoDislikeCount,dislikes/subscriber,totviews/totsubs,views/elapsedtime,videoPublished,VideoCommentCount
0,0,0.165199,UCdzU3DSGzyWzN2118yd9X9g,22,14654,0.555556,95.111111,30,18,--DwgB78t-c,...,0.0,488.466667,50040,10,1,0.055556,814.111111,0.034213,2012-01-19T18:38:28.000Z,0
1,1,1.13382,UC0UnhAG47DRyVZGVcbhAXhQ,10,105909,0.23913,59.326087,51,184,--NZRkXBV7k,...,0.000183,2076.647059,22080,44,3,0.016304,575.592391,0.494384,2015-03-30T04:04:40.000Z,2
2,2,0.66812,UCXjtAvK5P3wXBGh0vbGylzg,27,48265,0.023669,10.289941,72,338,--hoQ2sGG4M,...,0.000575,670.347222,71544,8,1,0.002959,142.795858,0.048613,2009-08-07T06:51:10.000Z,2
3,3,25.653505,UCeKHMeUlcLNPLCLUfZUQI2w,26,2116722,0.007301,0.884178,172,22051,--sBoaqBlzA,...,0.000513,12306.523256,54096,161,6,0.000272,95.992109,0.360415,2011-08-04T01:07:38.000Z,10
4,4,52.773778,UCNWPDyaWf2eAHnofFLSnEMg,20,1649075,0.004545,10.004545,2777,220,--7h1S4neDM,...,0.0,593.833273,30120,1,0,0.0,7495.795455,0.073074,2014-04-29T15:44:44.000Z,0


In [5]:
# List every available column before choosing which ones to keep.
data.columns

Index(['index', 'totalviews/channelelapsedtime', 'channelId',
       'videoCategoryId', 'channelViewCount', 'likes/subscriber',
       'views/subscribers', 'videoCount', 'subscriberCount', 'videoId',
       'dislikes/views', 'channelelapsedtime', 'comments/subscriber',
       'likes/views', 'channelCommentCount', 'videoViewCount',
       'likes/dislikes', 'comments/views', 'totvideos/videocount',
       'elapsedtime', 'videoLikeCount', 'videoDislikeCount',
       'dislikes/subscriber', 'totviews/totsubs', 'views/elapsedtime',
       'videoPublished', 'VideoCommentCount'],
      dtype='object')

I will select only the columns required for the prediction.

In [6]:
# Keep the identifiers, raw channel/video counters and timing fields;
# the pre-computed ratio columns (e.g. 'likes/views') are left out.
selected_columns = [
    'index',
    'channelId',
    'videoCategoryId',
    'channelViewCount',
    'videoCount',
    'subscriberCount',
    'videoId',
    'channelelapsedtime',
    'channelCommentCount',
    'videoViewCount',
    'elapsedtime',
    'videoLikeCount',
    'videoDislikeCount',
    'videoPublished',
    'VideoCommentCount',
]

data = data.loc[:, selected_columns]

data.head()

Unnamed: 0,index,channelId,videoCategoryId,channelViewCount,videoCount,subscriberCount,videoId,channelelapsedtime,channelCommentCount,videoViewCount,elapsedtime,videoLikeCount,videoDislikeCount,videoPublished,VideoCommentCount
0,0,UCdzU3DSGzyWzN2118yd9X9g,22,14654,30,18,--DwgB78t-c,88705,1,1712,50040,10,1,2012-01-19T18:38:28.000Z,0
1,1,UC0UnhAG47DRyVZGVcbhAXhQ,10,105909,51,184,--NZRkXBV7k,93409,8,10916,22080,44,3,2015-03-30T04:04:40.000Z,2
2,2,UCXjtAvK5P3wXBGh0vbGylzg,27,48265,72,338,--hoQ2sGG4M,72240,5,3478,71544,8,1,2009-08-07T06:51:10.000Z,2
3,3,UCeKHMeUlcLNPLCLUfZUQI2w,26,2116722,172,22051,--sBoaqBlzA,82512,74,19497,54096,161,6,2011-08-04T01:07:38.000Z,10
4,4,UCNWPDyaWf2eAHnofFLSnEMg,20,1649075,2777,220,--7h1S4neDM,31248,0,2201,30120,1,0,2014-04-29T15:44:44.000Z,0


In [7]:
# Summary statistics of the numeric columns (used to spot implausible values).
data.describe()

Unnamed: 0,index,videoCategoryId,channelViewCount,videoCount,subscriberCount,channelelapsedtime,channelCommentCount,videoViewCount,elapsedtime,videoLikeCount,videoDislikeCount,VideoCommentCount
count,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0,575610.0
mean,287804.5,17.560551,48291190.0,1204.464627,80975.98,67941.305267,731.2371,58839.08,50755.786958,292.1508,18.921494,39.082997
std,166164.438562,7.904187,463082200.0,8303.754974,603339.3,20629.159278,27241.53,1230696.0,19725.560307,4507.472,434.947624,557.811895
min,0.0,1.0,0.0,0.0,0.0,888.0,0.0,-1.0,17520.0,-1.0,-1.0,-1.0
25%,143902.25,10.0,50450.0,17.0,38.0,51984.0,0.0,2204.0,34656.0,5.0,0.0,1.0
50%,287804.5,20.0,408666.0,70.0,408.0,68400.0,2.0,4807.0,48480.0,16.0,1.0,4.0
75%,431706.75,24.0,3616452.0,336.0,5319.0,85824.0,23.0,15266.0,65424.0,60.0,5.0,14.0
max,575609.0,44.0,23798170000.0,415500.0,25253110.0,108913.0,3953563.0,500893900.0,106609.0,1240473.0,244280.0,191498.0


There are some negative values in the dataset, which don't make sense logically, so I will remove the rows that contain them.

In [8]:
# Per-video counters that must be non-negative; a row is kept only when
# every one of them is >= 0.
video_count_columns = [
    'videoViewCount',
    'videoLikeCount',
    'videoDislikeCount',
    'VideoCommentCount',
]
non_negative_rows = data[video_count_columns].ge(0).all(axis=1)
data = data[non_negative_rows]

data.head(2)

Unnamed: 0,index,channelId,videoCategoryId,channelViewCount,videoCount,subscriberCount,videoId,channelelapsedtime,channelCommentCount,videoViewCount,elapsedtime,videoLikeCount,videoDislikeCount,videoPublished,VideoCommentCount
0,0,UCdzU3DSGzyWzN2118yd9X9g,22,14654,30,18,--DwgB78t-c,88705,1,1712,50040,10,1,2012-01-19T18:38:28.000Z,0
1,1,UC0UnhAG47DRyVZGVcbhAXhQ,10,105909,51,184,--NZRkXBV7k,93409,8,10916,22080,44,3,2015-03-30T04:04:40.000Z,2


In [9]:
# Re-check the summary statistics after filtering out the negative counters.
data.describe()

Unnamed: 0,index,videoCategoryId,channelViewCount,videoCount,subscriberCount,channelelapsedtime,channelCommentCount,videoViewCount,elapsedtime,videoLikeCount,videoDislikeCount,VideoCommentCount
count,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0,557166.0
mean,287826.042547,17.513098,45170800.0,1178.891298,78650.37,67966.281482,738.6427,58494.47,50812.556516,298.0977,19.199553,40.055274
std,166166.140738,7.883499,414440400.0,8266.426696,570954.0,20633.702605,27646.18,1239972.0,19743.88222,4562.354,441.275605,565.917076
min,0.0,1.0,0.0,0.0,0.0,888.0,0.0,1002.0,17640.0,0.0,0.0,0.0
25%,143907.25,10.0,49344.25,17.0,38.0,51984.0,0.0,2201.0,34680.0,6.0,0.0,1.0
50%,287864.5,20.0,398178.5,69.0,399.0,68448.0,3.0,4796.0,48552.0,17.0,1.0,4.0
75%,431725.75,24.0,3543761.0,328.0,5230.0,85848.0,23.0,15222.0,65520.0,62.0,5.0,14.0
max,575609.0,44.0,23798170000.0,365544.0,25253110.0,108913.0,3953563.0,500893900.0,106609.0,1240473.0,244280.0,191498.0


In [10]:
# Count missing values per column.
data.isna().sum()

index                  0
channelId              0
videoCategoryId        0
channelViewCount       0
videoCount             0
subscriberCount        0
videoId                0
channelelapsedtime     0
channelCommentCount    0
videoViewCount         0
elapsedtime            0
videoLikeCount         0
videoDislikeCount      0
videoPublished         0
VideoCommentCount      0
dtype: int64

In [11]:
# Skewness of each numeric column.
data.skew(numeric_only=True)

index                   -0.000294
videoCategoryId         -0.612887
channelViewCount        23.823911
videoCount              19.071062
subscriberCount         18.181026
channelelapsedtime      -0.159082
channelCommentCount     87.183926
videoViewCount         196.214596
elapsedtime              0.373921
videoLikeCount         105.393997
videoDislikeCount      334.689053
VideoCommentCount      150.638879
dtype: float64

In [12]:
# Column dtypes; the object columns are the ones that need encoding later.
data.dtypes

index                   int64
channelId              object
videoCategoryId         int64
channelViewCount        int64
videoCount              int64
subscriberCount         int64
videoId                object
channelelapsedtime      int64
channelCommentCount     int64
videoViewCount          int64
elapsedtime             int64
videoLikeCount          int64
videoDislikeCount       int64
videoPublished         object
VideoCommentCount       int64
dtype: object

In [13]:
# Correlation of every numeric column with the target (videoLikeCount).
numeric_correlations = data.corr(numeric_only=True)
numeric_correlations['videoLikeCount']

index                 -0.001791
videoCategoryId        0.003199
channelViewCount       0.112972
videoCount            -0.000179
subscriberCount        0.212479
channelelapsedtime    -0.010667
channelCommentCount    0.053639
videoViewCount         0.719707
elapsedtime           -0.037340
videoLikeCount         1.000000
videoDislikeCount      0.498198
VideoCommentCount      0.706167
Name: videoLikeCount, dtype: float64

In [14]:
# Keep an untouched copy of the cleaned frame, then separate the target
# (videoLikeCount) from the feature columns.
data_copy = data.copy()
y = data['videoLikeCount']
X = data.drop(columns=['videoLikeCount'])

ElasticNet as baseline model

In [15]:
with mlflow.start_run():

    mlflow.set_tag("model", "Elastic Net Pipeline")

    mlflow.log_param("train-data-path", "data/YouTubeDataset_withChannelElapsed.csv")

    # Baseline pipeline: encode the string columns, map features to a
    # normal distribution, scale them, then fit the linear model.
    cat_encoder=ce.CatBoostEncoder(cols=list(X.select_dtypes(include='object').columns))
    qt=QuantileTransformer(output_distribution='normal')
    rs=RobustScaler()
    en=ElasticNet(random_state=123)
    en_pipeline=Pipeline([('Cat_Encoder',cat_encoder),
                    ('Quantile transformer',qt),
                    ('Scaling',rs),
                    ('Elastic Net',en)])

    # 3-fold CV; the scorer returns negated RMSE, hence the sign flip below.
    en_scores=cross_val_score(en_pipeline,X,y,cv=3,scoring='neg_root_mean_squared_error')

    # cross_val_score only fits clones of the estimator, so en_pipeline is
    # still unfitted at this point. Fit it on the full data so that the
    # artifact logged below is a usable model (the other model cells also
    # fit before logging).
    en_pipeline.fit(X,y)

    mlflow.log_metric("rmse", -en_scores.mean())
    mlflow.sklearn.log_model(en_pipeline, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

default artifacts URI: 'mlflow-artifacts:/1/20966c64a4d6468c9e968c1ee5e7b342/artifacts'


I will use lightgbm as it can handle categorical features and it works fast too.

LGBM can handle categorical features when the data types of these features are changed to "category". However, the [documentation](http://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html) mentions that for high-cardinality features it is better to convert the categorical data to numeric values.

In [15]:
# LightGBM only treats a column as categorical when its dtype is
# 'category', so cast every object column of a copy of X accordingly.
object_columns = X.select_dtypes(include='object').columns
X_lightgbm = X.copy().astype({column: 'category' for column in object_columns})

X_lightgbm.dtypes

index                     int64
channelId              category
videoCategoryId           int64
channelViewCount          int64
videoCount                int64
subscriberCount           int64
videoId                category
channelelapsedtime        int64
channelCommentCount       int64
videoViewCount            int64
elapsedtime               int64
videoDislikeCount         int64
videoPublished         category
VideoCommentCount         int64
dtype: object

In [50]:
with mlflow.start_run():

    mlflow.set_tag("model", "LightGBM Regressor")
    mlflow.log_param("train-data-path", "data/YouTubeDataset_withChannelElapsed.csv")

    # Fit on the full data first so the logged model is a trained one.
    lightgbm_reg = lgb.LGBMRegressor(random_state=123, verbose=-1)
    lightgbm_reg.fit(X_lightgbm, y)

    # 3-fold CV; the scorer returns negated RMSE, so flip the sign when logging.
    scores = cross_val_score(
        lightgbm_reg,
        X_lightgbm,
        y,
        scoring='neg_root_mean_squared_error',
        cv=3,
    )
    mean_rmse = -scores.mean()

    mlflow.log_metric("rmse", mean_rmse)
    mlflow.sklearn.log_model(lightgbm_reg, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



default artifacts URI: 'mlflow-artifacts:/1/f9ff37b0b6c9469f82ae8bf9a281b3be/artifacts'


In [52]:
with mlflow.start_run():

    mlflow.set_tag("model", "LightGBM Regressor Pipeline")
    mlflow.log_param("train-data-path", "data/YouTubeDataset_withChannelElapsed.csv")

    # Build a dedicated encoder for this pipeline instead of reusing the
    # `cat_encoder` instance created in the Elastic Net cell: that made this
    # cell fail unless the Elastic Net cell had been run first, and shared one
    # mutable encoder object between two pipelines. The encoded columns are
    # the 'category'-typed columns of X_lightgbm.
    light_cat_encoder=ce.CatBoostEncoder(
        cols=list(X_lightgbm.select_dtypes(include='category').columns))

    lightgbm_reg=lgb.LGBMRegressor(random_state=123,verbose=-1)
    light_pipeline=Pipeline([('Cat_Encoder',light_cat_encoder), ('LightGBM',lightgbm_reg)])

    # Fit on the full data so the logged pipeline is a trained model.
    light_pipeline.fit(X_lightgbm,y)

    # 3-fold CV; the scorer returns negated RMSE, hence the sign flip below.
    scores = cross_val_score(light_pipeline, X_lightgbm, y, scoring='neg_root_mean_squared_error', cv=3)

    mlflow.log_metric("rmse", -scores.mean())
    mlflow.sklearn.log_model(light_pipeline, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



default artifacts URI: 'mlflow-artifacts:/1/95b2a8a6727c442f938561468ee988eb/artifacts'


In [20]:
with mlflow.start_run():

    mlflow.set_tag("model", "CatBoost Regressor")
    mlflow.log_param("train-data-path", "data/YouTubeDataset_withChannelElapsed.csv")

    # CatBoost is told which columns are categorical: the object-typed ones.
    categorical_columns = list(X.select_dtypes(include='object').columns)
    cat = CatBoostRegressor(
        random_state=123,
        cat_features=categorical_columns,
        verbose=False,
    )

    # Fit on the full data first so the logged model is a trained one.
    cat.fit(X, y)

    # 3-fold CV; the scorer returns negated RMSE, so flip the sign when logging.
    scores = cross_val_score(
        cat,
        X,
        y,
        scoring='neg_root_mean_squared_error',
        cv=3,
    )
    mean_rmse = -scores.mean()

    mlflow.log_metric("rmse", mean_rmse)
    mlflow.sklearn.log_model(cat, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")



default artifacts URI: 'mlflow-artifacts:/1/d7279cc9f2a74888ba5ccf3ac0fd77f4/artifacts'


Let's see which model performed the best

In [15]:
# Query the tracking server for the runs of experiment '1', best (lowest)
# RMSE first.
client = MlflowClient("http://127.0.0.1:5000")
rmse_ascending = ["metrics.rmse ASC"]
runs = client.search_runs(experiment_ids='1', order_by=rmse_ascending)

In [16]:
# One summary line per run: id, model tag, RMSE and wall-clock duration.
for run in runs:
    run_info = run.info
    # start_time / end_time are divided by 1000 to report seconds.
    duration_seconds = (run_info.end_time - run_info.start_time) / 1000
    print(
        f"run id: {run_info.run_id}, model name: {run.data.tags['model']},"
        f"rmse: {run.data.metrics['rmse']:.4f}, duration(s): {duration_seconds:.2f}"
    )

run id: 5d4e1c34b7004f218fb70948f0c4fdfd, model name: LightGBM Regressor Pipeline,rmse: 2672.0905, duration(s): 38.20
run id: bbdfddcf7a3f460bba46b24978de3707, model name: LightGBM Regressor,rmse: 2717.1013, duration(s): 12.48
run id: d7279cc9f2a74888ba5ccf3ac0fd77f4, model name: CatBoost Regressor,rmse: 2933.2253, duration(s): 931.68
run id: 20966c64a4d6468c9e968c1ee5e7b342, model name: Elastic Net Pipeline,rmse: 4461.3284, duration(s): 32.15


We can see that the LightGBM Regressor Pipeline performed the best, but it took more than double the time of a simple LightGBM Regressor. The simple LGBM Regressor gives a similar RMSE in much less time, so let's use that.

*Note for Hyperparameter Tuning: I will convert my dataset into the `lightgbm.Dataset` type as it will make the computation more efficient. [More info can be found here.](https://stackoverflow.com/questions/65924856/lightgbm-intent-of-lightgbm-dataset). The hyperparameter tuning process was taking too long, so I didn't continue with it.*

In [23]:
# Register the model artifact of the chosen run in the MLflow model
# registry under the name 'yt-likes-regressor'.
run_id = "bbdfddcf7a3f460bba46b24978de3707"
chosen_model_uri = f"runs:/{run_id}/models"
mlflow.register_model(model_uri=chosen_model_uri, name='yt-likes-regressor')

Successfully registered model 'yt-likes-regressor'.
2024/01/15 20:35:15 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: yt-likes-regressor, version 1
Created version '1' of model 'yt-likes-regressor'.


<ModelVersion: aliases=[], creation_timestamp=1705379715817, current_stage='None', description='', last_updated_timestamp=1705379715817, name='yt-likes-regressor', run_id='bbdfddcf7a3f460bba46b24978de3707', run_link='', source='mlflow-artifacts:/1/bbdfddcf7a3f460bba46b24978de3707/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [24]:
# Check that the model was registered by listing the registry's models.

client.search_registered_models()

[<RegisteredModel: aliases={}, creation_timestamp=1705379715706, description='', last_updated_timestamp=1705379715817, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1705379715817, current_stage='None', description='', last_updated_timestamp=1705379715817, name='yt-likes-regressor', run_id='bbdfddcf7a3f460bba46b24978de3707', run_link='', source='mlflow-artifacts:/1/bbdfddcf7a3f460bba46b24978de3707/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>], name='yt-likes-regressor', tags={}>]

In [25]:
# Show the latest version(s) of the registered model and their stage.
model_name = "yt-likes-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for registered_version in latest_versions:
    print(f"version: {registered_version.version}, stage: {registered_version.current_stage}")

version: 1, stage: None


  latest_versions = client.get_latest_versions(name=model_name)


In [26]:
# Move version 1 of the registered model to the "Staging" stage without
# archiving any version already in that stage.
model_version = 1
new_stage = "Staging"
client.transition_model_version_stage(
    archive_existing_versions=False,
    stage=new_stage,
    version=model_version,
    name=model_name,
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1705379715817, current_stage='Staging', description='', last_updated_timestamp=1705380276033, name='yt-likes-regressor', run_id='bbdfddcf7a3f460bba46b24978de3707', run_link='', source='mlflow-artifacts:/1/bbdfddcf7a3f460bba46b24978de3707/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [27]:
# Promote version 1 to "Production", archiving any version that currently
# holds that stage.
client.transition_model_version_stage(
    archive_existing_versions=True,
    stage="Production",
    version=1,
    name=model_name,
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1705379715817, current_stage='Production', description='', last_updated_timestamp=1705380478170, name='yt-likes-regressor', run_id='bbdfddcf7a3f460bba46b24978de3707', run_link='', source='mlflow-artifacts:/1/bbdfddcf7a3f460bba46b24978de3707/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>

In [29]:
# Record today's date in the description of model version 1.
date = datetime.today().date()
version_description = f"The model version 1 was transitioned to Production on {date}"

client.update_model_version(
    name=model_name,
    version=1,
    description=version_description,
)

<ModelVersion: aliases=[], creation_timestamp=1705379715817, current_stage='Production', description='The model version 1 was transitioned to Production on 2024-01-15', last_updated_timestamp=1705380535205, name='yt-likes-regressor', run_id='bbdfddcf7a3f460bba46b24978de3707', run_link='', source='mlflow-artifacts:/1/bbdfddcf7a3f460bba46b24978de3707/artifacts/models', status='READY', status_message='', tags={}, user_id='', version='1'>