In [1]:
import os
import sys
import numpy as np
import pandas as pd
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline, Pipeline, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn import set_config
import joblib


In [2]:
#sys.path.append(os.path.abspath('../data'))
df = pd.read_csv("../data/processed/clean_data.csv", encoding = 'utf-8', encoding_errors="ignore")
(df.head())

Unnamed: 0.1,Unnamed: 0,Impressions,From Home,From Hashtags,From Explore,From Other,Saves,Comments,Shares,Likes,...,HashtagCount,HashtagDensity,FromHomeRatio,FromHashtagsRatio,FromExploreRatio,FromOtherRatio,IsWeekend,Month,HourOfDay,NumOfHashtags
0,0,3920,2586,1028,619,56,98,9,5,162,...,1,0.111111,0.659694,0.262245,0.157908,0.014286,0,5,22,22
1,1,5394,2727,1838,1174,78,194,7,14,224,...,1,0.055556,0.505562,0.340749,0.217649,0.014461,0,5,3,18
2,2,4021,2085,1188,0,533,41,11,1,131,...,1,0.071429,0.518528,0.295449,0.0,0.132554,1,1,6,18
3,3,4528,2700,621,932,73,172,10,7,213,...,1,0.058824,0.59629,0.137147,0.20583,0.016122,0,6,16,11
4,4,2518,1704,255,279,37,96,5,4,123,...,1,0.0625,0.676728,0.101271,0.110802,0.014694,0,3,17,29


In [3]:
df = df.drop("Unnamed: 0", axis=1)

In [4]:
df.shape

(119, 28)

In [5]:
df.columns

Index(['Impressions', 'From Home', 'From Hashtags', 'From Explore',
       'From Other', 'Saves', 'Comments', 'Shares', 'Likes', 'Profile Visits',
       'Follows', 'Caption', 'Hashtags', 'Timestamp', 'Content Type',
       'CaptionLength', 'DayOfWeek', 'EngagementRate', 'HashtagCount',
       'HashtagDensity', 'FromHomeRatio', 'FromHashtagsRatio',
       'FromExploreRatio', 'FromOtherRatio', 'IsWeekend', 'Month', 'HourOfDay',
       'NumOfHashtags'],
      dtype='object')

In [6]:
cols = ['From Home', 'From Hashtags', 'From Explore',
       'From Other', 'Saves', 'Comments', 'Shares', 
       'Follows', 'EngagementRate', 'HashtagCount', 'FromHomeRatio',
        'FromHashtagsRatio', 'FromExploreRatio', 'FromOtherRatio',]
df = df.drop(labels=cols, axis=1)

In [7]:
df.head(2)

Unnamed: 0,Impressions,Likes,Profile Visits,Caption,Hashtags,Timestamp,Content Type,CaptionLength,DayOfWeek,HashtagDensity,IsWeekend,Month,HourOfDay,NumOfHashtags
0,3920,162,35,important data visualization every financial d...,#finance#money#business#investing#investment#t...,2024-05-14 22:59:29.747899160,carousel,8,1,0.111111,0,5,22,22
1,5394,224,48,best data science project idea healthcare want...,#healthcare#health#covid#data#datascience#data...,2024-05-24 03:49:54.957983192,carousel,17,4,0.055556,0,5,3,18


In [8]:
df.columns, df.shape

(Index(['Impressions', 'Likes', 'Profile Visits', 'Caption', 'Hashtags',
        'Timestamp', 'Content Type', 'CaptionLength', 'DayOfWeek',
        'HashtagDensity', 'IsWeekend', 'Month', 'HourOfDay', 'NumOfHashtags'],
       dtype='object'),
 (119, 14))

In [9]:
label_enc = LabelEncoder()
df["Content Type"] = label_enc.fit_transform(df["Content Type"])

In [10]:
joblib.dump(value=label_enc, filename="../models/ct_label_enc.pkl")

['../models/ct_label_enc.pkl']

In [11]:
df.columns

Index(['Impressions', 'Likes', 'Profile Visits', 'Caption', 'Hashtags',
       'Timestamp', 'Content Type', 'CaptionLength', 'DayOfWeek',
       'HashtagDensity', 'IsWeekend', 'Month', 'HourOfDay', 'NumOfHashtags'],
      dtype='object')

In [12]:
df.head()

Unnamed: 0,Impressions,Likes,Profile Visits,Caption,Hashtags,Timestamp,Content Type,CaptionLength,DayOfWeek,HashtagDensity,IsWeekend,Month,HourOfDay,NumOfHashtags
0,3920,162,35,important data visualization every financial d...,#finance#money#business#investing#investment#t...,2024-05-14 22:59:29.747899160,0,8,1,0.111111,0,5,22,22
1,5394,224,48,best data science project idea healthcare want...,#healthcare#health#covid#data#datascience#data...,2024-05-24 03:49:54.957983192,0,17,4,0.055556,0,5,3,18
2,4021,131,62,learn train machine learning model giving inpu...,#data#datascience#dataanalysis#dataanalytics#d...,2024-01-13 06:27:13.613445378,1,13,5,0.071429,1,1,6,18
3,4528,213,23,here write python program detect whether sente...,#python#pythonprogramming#pythonprojects#pytho...,2024-06-17 16:44:22.184873950,0,16,0,0.058824,0,6,16,11
4,2518,123,8,plotting annotation visualizing data considere...,#datavisualization#datascience#data#dataanalyt...,2024-03-20 17:56:58.487394958,2,15,2,0.0625,0,3,17,29


In [13]:
num_pipeline_columns = ['Impressions', 'Likes', 'Profile Visits',
       'CaptionLength', 'DayOfWeek',
       'HashtagDensity', 'IsWeekend', 'Month', 
       'HourOfDay', 'NumOfHashtags']

In [14]:
def to_datetime(X):
    return pd.to_datetime(X.squeeze(), errors='coerce').values.astype('datetime64[s]').reshape(-1, 1)

timestamp_transformer = FunctionTransformer(to_datetime)


In [15]:
text_features = ['Caption', 'Hashtags']
scale_features = ['CaptionLength', 'DayOfWeek', 'Month',
                  'HourOfDay', 'NumOfHashtags']
passthrough_features = ['HashtagDensity', 'IsWeekend', 'Content Type']
timestamp_feature = ['Timestamp']

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'Caption'),  
        ('hashtags', TfidfVectorizer(), 'Hashtags'),  
        ('scale', StandardScaler(), scale_features)
    ]
)

In [17]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(objective='reg:squarederror', random_state=42))
])

In [18]:
param_grid = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.3],
    'model__subsample': [0.8, 1.0]
}

In [19]:
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=1,
    n_jobs=-1
)

In [40]:
from sklearn.linear_model import Ridge

param_grid = {
    "ridge__alpha": [0.001, 0.01, 0.1, 1, 10, 50, 100, 500, 1000]
}

ridge = Ridge(random_state=42)

In [41]:
pipeline2= Pipeline([
    ("prepro", preprocessor),
    ("ridge",ridge )
])


ridge_grid_search = GridSearchCV(
    pipeline2, 
    param_grid, 
    cv=5, 
    scoring="r2",
    n_jobs=-1
)

In [46]:
from sklearn.ensemble import GradientBoostingRegressor
pipeline3 = Pipeline([
    ("pre", preprocessor),
    ("gbr",GradientBoostingRegressor(random_state=42) )
]) 
 

gbr_param_grid = {
    'gbr__n_estimators': [200, 300, 400],
    'gbr__learning_rate': [0.01, 0.05, 0.1],
    'gbr__max_depth' : [ 3, 4, 5 ],
    'gbr__min_samples_split' :[ 2,3,4],
    'gbr__min_samples_leaf' :[2, 3, 4],
    'gbr__subsample': [0.8, 1]
}

In [47]:
gbr_grid_seacrh = GridSearchCV(
    pipeline3,
    gbr_param_grid,
    cv = 5,
    scoring="r2",
    n_jobs=-1,
    verbose=2
)

In [20]:
sys.path.append(os.path.abspath('../'))

# for impressions 

In [21]:
y_imp =     df["Impressions"]
y_likes = df["Likes"]
y_profile = df["Profile Visits"]

In [22]:
input_cols = [ 'CaptionLength', 'DayOfWeek', 
              'HashtagDensity', 'IsWeekend',
                'Month','HourOfDay', 'Content Type',
                'NumOfHashtags', 'Caption', 'Hashtags'
]

In [23]:
X = df[input_cols]
X.head()

Unnamed: 0,CaptionLength,DayOfWeek,HashtagDensity,IsWeekend,Month,HourOfDay,Content Type,NumOfHashtags,Caption,Hashtags
0,8,1,0.111111,0,5,22,0,22,important data visualization every financial d...,#finance#money#business#investing#investment#t...
1,17,4,0.055556,0,5,3,0,18,best data science project idea healthcare want...,#healthcare#health#covid#data#datascience#data...
2,13,5,0.071429,1,1,6,1,18,learn train machine learning model giving inpu...,#data#datascience#dataanalysis#dataanalytics#d...
3,16,0,0.058824,0,6,16,0,11,here write python program detect whether sente...,#python#pythonprogramming#pythonprojects#pytho...
4,15,2,0.0625,0,3,17,2,29,plotting annotation visualizing data considere...,#datavisualization#datascience#data#dataanalyt...


In [24]:
imp_model = grid_search.fit(X, y_imp)
best_imp_model = imp_model.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [25]:
best_imp = imp_model.best_estimator_
print(best_imp_model)

{'model__learning_rate': 0.01, 'model__max_depth': 7, 'model__n_estimators': 100, 'model__subsample': 0.8}


In [26]:
imp_pred = imp_model.predict(X.iloc[[3]])
print(imp_pred, y_imp.iloc[3])



[4711.8237] 4528


In [27]:
like_model = grid_search.fit(X, y_likes)
like_model.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


{'model__learning_rate': 0.01,
 'model__max_depth': 7,
 'model__n_estimators': 200,
 'model__subsample': 0.8}

In [28]:
like_pred = like_model.predict(X.iloc[[3]])
print(like_pred, y_likes.iloc[3])

[197.7836] 213


In [29]:
profile_model = grid_search.fit(X, y_profile)
profile_model.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


{'model__learning_rate': 0.01,
 'model__max_depth': 5,
 'model__n_estimators': 200,
 'model__subsample': 0.8}

In [30]:
profile_pred = profile_model.predict(X.iloc[[3]])
print(profile_pred, y_profile.iloc[3])

[28.22146] 23


In [42]:
imp_rigde = ridge_grid_search.fit(X, y_imp)
imp_rigde.best_params_

{'ridge__alpha': 10}

In [48]:
gbr_imp = gbr_grid_seacrh.fit(X, y_imp)
gbr_imp.best_params_


Fitting 5 folds for each of 486 candidates, totalling 2430 fits


{'gbr__learning_rate': 0.01,
 'gbr__max_depth': 5,
 'gbr__min_samples_leaf': 4,
 'gbr__min_samples_split': 2,
 'gbr__n_estimators': 200,
 'gbr__subsample': 0.8}

In [31]:
sys.path.append(os.path.abspath('../'))
joblib.dump(value=best_imp, filename="../models/ml_models/best_imp_model.pkl")

['../models/ml_models/best_imp_model.pkl']

In [32]:
joblib.dump(value=like_model, filename="../models/ml_models/like_model.pkl")

joblib.dump(value=profile_model, filename="../models/ml_models/profile_model.pkl")

['../models/ml_models/profile_model.pkl']