In [212]:
import numpy as np
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
from google.cloud import bigquery
import re
from sklearn.preprocessing import LabelEncoder

In [213]:
# Create a BigQuery client
client = bigquery.Client.from_service_account_json('/content/drive/MyDrive/tech-cali-b2c-72b3e690e309-Compute-Engine.json')

# Define your BigQuery table details
project_id = 'tech-cali-b2c'
dataset_id = 'CE_Analytics_Layer'
table_id = 'Creator_Social_Profile'

# Create a reference to the BigQuery table
table_ref = client.dataset(dataset_id).table(table_id)

# Retrieve the data from BigQuery into a DataFrame
df_mod = client.query(f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`").to_dataframe()

In [214]:
df = df_mod.drop_duplicates(subset=['artist_id'], keep='first')
df = df[df['artist_id'] != ""]

In [215]:
df.iloc[:,3:] = df.iloc[:,3:].fillna(0)

In [217]:
cols = ['Creator_ID', 'name', 'country_code', 'genres', 'artist_id']
columns_select = list(df.drop(cols, axis=1).columns)
for col in columns_select:
    df[col] = df[col].astype('float')

In [224]:
df_fi = df.copy()

**Data Preprocessing**

In [231]:
# Label Encoding for Country code
le = LabelEncoder()
df_fi['country_code_label'] = le.fit_transform(df_fi['country_code'])

In [232]:
# Genre Score
genre_mapping = {'pop' : r'\b\w*pop\w*\b',
                'hip-hop': r'\b\w*hip-hop\w*\b',
                'hip-hop': r'\b\w*hip hop\w*\b',
                'rap': r'\b\w*rap\w*\b',
                'jazz': r'\b\w*jazz\w*\b',
                'rock': r'\b\w*rock\w*\b',
                'latin':r'\b\w*latin\w*\b',
                'psychedelic':r'\b\w*psychedelic\w*\b',
                'punk':r'\b\w*punk\w*\b',
                'metal':r'\b\w*metal\w*\b',
                'reggae': r'\b\w*reggae\w*\b'}

def map_genres(genre_list):
    new_lst = []
    for value in genre_list.split(','):
        for genre, pattern in genre_mapping.items():
            if re.search(pattern, value, flags=re.IGNORECASE):
                new_lst.append(genre.strip())
                break
        else:
            new_lst.append(value.strip())
    return ",".join(new_lst)

df_fi['New_Genres'] = df_fi['genres'].apply(map_genres)

In [233]:
unique_values = sorted(df_fi['New_Genres'].str.split(',').explode().unique())
i = len(unique_values)
print(i)

1117


In [234]:
value_mapping = {genre:i+1 for i, genre in enumerate(unique_values)}
def calculate_genre_score(genre_list):
    genres = genre_list.split(',')
    return sum(value_mapping.get(genre, 0) for genre in genres)

df_fi['Genre_Score'] = df_fi['New_Genres'].apply(calculate_genre_score)

In [235]:
df_new = df_fi.drop(['Creator_ID', 'name', 'country_code', 'genres', 'New_Genres', 'artist_id'], axis=1)
df_rf = df_new.drop(df_new.filter(regex='_z_score|_z_score_label|_zscore'), axis=1)

df_filled = df_rf.fillna(df_rf.median())

In [236]:
# # Mice imputation
# from fancyimpute import IterativeImputer

# data = df_rf.values

# imputer = IterativeImputer()
# imputed_data = imputer.fit_transform(data)

# df_filled = pd.DataFrame(imputed_data, columns=df_rf.columns)

In [237]:
# Multicollinearity Check using correlation matrix

correlation_matrix = df_filled.corr()
correlation_matrix

In [238]:
# Multicollinearity Check using VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data['Features'] = df_filled.columns
vif_data['Variance_Inflation_Factor'] = [variance_inflation_factor(df_filled.values, i) for i in range(df_filled.shape[1])]

In [239]:
vif_data

Unnamed: 0,Features,Variance_Inflation_Factor
0,spotify_followers,4.885527
1,spotify_monthly_listeners,4.891506
2,facebook_likes,3.61812
3,facebook_talks,1.222217
4,twitter_followers,2.445107
5,twitter_retweets,1.032242
6,youtube_channel_views,10.172065
7,youtube_daily_video_views,73.431151
8,youtube_monthly_video_views,75.075265
9,TikTok_followers,146.134777


In [240]:
# Cannon imputation
# from impyute.imputation.cs import fast_knn

# imputed_data = fast_knn(data, k=5)
# df_filled = pd.DataFrame(imputed_data, columns=df_rf.columns)

**Feature Selection - PCA**

In [242]:
from sklearn.decomposition import PCA

n_components = df_filled.shape[1]
pca = PCA(n_components=n_components)

principal_components = pca.fit_transform(df_filled)

components_df = pd.DataFrame(
    data=principal_components,
    columns=['PC{}'.format(i) for i in range(1, n_components + 1)]
)

explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio_cumulative = np.cumsum(explained_variance_ratio)

# for i, ratio in enumerate(explained_variance_ratio, 1):
#     print(f'PC{i}: {ratio:.4f}')

for i, ratio in enumerate(explained_variance_ratio_cumulative, 1):
    print(f'PC{i}: {ratio:.4f}')

PC1: 0.7858
PC2: 0.9997
PC3: 0.9999
PC4: 1.0000
PC5: 1.0000
PC6: 1.0000
PC7: 1.0000
PC8: 1.0000
PC9: 1.0000
PC10: 1.0000
PC11: 1.0000
PC12: 1.0000
PC13: 1.0000
PC14: 1.0000
PC15: 1.0000
PC16: 1.0000
PC17: 1.0000
PC18: 1.0000
PC19: 1.0000
PC20: 1.0000
PC21: 1.0000
PC22: 1.0000
PC23: 1.0000
PC24: 1.0000
PC25: 1.0000
PC26: 1.0000
PC27: 1.0000
PC28: 1.0000


In [243]:
threshold = 0.90

loadings = pca.components_

num_components = np.sum(explained_variance_ratio_cumulative <= threshold) + 1

important_features = components_df.columns[:num_components]

print(f"Important Features: {', '.join(important_features)}")

loadings_abs = np.abs(loadings) 
important_features_indices = np.argsort(loadings_abs, axis=1)[:, ::-1][:, :num_components]  
important_features_names = df_filled.columns[important_features_indices.flatten()].tolist()
important_features_names = np.array(important_features_names).reshape((num_components, -1))

print(len(important_features))

Important Features: PC1, PC2
2


In [244]:
df_pp = df_filled.drop(['youtube_daily_video_views','youtube_channel_views',
           'TikTok_Genz_followers_percentage','Instagram_Genz_followers_percentage', 'Youtube_Genz_subscribers_percentage','TikTok_number_of_top_tracks',
           'TikTok_avg_creations_for_top_n_tracks','TikTok_avg_views_for_top_n_tracks'], axis=1)

**Standard Scaling**

In [245]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(df_pp)

scaled_data = scaler.transform(df_pp)

column_names = scaler.get_feature_names_out()

scaled_df = pd.DataFrame(scaled_data, columns=column_names)

**Train Test Split, Model Building and Hyperparameter Tuning**

In [246]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

X = scaled_df.drop('TikTok_total_creations_for_top_n_tracks', axis=1)
y = scaled_df['TikTok_total_creations_for_top_n_tracks']

test_size_lst = [0.1, 0.2, 0.3, 0.4, 0.5]  
test_size_lr = {}
test_size_dtr = {}
test_size_rf1 = {}
test_size_rft = {}
test_size_xgb1 = {}
test_size_xgbt = {}
test_size_ridge = {}
test_size_lasso = {}
test_size_svm = {}
test_size_enet = {}

for test_size in test_size_lst:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=123)
    
    lr = LinearRegression()
    scores_lr = cross_val_score(lr, X_train, y_train, cv=5)
    avg_score_lr = scores_lr.mean()
    test_size_lr[test_size] = avg_score_lr
    
    ridge = Ridge()
    scores_ridge = cross_val_score(ridge, X_train, y_train, cv=5)
    avg_score_ridge = scores_ridge.mean()
    test_size_ridge[test_size] = avg_score_ridge
    
    lasso = Lasso()
    scores_lasso = cross_val_score(lasso, X_train, y_train, cv=5)
    avg_score_lasso = scores_lasso.mean()
    test_size_lasso[test_size] = avg_score_lasso
    
    alpha = 0.5  
    l1_ratio = 0.5  
    enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    scores_enet = cross_val_score(enet, X_train, y_train, cv=5)
    avg_score_enet = scores_enet.mean()
    test_size_enet[test_size] = avg_score_enet
    
    svm = SVR(kernel="linear")
    scores_svm = cross_val_score(svm, X_train, y_train, cv=5)
    avg_score_svm = scores_svm.mean()
    test_size_svm[test_size] = avg_score_svm
    
    dtr = DecisionTreeRegressor()
    scores_dtr = cross_val_score(dtr, X_train, y_train, cv=5)
    avg_score_dtr = scores_dtr.mean()
    test_size_dtr[test_size] = avg_score_dtr
    
    rf_1 = RandomForestRegressor()
    scores_rf_1 = cross_val_score(rf_1, X_train, y_train, cv=5)
    avg_score_rf_1 = scores_rf_1.mean()
    test_size_rf1[test_size] = avg_score_rf_1
    
    param_grid = [{"n_estimators": [100, 200, 300, 400, 500],
              "max_depth": [7, 9, 12, 15],
              "min_samples_split": [5, 7, 12]}]

    rf_t = RandomForestRegressor()
    grid_cv_1 = GridSearchCV(rf_t, param_grid, cv=2, n_jobs=-1)
    scores_rf_t = cross_val_score(grid_cv_1, X_train, y_train, cv=5)
    avg_score_rf_t = scores_rf_t.mean()
    test_size_rft[test_size] = avg_score_rf_t
    
    xg_boost = xgb.XGBRegressor()
    scores_xgb = cross_val_score(xg_boost, X_train, y_train, cv=5)
    avg_score_xgb = scores_xgb.mean()
    test_size_xgb1[test_size] = avg_score_xgb
    
    param_grid = [{'n_estimators': [100, 200, 300],
              'max_depth': [3, 5, 7, 9],
              'learning_rate': [0.01, 0.001]}]

    xg_b_t = xgb.XGBRegressor()
    grid_cv_2 = GridSearchCV(xg_b_t, param_grid, cv=2, n_jobs=-1)
    scores_xgb_t = cross_val_score(grid_cv_2, X_train, y_train, cv=5)
    avg_score_xgb_t = scores_xgb_t.mean()
    test_size_xgbt[test_size] = avg_score_xgb_t

In [247]:
test_size_rft

{0.1: 0.5580469793496854,
 0.2: 0.5878548535886408,
 0.3: 0.5738202202751733,
 0.4: 0.5456194649593641,
 0.5: 0.46634906999830295}

In [248]:
ts_lr, cv_lr = max(test_size_lr.items(), key=lambda x: x[1])
ts_ridge, cv_ridge = max(test_size_ridge.items(), key=lambda x: x[1])
ts_lasso, cv_lasso = max(test_size_lasso.items(), key=lambda x: x[1])
ts_enet, cv_enet = max(test_size_enet.items(), key=lambda x: x[1])
ts_svm, cv_svm = max(test_size_svm.items(), key=lambda x: x[1])
ts_dtr, cv_dtr = max(test_size_dtr.items(), key=lambda x: x[1])
ts_rf1, cv_rf1 = max(test_size_rf1.items(), key=lambda x: x[1])
ts_rft, cv_rft = max(test_size_rft.items(), key=lambda x: x[1])
ts_xgb1, cv_xgb1 = max(test_size_xgb1.items(), key=lambda x: x[1])
ts_xgbt, cv_xgbt = max(test_size_xgbt.items(), key=lambda x: x[1])

In [249]:
test_size_lr.items()

dict_items([(0.1, 0.5347963534627089), (0.2, 0.5469948246315447), (0.3, 0.5431233075156031), (0.4, 0.5389095977075515), (0.5, 0.48314856957127095)])

In [250]:
model_scores = {
    cv_lr: (lr, "Linear Regression", cv_lr, ts_lr),
    cv_ridge: (ridge, "Ridge Regression", cv_ridge, ts_ridge),
    cv_lasso: (lasso, "Lasso Regression", cv_lasso, ts_lasso),
    cv_enet : (enet, "Elastic Net Regression", cv_enet, ts_enet),
    cv_svm: (svm, "SVM", cv_svm, ts_svm),
    cv_dtr: (dtr, "Decision Tree", cv_dtr, ts_dtr),
    cv_rf1: (rf_1, "Random Forest (Before Tuning)", cv_rf1, ts_rf1),
    cv_rft: (grid_cv_1, "Random Forest (After Tuning)", cv_rft, ts_rft),
    cv_xgb1: (xg_boost, "XGBoost (Before Tuning)", cv_xgb1, ts_xgb1),
    cv_xgbt: (grid_cv_2, "XGBoost (After Tuning)", cv_xgbt, ts_xgbt)
}

best_score = max(model_scores.keys())
best_model, best_model_name, best_score, best_test_size = model_scores[best_score]

In [251]:
# Function to calculate Metrics
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

score_log = pd.DataFrame()
def score(model, name , x_test, y_test, position):
    y_pred = model.predict(x_test)
    score_log.loc[position, "Model"] = best_model_name
    score_log.loc[position, "R-Square"] = round(r2_score(y_test, y_pred), 3)
    score_log.loc[position,"MSE"] = round(mean_squared_error(y_test, y_pred), 3)
    score_log.loc[position, "RMSE"] = round(np.sqrt(mean_squared_error(y_test, y_pred)), 3)
    score_log.loc[position, "MAE"] = round(mean_absolute_error(y_test, y_pred), 3)
    return score_log

In [252]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=best_test_size, random_state=123)

best_model.fit(X_train, y_train)

score(best_model," ", X_test, y_test, 1)

Unnamed: 0,Model,R-Square,MSE,RMSE,MAE
1,Random Forest (After Tuning),0.301,0.308,0.555,0.218


**Feature Importance**

In [253]:
import shap
import numpy as np
def model_predict(X):
    return best_model.predict(X)

explainer = shap.Explainer(model_predict, X)

shap_values = explainer.shap_values(X)

feature_scores = np.abs(shap_values).mean(axis=0)

Permutation explainer: 3649it [04:15, 13.73it/s]                                                                       


In [255]:
# feature_scores = best_model.feature_importances_

feature_imp = pd.DataFrame([feature_scores], columns = X.columns)

feature_imp = feature_imp.reset_index(drop=True)

feature_imp

Unnamed: 0,spotify_followers,spotify_monthly_listeners,facebook_likes,facebook_talks,twitter_followers,twitter_retweets,youtube_monthly_video_views,TikTok_followers,TikTok_Genz_followers,TikTok_avg_engagements_per_post,TikTok_total_views_for_top_n_tracks,Instagram_followers,Instagram_Genz_followers,Instagram_avg_engagements_per_post,Youtube_subscribers,Youtube_Genz_subscribers,Youtube_avg_engagements_per_post,country_code_label,Genre_Score
0,0.019816,0.016518,0.008453,0.0032,0.003826,0.001508,0.00919,0.008041,0.00957,0.002914,0.417158,0.017655,0.01928,0.013025,0.009894,0.024955,0.008373,0.002144,0.025692


In [None]:
# Define your BigQuery project ID and credentials
project_id = 'tech-cali-b2c'
credentials = service_account.Credentials.from_service_account_file('/content/drive/MyDrive/tech-cali-b2c-72b3e690e309-Compute-Engine.json')

# Define the BigQuery table name and dataset ID
dataset_id = 'CE_ML_Layer'
table_name = 'Feature_Importance'

# Write the dataframe to BigQuery
pandas_gbq.to_gbq(feature_imp, f'{dataset_id}.{table_name}', project_id=project_id, if_exists='replace', credentials=credentials)

print('Data moved to BigQuery successfully!')