In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from gensim import corpora, models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import numpy as np
import pandas as pd
from google.oauth2 import service_account
import pandas_gbq
from google.cloud import bigquery

In [None]:
# Create a BigQuery client from a service-account key file.
# NOTE(review): the key path is hard-coded to a Colab Drive mount; consider
# supplying it through configuration instead of the notebook source.
client = bigquery.Client.from_service_account_json('/content/drive/MyDrive/tech-cali-b2c-72b3e690e309-Compute-Engine.json')

# Define your BigQuery table details
project_id = 'tech-cali-b2c'
dataset_id = 'CE_Analytics_Layer'
table_id = 'Creator_Social_Profile'

# Reference to the source table. Built from DatasetReference because
# Client.dataset() is deprecated in google-cloud-bigquery; the project is
# passed explicitly so the reference matches the query below.
table_ref = bigquery.DatasetReference(project_id, dataset_id).table(table_id)

# Retrieve the whole table from BigQuery into a DataFrame
df_mod = client.query(f"SELECT * FROM `{project_id}.{dataset_id}.{table_id}`").to_dataframe()

In [None]:
# Keep the first row seen for each artist_id, then discard rows whose
# artist_id is an empty string.
# .copy() makes df an independent frame: the following cells assign into df
# in place, and doing that on a filtered slice of df_mod can raise
# SettingWithCopyWarning.
df = (
    df_mod
    .drop_duplicates(subset=['artist_id'], keep='first')
    .loc[lambda frame: frame['artist_id'] != ""]
    .copy()
)

In [None]:
# Replace missing values and empty strings with 0 in every column from
# position 3 onward.
# NOTE(review): the slice is positional, so which columns are touched depends
# on the column order of the source table — confirm the text columns
# (e.g. 'genres') are not inside this range.
trailing_columns = df.iloc[:, 3:]
df.iloc[:, 3:] = trailing_columns.fillna('').replace('', 0)

In [None]:
# Identifier / text columns that must keep their original dtype.
cols = ['Creator_ID', 'name', 'country_code', 'genres', 'artist_id']

# Every remaining column is cast to float in a single astype call.
columns_select = df.drop(columns=cols).columns.tolist()
df = df.astype({column: 'float' for column in columns_select})

In [None]:
# Independent (deep) copy used for the feature-engineering steps below, so
# columns added to df_fi never appear on df.
df_fi = df.copy(deep=True)

In [None]:
# Topic Modeling Code: Genre_Score (Topic distribution)
def get_genre_score(topic_distribution):
    """Return the genre score for one row's topic distribution.

    The index of the largest entry in ``topic_distribution`` selects a row of
    the module-level ``lda.components_`` matrix; the value in column 0 of that
    row is returned.

    Args:
        topic_distribution: array-like exposing ``argmax()``, one entry per
            topic of the fitted ``lda`` model.

    Returns:
        The entry ``lda.components_[dominant_topic][0]``.
    """
    dominant_topic = topic_distribution.argmax()
    return lda.components_[dominant_topic][0]

# TF-IDF representation of the genre strings (English stop words removed).
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(df_fi["genres"])

# Fit a 10-topic LDA model (fixed seed) and get the topic mixture of each row.
lda = LatentDirichletAllocation(n_components=10, random_state=0)
topic_distributions = lda.fit(X).transform(X)

# One score per row, looked up from the row's dominant topic.
major_genres = [
    get_genre_score(topic_distributions[row])
    for row in range(len(df_fi))
]

df_fi['genre_score'] = major_genres

In [None]:
# Topic Modeling (Major Genre Label):
# Split each comma-separated genre string into its individual genres.
genres_list = [genre_string.split(', ') for genre_string in df_fi['genres']]
dictionary = corpora.Dictionary(genres_list)
corpus = [dictionary.doc2bow(tokens) for tokens in genres_list]

num_topics = 1  # Number of topics you want to extract (in this case, major genre)
# NOTE(review): lda_model is trained here, but the label below is derived
# purely from counting and never reads the model — confirm it is still needed.
lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)

# Major genre per row = the genre occurring most often in that row's list;
# on a tie the genre that appears first wins.
major_genres = [max(tokens, key=tokens.count) for tokens in genres_list]

df_fi['major_genre'] = major_genres



**Data Preprocessing**

In [None]:
# Columns removed before modelling: identifiers / free text plus the metrics
# that are excluded from the feature set.
excluded_columns = [
    'Creator_ID', 'name', 'country_code', 'genres', 'artist_id',
    'youtube_daily_video_views', 'youtube_channel_views',
    'TikTok_Genz_followers_percentage', 'Instagram_Genz_followers_percentage',
    'TikTok_number_of_top_tracks',
    'TikTok_avg_creations_for_top_n_tracks', 'TikTok_avg_views_for_top_n_tracks',
    'Youtube_Genz_subscribers_percentage',
]
df_rf = df_fi.drop(columns=excluded_columns)

# 'major_genre' is a text label added by the genre-label cell. Left in place
# it reaches the median fill and StandardScaler as a string column, which
# breaks a top-to-bottom run. errors='ignore' keeps this cell working when the
# label cell has not been executed.
df_rf = df_rf.drop(columns=['major_genre'], errors='ignore')

# Fill any remaining gaps with the per-column median.
df_filled = df_rf.fillna(df_rf.median())

**Standard Scaling**

In [None]:
# Standardise every column of df_filled (the target column included).
# NOTE(review): the scaler is fitted on the full data set before the
# train/test split, so test rows contribute to the scaling statistics —
# confirm this is acceptable for the reported test metrics.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_filled)

# Rebuild a DataFrame carrying the column names the scaler saw during fit.
column_names = scaler.get_feature_names_out()
scaled_df = pd.DataFrame(scaled_data, columns=column_names)

**Train Test Split**

In [None]:
# The target is the (scaled) TikTok creation count; all other columns are features.
target_column = 'TikTok_total_creations_for_top_n_tracks'
y = scaled_df[target_column]
X = scaled_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (2917, 18)
Testing set size: (730, 18)


**Model Building and Hyperparameter Tuning**

In [None]:
# Running log of model metrics; one row per evaluated model.
score_log = pd.DataFrame()


def score(model, name, x_test, y_test, position):
    """Evaluate ``model`` and record its R-Square and RMSE in ``score_log``.

    Args:
        model: fitted estimator exposing ``predict``.
        name: text stored in the "Description" column.
        x_test: features passed to ``model.predict``.
        y_test: true target values compared against the predictions.
        position: row label of ``score_log`` that is written (an existing
            row with the same label is overwritten).

    Returns:
        The module-level ``score_log`` DataFrame after the row was written.
    """
    predicted = model.predict(x_test)

    score_log.loc[position, "Description"] = name

    r_squared = r2_score(y_test, predicted)
    score_log.loc[position, "R-Square"] = round(r_squared, 3)

    # RMSE is the square root of the mean squared error.
    root_mse = np.sqrt(mean_squared_error(y_test, predicted))
    score_log.loc[position, "RMSE"] = round(root_mse, 3)

    return score_log

In [None]:
# Linear Regression
lr = LinearRegression()

# 10-fold cross-validated MSE on the training split; the scorer returns
# negated values, so take the absolute value.
scores = np.abs(
    cross_val_score(lr, X_train, y_train, scoring='neg_mean_squared_error', cv=10)
)

lr.fit(X_train, y_train)

# Log the hold-out metrics under row 1 and display the running table.
score(lr, "LR - Test Error", X_test, y_test, 1)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846


In [None]:
# Decision Tree
# random_state is pinned (same seed as the train/test split): tree
# construction is randomised, so without a seed the logged metrics change
# from run to run.
dtr = DecisionTreeRegressor(random_state=123)
dtr.fit(X_train, y_train)

# Log the hold-out metrics under row 2 and display the running table.
score(dtr, "DT - Test Error", X_test, y_test, 2)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846
2,DT - Test Error,0.107,1.142


In [None]:
# Random Forest
# random_state is pinned (same seed as the train/test split): the forest is
# built with randomness (e.g. bootstrap sampling), so an unseeded model logs
# different metrics on every run.
rf_1 = RandomForestRegressor(random_state=123)
rf_1.fit(X_train, y_train)

# 10-fold cross-validated MSE on the training split; the scorer returns
# negated values, so take the absolute value.
scores = cross_val_score(rf_1, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores = np.abs(scores)

# Log the hold-out metrics under row 3 and display the running table.
score(rf_1, "RF - Before Tuning - Test Error", X_test, y_test, 3)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846
2,DT - Test Error,0.107,1.142
3,RF - Before Tuning - Test Error,0.439,0.905


In [None]:
#Random Forest Tuning
# Grid of 5 x 4 x 3 = 60 parameter combinations.
param_grid = [{"n_estimators": [100, 200, 300, 400, 500],
              "max_depth": [7, 9, 12, 15],
              "min_samples_split": [5, 7, 12]}]

# random_state is pinned (same seed as the train/test split) so that the grid
# search compares parameter settings rather than random fluctuations, and so
# the selected estimator is the same on every run.
rf_t = RandomForestRegressor(random_state=123)
# cv=2: each combination is evaluated with 2-fold cross-validation.
grid_cv = GridSearchCV(rf_t, param_grid, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

best_estimator1 = grid_cv.best_estimator_

# 10-fold cross-validated MSE of the selected estimator on the training split;
# the scorer returns negated values, so take the absolute value.
scores = cross_val_score(best_estimator1, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
scores = np.abs(scores)

# Log the hold-out metrics under row 4 and display the running table.
score(best_estimator1, "RF - After Tuning - Test error", X_test, y_test, 4)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846
2,DT - Test Error,0.107,1.142
3,RF - Before Tuning - Test Error,0.439,0.905
4,RF - After Tuning - Test error,0.474,0.876


In [None]:
#XGBoost
# Baseline gradient-boosted model with library defaults.
xg_boost = xgb.XGBRegressor()
xg_boost.fit(X_train, y_train)

# 10-fold cross-validated MSE on the training split; the scorer returns
# negated values, so take the absolute value.
scores = np.abs(
    cross_val_score(
        xg_boost, X_train, y_train, scoring='neg_mean_squared_error', cv=10
    )
)

# Log the hold-out metrics under row 5 and display the running table.
score(xg_boost, "XGB - Before Tuning - Test Error", X_test, y_test, 5)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846
2,DT - Test Error,0.107,1.142
3,RF - Before Tuning - Test Error,0.439,0.905
4,RF - After Tuning - Test error,0.474,0.876
5,XGB - Before Tuning - Test Error,0.404,0.933


In [None]:
#XGBoost Tuning

# Grid of 3 x 4 x 2 = 24 parameter combinations.
param_grid = [
    {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7, 9],
        'learning_rate': [0.01, 0.001],
    }
]

# Exhaustive search; each combination is evaluated with 2-fold cross-validation.
xg_b_t = xgb.XGBRegressor()
grid_cv = GridSearchCV(xg_b_t, param_grid, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)
best_estimator2 = grid_cv.best_estimator_

# 10-fold cross-validated MSE of the selected estimator on the training split;
# the scorer returns negated values, so take the absolute value.
scores = np.abs(
    cross_val_score(
        best_estimator2, X_train, y_train, scoring='neg_mean_squared_error', cv=10
    )
)

# Log the hold-out metrics under row 6 and display the running table.
score(best_estimator2, "XGB - After Tuning - Test error", X_test, y_test, 6)

Unnamed: 0,Description,R-Square,RMSE
1,LR - Test Error,0.51,0.846
2,DT - Test Error,0.107,1.142
3,RF - Before Tuning - Test Error,0.439,0.905
4,RF - After Tuning - Test error,0.474,0.876
5,XGB - Before Tuning - Test Error,0.404,0.933
6,XGB - After Tuning - Test error,0.478,0.873


In [None]:
#Model with the highest R-Square
# Each candidate is keyed by the score_log row it was logged under (the
# `position` passed to score()). Looking the scores up by that key, instead of
# zipping against the row order of score_log, keeps model and score paired
# even if the cells above were executed in a different order; a missing row
# raises a KeyError instead of silently mispairing.
candidates = {
    1: lr,
    2: dtr,
    3: rf_1,
    4: best_estimator1,
    5: xg_boost,
    6: best_estimator2,
}

r_square = score_log.loc[list(candidates), 'R-Square'].astype(float)
best_position = r_square.idxmax()  # first maximum wins on ties

# (model, R-Square) pair of the winner; model_name is the estimator itself.
# NOTE(review): the winner is chosen on the same test split that is used to
# report its metrics — confirm that is intended.
best_model = (candidates[best_position], r_square.loc[best_position])
model_name = best_model[0]

model_name

**Feature Importance**

In [None]:
def sampled_shapley(model, X, num_samples=1000, random_state=None):
    """Score each feature by how much shuffling it changes the predictions.

    For every feature, the column is randomly permuted ``num_samples`` times.
    Each time the model predicts on the shuffled data and the mean absolute
    per-row difference to the unshuffled predictions is recorded; the feature
    score is the average of those values. Despite the function name this is a
    permutation-based sensitivity measure, not an exact Shapley value.

    The per-row difference is used on purpose: comparing only the *mean*
    prediction before and after shuffling cancels out (exactly so for linear
    models) and yields scores that are pure floating-point noise.

    Args:
        model: fitted estimator exposing ``predict``.
        X: 2-D ``numpy`` array or ``pandas.DataFrame`` of shape
            (instances, features). It is not modified.
        num_samples: number of random permutations per feature (>= 1).
        random_state: optional seed (anything accepted by
            ``numpy.random.default_rng``) for reproducible scores; ``None``
            draws fresh entropy on every call.

    Returns:
        ``numpy.ndarray`` of shape (features,) with one non-negative score per
        feature, in column order.

    Raises:
        ValueError: if ``num_samples`` is smaller than 1.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    if isinstance(X, pd.DataFrame):
        columns = X.columns
        data = X.to_numpy()

        def predict(array):
            # Re-attach the column names so estimators fitted on a DataFrame
            # are given matching feature names at prediction time.
            return np.asarray(model.predict(pd.DataFrame(array, columns=columns)))
    else:
        data = np.asarray(X)

        def predict(array):
            return np.asarray(model.predict(array))

    num_instances, num_features = data.shape
    feature_scores = np.zeros(num_features)

    rng = np.random.default_rng(random_state)

    # Reference predictions on the untouched data, computed once.
    baseline = predict(data)

    # Single working copy; only the column under test is overwritten, and it
    # is restored before moving on to the next feature.
    shuffled = data.copy()

    for i in range(num_features):
        cumulative_importance = 0.0

        for _ in range(num_samples):
            perm = rng.permutation(num_instances)
            shuffled[:, i] = data[perm, i]

            # Mean absolute change of the individual predictions.
            cumulative_importance += np.mean(np.abs(predict(shuffled) - baseline))

        shuffled[:, i] = data[:, i]
        feature_scores[i] = cumulative_importance / num_samples

    return feature_scores

In [None]:
# Score every feature of the selected model on the test split.
feature_scores = sampled_shapley(model_name, X_test, num_samples=1000)

# Single-row frame with one column per feature (default 0-based index).
feature_imp = pd.DataFrame(feature_scores.reshape(1, -1), columns=X.columns)

feature_imp

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Unnamed: 0,spotify_followers,spotify_monthly_listeners,facebook_likes,facebook_talks,twitter_followers,twitter_retweets,youtube_monthly_video_views,TikTok_followers,TikTok_Genz_followers,TikTok_avg_engagements_per_post,TikTok_total_views_for_top_n_tracks,Instagram_followers,Instagram_Genz_followers,Instagram_avg_engagements_per_post,Youtube_subscribers,Youtube_Genz_subscribers,Youtube_avg_engagements_per_post,genre_score
0,3.864964e-18,5.761017e-18,3.998538e-18,4.683753e-18,3.474651e-18,3.533632e-18,3.941292e-18,4.147724e-18,3.330669e-18,4.340278e-18,4.768755e-18,3.864964e-18,4.626508e-18,5.048045e-18,5.719383e-18,3.670675e-18,4.402728e-18,3.649858e-18


In [None]:
# Target project and the service-account credentials used for the upload.
# NOTE(review): the key path is hard-coded to a Colab Drive mount.
project_id = 'tech-cali-b2c'
credentials = service_account.Credentials.from_service_account_file('/content/drive/MyDrive/tech-cali-b2c-72b3e690e309-Compute-Engine.json')

# Destination table, addressed as "<dataset>.<table>".
dataset_id = 'CE_ML_Layer'
table_name = 'Feature_Importance'
destination_table = f'{dataset_id}.{table_name}'

# Write the dataframe to BigQuery; if_exists='replace' overwrites an existing table.
pandas_gbq.to_gbq(
    feature_imp,
    destination_table,
    project_id=project_id,
    if_exists='replace',
    credentials=credentials,
)

print('Data moved to BigQuery successfully!')

100%|██████████| 1/1 [00:00<00:00, 7913.78it/s]

Data moved to BigQuery successfully!



