In [1]:
import random
from scipy.stats import randint
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
df_all = pd.read_csv("data/data_survey&trace&interests_dummy.csv", index_col=0)
df_all

Unnamed: 0,Date,Link,video_id,in_playlist,in_playlist_final,video_id_playlist,video_id_playlist_final,watching_time,video_duration,video_likes,...,When access TikTok_When getting up in the morning,When access TikTok_When nothing to do,When access TikTok_When traveling,When access TikTok_When using the restroom,When access TikTok_When waiting shortly,common_interests,all_interests,jacc_original_interests,num_topics_video,jacc_interests
0,2023-09-19 12:33:07,https://www.tiktokv.com/share/video/7270933407...,7270933407155277102,1,1,0,0,26.0,24.0,1500000.0,...,0,1,1,1,1,2.0,33.0,0.060606,2.0,1.00
1,2023-09-19 12:33:33,https://www.tiktokv.com/share/video/7266282716...,7266282716625161474,1,1,1,1,3.0,9.0,49100.0,...,0,1,1,1,1,1.0,33.0,0.030303,1.0,1.00
2,2023-09-19 12:33:41,https://www.tiktokv.com/share/video/7256194262...,7256194262050802990,1,1,2,2,15.0,470.0,3373.0,...,0,1,1,1,1,2.0,33.0,0.060606,2.0,1.00
3,2023-09-19 12:33:56,https://www.tiktokv.com/share/video/7255321132...,7255321132495047979,1,1,3,3,114.0,331.0,2900000.0,...,0,1,1,1,1,2.0,33.0,0.060606,2.0,1.00
4,2023-09-19 12:35:50,https://www.tiktokv.com/share/video/7271266223...,7271266223709572354,1,1,6,4,13.0,16.0,378700.0,...,0,1,1,1,1,2.0,33.0,0.060606,2.0,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5130,2023-09-22 16:48:35,https://www.tiktokv.com/share/video/7267772520...,7267772520500186410,1,1,20,18,13.0,12.0,2500000.0,...,1,1,1,1,1,1.0,7.0,0.142857,2.0,0.50
5131,2023-09-22 16:48:48,https://www.tiktokv.com/share/video/7275683226...,7275683226792807682,1,1,21,19,10.0,8.0,151400.0,...,1,1,1,1,1,1.0,9.0,0.111111,4.0,0.25
5132,2023-09-22 16:48:58,https://www.tiktokv.com/share/video/7247911468...,7247911468098751770,1,1,22,20,27.0,25.0,1300000.0,...,1,1,1,1,1,1.0,7.0,0.142857,2.0,0.50
5133,2023-09-22 16:49:25,https://www.tiktokv.com/share/video/7249404952...,7249404952727031066,1,1,23,21,10.0,7.0,441200.0,...,1,1,1,1,1,0.0,8.0,0.000000,2.0,0.00


In [3]:
# One-hot encode the SEQUENTIAL_ID column
one_hot_user_df = pd.get_dummies(df_all['SEQUENTIAL_ID'], prefix='User ID')

df_all_rf = df_all.drop(columns=['Date', 'Link','video_id','in_playlist','in_playlist_final','video_id_playlist', 'watching_time',
                                 'percentage_watched_float','percentage_watched', 'prolific_group','SEQUENTIAL_ID',
                                 'tt_account','like','comment', 'common_interests','all_interests', 'num_topics_video', 
                                 'jacc_original_interests', 'video_saves', 'watched_6s+'])

df_all_rf.rename(columns={"jacc_interests": "Interest Similarity", 
                         'video_id_playlist_final': "Video ID playlist",
                         'video_duration': "Video duration",
                         'video_likes': "Video num. likes",
                         'video_shares': "Video num. shares",
                         'video_comments': "Video num. comments",
                         'video_plays': "Video num. plays"}, inplace=True)

df_all_rf = df_all_rf.merge(one_hot_user_df, left_index=True, right_index=True) # adding one hot encoding user ID"

In [4]:
columns_classify = [c for c in df_all_rf.columns.tolist() if c != 'watched_until_end']

In [5]:
X2 = df_all_rf[columns_classify]
y2 = df_all_rf["watched_until_end"]

In [6]:
# Split the data into training and test sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [7]:
# --- Standardize the Data ---
sc = StandardScaler()
X2_train_scaled = sc.fit_transform(X2_train)
X2_test_scaled = sc.transform(X2_test)

# --- Re-wrap scaled arrays as DataFrames to preserve column names ---
X2_train_scaled_df = pd.DataFrame(X2_train_scaled, columns=columns_classify)
X2_test_scaled_df = pd.DataFrame(X2_test_scaled, columns=columns_classify)

print("Train size:", X2_train_scaled_df.shape)
print("Test size:", X2_test_scaled_df.shape)

Train size: (4108, 166)
Test size: (1027, 166)


In [8]:
names = ["Logistic Regression", 
         "K Nearest Neighbors",
         "SVM",
         "Decision Tree",
         "Random Forest",
         "MLP"]

classifiers = [
    LogisticRegression(random_state=0),
    KNeighborsClassifier(),
    SVC(random_state=4),
    DecisionTreeClassifier(random_state=4),
    RandomForestClassifier(random_state=4),
    MLPClassifier(random_state=4, max_iter=500)
]

params_lr={"penalty":["l2"],
            "solver": ["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
            "C":np.arange(0.025, 1, 0.25),
            "class_weight": ["balanced"]}

params_knn={"n_neighbors":range(3,30)}

params_svm={"kernel":["linear", "poly", "rbf", "sigmoid"], 
            "C":np.arange(0.025, 1, 0.25), 
            "gamma": ["auto", "scale"], 
            "degree":range(1,6,1), 
            "class_weight": ["balanced"]}

params_dt={"max_depth":range(1,50), 
           "min_samples_leaf":range(1,20), 
           "class_weight": ["balanced"]}
           #"criterion":["gini", "entropy"]}

params_rf={"max_depth":range(1,100), 
           "min_samples_leaf":range(2,20),
           "n_estimators":range(10,100,10),
           "class_weight": ["balanced"]}
           #"criterion": ["gini", "entropy", "log_loss"]}

params_mlp={"hidden_layer_sizes":[(i - 2,) for i in range(8,len(X2.columns.tolist()))],
            "learning_rate":["constant"], 
            "alpha":np.arange(0.0001, 0.001, 0.0001)}
            #"activation":["logistic", "tanh", "relu"],
            #"solver": ["lbfgs", "sgd", "adam"]}

parameters_list=[params_lr, params_knn, params_svm, params_dt, params_rf, params_mlp]

In [9]:
i=0
# iterate over classifiers
for name, classifier in zip(names, classifiers):
    rand_search = make_pipeline(StandardScaler(), 
                                RandomizedSearchCV(classifier, 
                                     param_distributions = parameters_list[i], 
                                     n_iter=10, 
                                     cv=5, 
                                     random_state=42,
                                     refit=True)) #Refit an estimator using the best found parameters on the whole dataset.
    i += 1
    
    rand_search.fit(X2_train_scaled_df, y2_train)
    # Generate predictions with the best model
    y2_pred = rand_search.predict(X2_test_scaled_df )
    
    accuracy = accuracy_score(y2_test, y2_pred)
    precision = precision_score(y2_test, y2_pred, average="macro")
    recall = recall_score(y2_test, y2_pred, average="macro")
    F1 = f1_score(y2_test, y2_pred, average="macro")
    
    print(name + "&" + str(round(F1,2)) + "&" + 
          str(round(accuracy,2)) + "&" + 
          str(round(precision,2)) + "&" + 
          str(round(recall,2)))
          

Logistic Regression&0.72&0.74&0.72&0.75
K Nearest Neighbors&0.68&0.74&0.71&0.67
SVM&0.72&0.73&0.71&0.74
Decision Tree&0.7&0.72&0.7&0.72
Random Forest&0.74&0.78&0.75&0.74




MLP&0.72&0.76&0.72&0.72
