# CS4242 Lab 3: Viral Item Prediction in Social Networks

### Baseline model

In [1]:
import os
import numpy as np
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.preprocessing import scale

In [30]:

def load_social_features(video_id, video_user, user_details):
    """Build one social-feature row per video from its author's user record.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records whose first field is the user id.

    Returns:
        A float32 array with one row per line of ``video_id`` (in file order)
        and two columns. Videos whose author is unknown, or whose author has
        no record in ``user_details``, get an all-zero row.
    """
    # Only the first two numeric columns of each user record are used here
    # (loops and followers, per the assignment notes). More user information
    # can be added; see ./data/README.txt -> 4.user_details.txt.
    n_features = 2

    # video id list, in file order
    with open(video_id) as f:
        vid = [line.strip() for line in f]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as f:
        for line in f:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    # uid -> social feature mapping
    social_features = {}
    with open(user_details, encoding='utf-8') as f:
        for line in f:
            data = line.strip().split("::::")
            social_features[data[0]] = [float(i) for i in data[1:1 + n_features]]

    # Some users have no social features; they get a zero vector.
    zero_vector = [0.0] * n_features
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Only a missing video or user is expected here; any other error
            # should surface instead of being silently turned into zeros.
            res.append(zero_vector)

    return np.array(res, dtype=np.float32)


def main():
    """Run the baseline experiment and print per-fold and average nMSE.

    Loads the precomputed feature archives and the social features from
    ``./data/``, reduces three of the feature sets with PCA, concatenates
    everything and evaluates a default ``SVR`` with 10-fold cross-validation
    against the first column of ``ground_truth.txt``.
    """
    data_dir = './data/'

    def load_npz(file_name):
        # np.load returns an archive object for .npz files; reading inside a
        # with-block closes its file handle once the array has been read.
        with np.load(data_dir + file_name) as archive:
            return archive['arr_0']

    # load data
    print("Loading data...")
    hist_feature = load_npz('histogram_feature.npz')
    imgNet_feature = load_npz('imageNet_feature.npz')
    vSenti_feature = load_npz('visual_senti_feature.npz')
    sen2vec_feature = load_npz('text_sentence2vec_feature.npz')
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
    )

    # Feature dimension reduction: the reduced sizes are a free choice; the
    # main purpose is to reduce the computational cost.
    # NOTE(review): each PCA is fitted on the full data set, before the
    # cross-validation split below.
    imgNet_feature = PCA(n_components=20).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=40).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=10).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction)
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth. Each line holds four popularity indexes (loops,
    # likes, reposts, comments); only the first one (loops) is used here.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as f:
        ground_truth = np.array(
            [float(line.strip().split('::::')[0]) for line in f],
            dtype=np.float32,
        )

    print("Start training and predict...")
    kf = KFold(n_splits=10)
    nMSEs = []
    for train, test in kf.split(concat_feature):
        # Default SVR; its parameters (or the regressor itself) can be tuned.
        model = SVR()
        model.fit(concat_feature[train], ground_truth[train])
        predicts = model.predict(concat_feature[test])
        # nMSE (normalized mean squared error): MSE divided by the mean
        # squared ground-truth value of the test fold.
        nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
        nMSEs.append(nMSE)

        print("This round of nMSE is: %f" %(nMSE))

    print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Entry point: run the baseline experiment only when executed directly.
if __name__ == "__main__":
    main()


Loading data...
The input data dimension is: (10000, 122)
Start training and predict...
This round of nMSE is: 0.997951
This round of nMSE is: 0.997615
This round of nMSE is: 0.996097
This round of nMSE is: 0.999905
This round of nMSE is: 0.999956
This round of nMSE is: 0.999047
This round of nMSE is: 0.997943
This round of nMSE is: 0.999909
This round of nMSE is: 0.997744
This round of nMSE is: 0.999820
Average nMSE is 0.998599.


## Add more social features and video info

In [16]:

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one 8-column social-feature row per video from its author's record.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records whose first field is the user id.
        user_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is attached to the user on line ``i`` of
            ``user_details``.
        video_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is likewise attached to the user on line ``i``
            of ``user_details``.

    Returns:
        A float32 array with one row per line of ``video_id`` (in file order)
        and eight columns: fields 1-6 of the user record followed by the two
        scores. Videos whose author is unknown, or whose author has no record
        in ``user_details``, get an all-zero row.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``.
    """
    # Fields 1-6 of each user record are used (per the original notes: loops,
    # followers, following count, like count, post count and the twitter
    # verified flag -- column order not verified here).
    n_user_columns = 6
    n_features = n_user_columns + 2  # plus the two description scores

    # video id list, in file order
    with open(video_id) as f:
        vid = [line.strip() for line in f]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as f:
        for line in f:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # uid -> social feature mapping
    # NOTE(review): the video description score is looked up by the user's row
    # number, not by video -- confirm that video_des_score is aligned with
    # user_details; otherwise the score is attached to the wrong record.
    social_features = {}
    for i, d in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        features = [float(n) for n in d[1:1 + n_user_columns]]
        features.append(s1)
        features.append(s2)
        social_features[d[0]] = features

    # Some users have no social features; they get a zero vector.
    zero_vector = [0.0] * n_features
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Only a missing video or user is expected here; any other error
            # should surface instead of being silently turned into zeros.
            res.append(zero_vector)

    return np.array(res, dtype=np.float32)


def main():
    """Evaluate a tuned RBF ``SVR`` on the extended feature set.

    Loads the precomputed feature archives and the 8-column social features
    from ``./data/``, reduces every archive-based feature set with PCA,
    standardises the concatenated matrix and prints per-fold and average nMSE
    from 10-fold cross-validation against the first column of
    ``ground_truth.txt``.
    """
    data_dir = './data/'

    def load_npz(file_name):
        # np.load returns an archive object for .npz files; reading inside a
        # with-block closes its file handle once the array has been read.
        with np.load(data_dir + file_name) as archive:
            return archive['arr_0']

    # load data
    print("Loading data...")
    hist_feature = load_npz('histogram_feature.npz')
    imgNet_feature = load_npz('imageNet_feature.npz')
    vSenti_feature = load_npz('visual_senti_feature.npz')
    sen2vec_feature = load_npz('text_sentence2vec_feature.npz')
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction: sizes were chosen to shrink the input
    # without increasing nMSE.
    # NOTE(review): PCA and scale() are fitted on the full data set, before
    # the cross-validation split below.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction), then standardise
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    concat_feature = scale(concat_feature)
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth. Each line holds four popularity indexes (loops,
    # likes, reposts, comments); only the first one (loops) is used here.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as f:
        ground_truth = np.array(
            [float(line.strip().split('::::')[0]) for line in f],
            dtype=np.float32,
        )

    print("Start training and predict...")
    kf = KFold(n_splits=10)
    nMSEs = []
    for train, test in kf.split(concat_feature):
        # RBF SVR with fixed, previously tuned hyper-parameters.
        model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
        model.fit(concat_feature[train], ground_truth[train])
        predicts = model.predict(concat_feature[test])
        # nMSE (normalized mean squared error): MSE divided by the mean
        # squared ground-truth value of the test fold.
        nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
        nMSEs.append(nMSE)

        print("This round of nMSE is: %f" %(nMSE))

    print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Entry point: run the tuned-SVR experiment only when executed directly.
if __name__ == "__main__":
    main()


Loading data...




The input data dimension is: (10000, 29)
Start training and predict...
This round of nMSE is: 0.801535
This round of nMSE is: 0.937035
This round of nMSE is: 0.695990
This round of nMSE is: 0.952877
This round of nMSE is: 0.997828
This round of nMSE is: 0.986332
This round of nMSE is: 0.921651
This round of nMSE is: 0.998715
This round of nMSE is: 0.790609
This round of nMSE is: 0.972928
Average nMSE is 0.905550.


## Tune SVR

In [15]:
from sklearn.model_selection import GridSearchCV

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one 8-column social-feature row per video from its author's record.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records whose first field is the user id.
        user_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is attached to the user on line ``i`` of
            ``user_details``.
        video_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is likewise attached to the user on line ``i``
            of ``user_details``.

    Returns:
        A float32 array with one row per line of ``video_id`` (in file order)
        and eight columns: fields 1-6 of the user record followed by the two
        scores. Videos whose author is unknown, or whose author has no record
        in ``user_details``, get an all-zero row.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``.
    """
    # Fields 1-6 of each user record are used (per the original notes: loops,
    # followers, following count, like count, post count and the twitter
    # verified flag -- column order not verified here).
    n_user_columns = 6
    n_features = n_user_columns + 2  # plus the two description scores

    # video id list, in file order
    with open(video_id) as f:
        vid = [line.strip() for line in f]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as f:
        for line in f:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # uid -> social feature mapping
    # NOTE(review): the video description score is looked up by the user's row
    # number, not by video -- confirm that video_des_score is aligned with
    # user_details; otherwise the score is attached to the wrong record.
    social_features = {}
    for i, d in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        features = [float(n) for n in d[1:1 + n_user_columns]]
        features.append(s1)
        features.append(s2)
        social_features[d[0]] = features

    # Some users have no social features; they get a zero vector.
    zero_vector = [0.0] * n_features
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Only a missing video or user is expected here; any other error
            # should surface instead of being silently turned into zeros.
            res.append(zero_vector)

    return np.array(res, dtype=np.float32)


def main():
    """Grid-search RBF ``SVR`` hyper-parameters with 10-fold cross-validation.

    Loads the precomputed feature archives and the 8-column social features
    from ``./data/``, reduces every archive-based feature set with PCA,
    standardises the concatenated matrix, then searches over ``gamma``, ``C``
    and ``epsilon`` and prints the best parameter set and its score.
    """
    data_dir = './data/'

    def load_npz(file_name):
        # np.load returns an archive object for .npz files; reading inside a
        # with-block closes its file handle once the array has been read.
        with np.load(data_dir + file_name) as archive:
            return archive['arr_0']

    # load data
    print("Loading data...")
    hist_feature = load_npz('histogram_feature.npz')
    imgNet_feature = load_npz('imageNet_feature.npz')
    vSenti_feature = load_npz('visual_senti_feature.npz')
    sen2vec_feature = load_npz('text_sentence2vec_feature.npz')
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction: sizes were chosen to shrink the input
    # without increasing nMSE.
    # NOTE(review): PCA and scale() are fitted on the full data set, before
    # the cross-validation split below.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction), then standardise
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    concat_feature = scale(concat_feature)
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth. Each line holds four popularity indexes (loops,
    # likes, reposts, comments); only the first one (loops) is used here.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as f:
        ground_truth = np.array(
            [float(line.strip().split('::::')[0]) for line in f],
            dtype=np.float32,
        )

    print("Start training and predict...")
    # 3 gammas x 4 Cs x 3 epsilons = 36 candidates, each fitted on 10 folds.
    tuned_parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-4,1e-5,1e-6],
        'C': [100000,500000,1000000,5000000],
        'epsilon': [1e-2,1e-3,1e-4],
    }]
    kf = KFold(n_splits=10)
    GSCV = GridSearchCV(SVR(), tuned_parameters, cv=kf, scoring='neg_mean_squared_error',verbose=2)
    GSCV.fit(concat_feature, ground_truth)

    # The score below is the negated mean squared error requested through
    # `scoring`, not the nMSE printed by the other experiments.
    print("Best parameters set found on development set:",GSCV.best_params_)
    print("Best score:",GSCV.best_score_)


# Entry point: run the SVR grid search only when executed directly.
if __name__ == "__main__":
    main()


Loading data...




The input data dimension is: (10000, 29)
Start training and predict...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  10.9s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.7s remaining:    0.0s


[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  10.9s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  11.1s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  12.4s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  11.1s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  11.0s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  12.2s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] . C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  11.0s
[CV] C=100000, epsilon=0.01, gamma=0.0001, kernel=rbf ................
[CV] .

[CV] . C=100000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=36.0min
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  10.1s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  12.0s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  10.6s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  11.2s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  12.8s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............
[CV]  C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf, total=  13.2s
[CV] C=100000, epsilon=0.0001, gamma=0.0001, kernel=rbf ..............


[CV] .. C=500000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   9.1s
[CV] C=500000, epsilon=0.01, gamma=1e-06, kernel=rbf .................
[CV] .. C=500000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   9.1s
[CV] C=500000, epsilon=0.01, gamma=1e-06, kernel=rbf .................
[CV] .. C=500000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   9.1s
[CV] C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf ...............
[CV]  C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf, total=  20.8s
[CV] C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf ...............
[CV]  C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf, total=  19.1s
[CV] C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf ...............
[CV]  C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf, total=  19.1s
[CV] C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf ...............
[CV]  C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf, total=  20.5s
[CV] C=500000, epsilon=0.001, gamma=0.0001, kernel=rbf ...............
[CV]  

[CV]  C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.6s
[CV] C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf ...............
[CV]  C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.5s
[CV] C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf ...............
[CV]  C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.6s
[CV] C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf ...............
[CV]  C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.5s
[CV] C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf ...............
[CV]  C=500000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.8s
[CV] C=1000000, epsilon=0.01, gamma=0.0001, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  24.9s
[CV] C=1000000, epsilon=0.01, gamma=0.0001, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.01, gamma=0.0001, kernel=rbf, total=  25.3s
[CV] C=1000000, epsilon=0.01, gamma=0.0001, kernel=rbf ...............
[CV]  

[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   7.6s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   7.9s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   7.6s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   8.0s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   8.1s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   8.0s
[CV] C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf ...............
[CV]  C=1000000, epsilon=0.001, gamma=1e-06, kernel=rbf, total=   8.0s
[CV] C=1000000, epsilon=0.0001, gamma=0.0001, kernel=rbf .............
[CV]  

[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.6s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.7s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.6s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   9.2s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.5s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.5s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] . C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf, total=   8.7s
[CV] C=5000000, epsilon=0.01, gamma=1e-06, kernel=rbf ................
[CV] .

[CV]  C=5000000, epsilon=0.0001, gamma=1e-05, kernel=rbf, total=  14.4s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.2s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.4s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.5s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.3s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.4s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............
[CV]  C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf, total=   7.3s
[CV] C=5000000, epsilon=0.0001, gamma=1e-06, kernel=rbf ..............

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 236.3min finished


Best parameters set found on development set: {'C': 5000000, 'epsilon': 0.01, 'gamma': 1e-05, 'kernel': 'rbf'}
Best score: -17586267093.943794


## Other regressors

**Ridge regression**

In [49]:
from sklearn import linear_model

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one 8-column social-feature row per video from its author's record.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records whose first field is the user id.
        user_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is attached to the user on line ``i`` of
            ``user_details``.
        video_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is likewise attached to the user on line ``i``
            of ``user_details``.

    Returns:
        A float32 array with one row per line of ``video_id`` (in file order)
        and eight columns: fields 1-6 of the user record followed by the two
        scores. Videos whose author is unknown, or whose author has no record
        in ``user_details``, get an all-zero row.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``.
    """
    # Fields 1-6 of each user record are used (per the original notes: loops,
    # followers, following count, like count, post count and the twitter
    # verified flag -- column order not verified here).
    n_user_columns = 6
    n_features = n_user_columns + 2  # plus the two description scores

    # video id list, in file order
    with open(video_id) as f:
        vid = [line.strip() for line in f]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as f:
        for line in f:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # uid -> social feature mapping
    # NOTE(review): the video description score is looked up by the user's row
    # number, not by video -- confirm that video_des_score is aligned with
    # user_details; otherwise the score is attached to the wrong record.
    social_features = {}
    for i, d in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        features = [float(n) for n in d[1:1 + n_user_columns]]
        features.append(s1)
        features.append(s2)
        social_features[d[0]] = features

    # Some users have no social features; they get a zero vector.
    zero_vector = [0.0] * n_features
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Only a missing video or user is expected here; any other error
            # should surface instead of being silently turned into zeros.
            res.append(zero_vector)

    return np.array(res, dtype=np.float32)


def main():
    """Sweep the Ridge regularisation strength and print the average nMSE per value.

    Loads the precomputed feature archives and the 8-column social features
    from ``./data/``, reduces every archive-based feature set with PCA,
    standardises the concatenated matrix, then for each ``alpha`` from 6.0 to
    7.9 (step 0.1) runs 10-fold cross-validation against the first column of
    ``ground_truth.txt``.
    """
    data_dir = './data/'

    def load_npz(file_name):
        # np.load returns an archive object for .npz files; reading inside a
        # with-block closes its file handle once the array has been read.
        with np.load(data_dir + file_name) as archive:
            return archive['arr_0']

    # load data
    print("Loading data...")
    hist_feature = load_npz('histogram_feature.npz')
    imgNet_feature = load_npz('imageNet_feature.npz')
    vSenti_feature = load_npz('visual_senti_feature.npz')
    sen2vec_feature = load_npz('text_sentence2vec_feature.npz')
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction: sizes were chosen to shrink the input
    # without increasing nMSE.
    # NOTE(review): PCA and scale() are fitted on the full data set, before
    # the cross-validation split below.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction), then standardise
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    concat_feature = scale(concat_feature)
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth. Each line holds four popularity indexes (loops,
    # likes, reposts, comments); only the first one (loops) is used here.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as f:
        ground_truth = np.array(
            [float(line.strip().split('::::')[0]) for line in f],
            dtype=np.float32,
        )

    print("Start training and predict...")
    # The unshuffled KFold splitter is deterministic, so one instance serves
    # every alpha.
    kf = KFold(n_splits=10)
    for alpha in range(60,80):
        nMSEs = []
        for train, test in kf.split(concat_feature):
            # Ridge regression with alpha = 6.0, 6.1, ..., 7.9.
            # NOTE(review): the `normalize` argument was deprecated in
            # scikit-learn 1.0 and removed in 1.2; it is kept here because
            # dropping it changes the effective regularisation strength.
            model = linear_model.Ridge(alpha=alpha/10, normalize=True)
            model.fit(concat_feature[train], ground_truth[train])
            predicts = model.predict(concat_feature[test])
            # nMSE (normalized mean squared error): MSE divided by the mean
            # squared ground-truth value of the test fold.
            nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
            nMSEs.append(nMSE)

        print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Entry point: run the Ridge alpha sweep only when executed directly.
if __name__ == "__main__":
    main()


Loading data...
The input data dimension is: (10000, 28)
Start training and predict...
Average nMSE is 0.917273.
Average nMSE is 0.916952.
Average nMSE is 0.916675.
Average nMSE is 0.916437.
Average nMSE is 0.916237.
Average nMSE is 0.916071.
Average nMSE is 0.915938.
Average nMSE is 0.915836.
Average nMSE is 0.915762.
Average nMSE is 0.915715.
Average nMSE is 0.915693.
Average nMSE is 0.915695.
Average nMSE is 0.915719.
Average nMSE is 0.915763.
Average nMSE is 0.915827.
Average nMSE is 0.915909.
Average nMSE is 0.916007.
Average nMSE is 0.916122.
Average nMSE is 0.916251.
Average nMSE is 0.916395.


**LASSO regression**

In [27]:
from sklearn import linear_model

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one 8-column social-feature row per video from its author's record.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records whose first field is the user id.
        user_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is attached to the user on line ``i`` of
            ``user_details``.
        video_des_score: Path to a UTF-8 text file with one numeric score per
            line; line ``i`` is likewise attached to the user on line ``i``
            of ``user_details``.

    Returns:
        A float32 array with one row per line of ``video_id`` (in file order)
        and eight columns: fields 1-6 of the user record followed by the two
        scores. Videos whose author is unknown, or whose author has no record
        in ``user_details``, get an all-zero row.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``.
    """
    # Fields 1-6 of each user record are used (per the original notes: loops,
    # followers, following count, like count, post count and the twitter
    # verified flag -- column order not verified here).
    n_user_columns = 6
    n_features = n_user_columns + 2  # plus the two description scores

    # video id list, in file order
    with open(video_id) as f:
        vid = [line.strip() for line in f]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as f:
        for line in f:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # uid -> social feature mapping
    # NOTE(review): the video description score is looked up by the user's row
    # number, not by video -- confirm that video_des_score is aligned with
    # user_details; otherwise the score is attached to the wrong record.
    social_features = {}
    for i, d in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        features = [float(n) for n in d[1:1 + n_user_columns]]
        features.append(s1)
        features.append(s2)
        social_features[d[0]] = features

    # Some users have no social features; they get a zero vector.
    zero_vector = [0.0] * n_features
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Only a missing video or user is expected here; any other error
            # should surface instead of being silently turned into zeros.
            res.append(zero_vector)

    return np.array(res, dtype=np.float32)


def main():
    """Sweep the Lasso regularisation strength and print the average nMSE per value.

    Loads the precomputed feature archives and the 8-column social features
    from ``./data/``, reduces every archive-based feature set with PCA,
    standardises the concatenated matrix, then for each integer ``alpha`` from
    360 to 399 runs 10-fold cross-validation against the first column of
    ``ground_truth.txt``.
    """
    data_dir = './data/'

    def load_npz(file_name):
        # np.load returns an archive object for .npz files; reading inside a
        # with-block closes its file handle once the array has been read.
        with np.load(data_dir + file_name) as archive:
            return archive['arr_0']

    # load data
    print("Loading data...")
    hist_feature = load_npz('histogram_feature.npz')
    imgNet_feature = load_npz('imageNet_feature.npz')
    vSenti_feature = load_npz('visual_senti_feature.npz')
    sen2vec_feature = load_npz('text_sentence2vec_feature.npz')
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction: sizes were chosen to shrink the input
    # without increasing nMSE.
    # NOTE(review): PCA and scale() are fitted on the full data set, before
    # the cross-validation split below.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction), then standardise
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    concat_feature = scale(concat_feature)
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth. Each line holds four popularity indexes (loops,
    # likes, reposts, comments); only the first one (loops) is used here.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as f:
        ground_truth = np.array(
            [float(line.strip().split('::::')[0]) for line in f],
            dtype=np.float32,
        )

    print("Start training and predict...")
    # The unshuffled KFold splitter is deterministic, so one instance serves
    # every alpha.
    kf = KFold(n_splits=10)
    for alpha in range(360,400):
        nMSEs = []
        for train, test in kf.split(concat_feature):
            # Lasso regression with alpha = 360, 361, ..., 399.
            # NOTE(review): the `normalize` argument was deprecated in
            # scikit-learn 1.0 and removed in 1.2; it is kept here because
            # dropping it changes the effective regularisation strength.
            model = linear_model.Lasso(alpha=alpha, normalize=True)
            model.fit(concat_feature[train], ground_truth[train])
            predicts = model.predict(concat_feature[test])
            # nMSE (normalized mean squared error): MSE divided by the mean
            # squared ground-truth value of the test fold.
            nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
            nMSEs.append(nMSE)

        print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Entry point: run the Lasso alpha sweep only when executed directly.
if __name__ == "__main__":
    main()


Loading data...




The input data dimension is: (10000, 29)
Start training and predict...
Average nMSE is 0.952235.
Average nMSE is 0.951933.
Average nMSE is 0.951646.
Average nMSE is 0.951374.
Average nMSE is 0.951117.
Average nMSE is 0.950875.
Average nMSE is 0.950648.
Average nMSE is 0.950436.
Average nMSE is 0.950239.
Average nMSE is 0.950057.
Average nMSE is 0.949890.
Average nMSE is 0.949738.
Average nMSE is 0.949601.
Average nMSE is 0.949479.
Average nMSE is 0.949372.
Average nMSE is 0.949280.
Average nMSE is 0.949203.
Average nMSE is 0.949142.
Average nMSE is 0.949095.
Average nMSE is 0.949063.
Average nMSE is 0.949046.
Average nMSE is 0.949044.
Average nMSE is 0.949057.
Average nMSE is 0.949085.
Average nMSE is 0.949128.
Average nMSE is 0.949186.
Average nMSE is 0.949259.
Average nMSE is 0.949347.
Average nMSE is 0.949451.
Average nMSE is 0.949569.
Average nMSE is 0.949702.
Average nMSE is 0.949850.
Average nMSE is 0.950013.
Average nMSE is 0.950191.
Average nMSE is 0.950384.
Average nMSE is 0.9

**Random forest**

In [33]:
from sklearn import ensemble
def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one social-feature row per video, in the order of ``video_id``.

    Every row holds 8 values: fields 1-6 of the uploader's ``user_details``
    record, followed by one score from ``user_des_score`` and one score from
    ``video_des_score``. Videos whose uploader cannot be resolved (video id
    missing from ``video_user`` or user id missing from ``user_details``)
    get an all-zero row.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records; field 0 is the user id, fields 1-6 must parse as floats.
        user_des_score: Path to a UTF-8 text file with one float per line;
            line ``i`` is paired with line ``i`` of ``user_details``.
        video_des_score: Path to a UTF-8 text file with one float per line;
            it is likewise paired with ``user_details`` by line index.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per video id.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``, or
            a ``video_user`` line does not contain the ``::::`` separator.
        ValueError: If a numeric field or a score cannot be parsed as float.
    """
    # Context managers close every handle promptly instead of relying on the
    # garbage collector.
    with open(video_id) as id_file:
        vid = [line.strip() for line in id_file]  # video id list

    vid_uid_dict = {}  # vid -> uid mapping
    with open(video_user) as mapping_file:
        for line in mapping_file:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # NOTE(review): video_des_score is indexed by the *user* row number here;
    # confirm the file really is aligned with user_details and not with
    # video_id.
    social_features = {}  # uid -> social feature vector
    for i, row in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        # Six numeric columns from user_details plus the two scores.
        social_features[row[0]] = [float(n) for n in row[1:7]] + [s1, s2]

    zero_row = [0.0] * 8  # same width as a regular feature row (6 + 2)
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Some videos/users have no social features; use a zero vector.
            # Only the expected lookup failure is caught, so unrelated errors
            # (or Ctrl-C) are no longer silently swallowed.
            res.append(zero_row)

    return np.array(res, dtype=np.float32)


def main():
    """Evaluate a random-forest regressor with 10-fold cross-validation.

    Loads the four precomputed feature matrices and the social features from
    ``./data/``, reduces each precomputed matrix with PCA, concatenates all
    features column-wise and fits a ``RandomForestRegressor`` on every fold.
    The per-fold and the average nMSE are printed; nothing is returned.
    """
    data_dir = './data/'

    # load data
    print("Loading data...")
    hist_feature = np.load(data_dir + 'histogram_feature.npz')['arr_0']
    imgNet_feature = np.load(data_dir + 'imageNet_feature.npz')['arr_0']
    vSenti_feature = np.load(data_dir + 'visual_senti_feature.npz')['arr_0']
    sen2vec_feature = np.load(data_dir + 'text_sentence2vec_feature.npz')['arr_0']
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction (component counts chosen by the author so
    # that nMSE did not increase).
    # NOTE(review): each PCA is fitted on the full data set before the CV
    # split, so the held-out folds influence the projection.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction)
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth: only the first '::::' field of every line is used
    # (per the lab notes this is the number of loops). The file is opened in
    # a context manager so the handle is closed as soon as it has been read.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
        ground_truth = [float(line.strip().split('::::')[0]) for line in gt_file]
    ground_truth = np.array(ground_truth, dtype=np.float32)

    print("Start training and predict...")
    kf = KFold(n_splits=10)
    nMSEs = []
    for train, test in kf.split(concat_feature):
        # A fresh model per fold; random_state is fixed for reproducibility.
        model = ensemble.RandomForestRegressor(n_estimators=500,max_features=10,random_state=1)
        # train
        model.fit(concat_feature[train], ground_truth[train])
        # predict
        predicts = model.predict(concat_feature[test])
        # nMSE (normalized Mean Squared Error): MSE divided by the mean of
        # the squared ground-truth values of the test fold.
        nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
        nMSEs.append(nMSE)
        print("This round of nMSE is: %f" %(nMSE))
    print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Run the random-forest experiment only when this code is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading data...
The input data dimension is: (10000, 29)
Start training and predict...
This round of nMSE is: 1.531933
This round of nMSE is: 1.775225
This round of nMSE is: 2.027293
This round of nMSE is: 0.678454
This round of nMSE is: 0.990258
This round of nMSE is: 2.110597
This round of nMSE is: 2.133469
This round of nMSE is: 1.413684
This round of nMSE is: 0.612467
This round of nMSE is: 0.611210
Average nMSE is 1.388459.


**Gradient boosting**

In [31]:
from sklearn import ensemble

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one social-feature row per video, in the order of ``video_id``.

    Every row holds 8 values: fields 1-6 of the uploader's ``user_details``
    record, followed by one score from ``user_des_score`` and one score from
    ``video_des_score``. Videos whose uploader cannot be resolved (video id
    missing from ``video_user`` or user id missing from ``user_details``)
    get an all-zero row.

    Args:
        video_id: Path to a text file with one video id per line.
        video_user: Path to a text file of ``video_id::::user_id`` lines.
        user_details: Path to a UTF-8 text file of ``::::``-separated user
            records; field 0 is the user id, fields 1-6 must parse as floats.
        user_des_score: Path to a UTF-8 text file with one float per line;
            line ``i`` is paired with line ``i`` of ``user_details``.
        video_des_score: Path to a UTF-8 text file with one float per line;
            it is likewise paired with ``user_details`` by line index.

    Returns:
        ``np.ndarray`` of dtype ``float32`` with one row per video id.

    Raises:
        IndexError: If a score file has fewer lines than ``user_details``, or
            a ``video_user`` line does not contain the ``::::`` separator.
        ValueError: If a numeric field or a score cannot be parsed as float.
    """
    # Context managers close every handle promptly instead of relying on the
    # garbage collector.
    with open(video_id) as id_file:
        vid = [line.strip() for line in id_file]  # video id list

    vid_uid_dict = {}  # vid -> uid mapping
    with open(video_user) as mapping_file:
        for line in mapping_file:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]

    # NOTE(review): video_des_score is indexed by the *user* row number here;
    # confirm the file really is aligned with user_details and not with
    # video_id.
    social_features = {}  # uid -> social feature vector
    for i, row in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        # Six numeric columns from user_details plus the two scores.
        social_features[row[0]] = [float(n) for n in row[1:7]] + [s1, s2]

    zero_row = [0.0] * 8  # same width as a regular feature row (6 + 2)
    res = []  # social feature vector for each video
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Some videos/users have no social features; use a zero vector.
            # Only the expected lookup failure is caught, so unrelated errors
            # (or Ctrl-C) are no longer silently swallowed.
            res.append(zero_row)

    return np.array(res, dtype=np.float32)


def main():
    """Grid-search gradient-boosting hyper-parameters with 10-fold CV.

    Loads the four precomputed feature matrices and the social features from
    ``./data/``, reduces each precomputed matrix with PCA, concatenates all
    features column-wise and runs ``GridSearchCV`` over learning rate, tree
    depth and number of estimators. The best parameter set and its score are
    printed; nothing is returned.
    """
    data_dir = './data/'

    # load data
    print("Loading data...")
    hist_feature = np.load(data_dir + 'histogram_feature.npz')['arr_0']
    imgNet_feature = np.load(data_dir + 'imageNet_feature.npz')['arr_0']
    vSenti_feature = np.load(data_dir + 'visual_senti_feature.npz')['arr_0']
    sen2vec_feature = np.load(data_dir + 'text_sentence2vec_feature.npz')['arr_0']
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction (component counts chosen by the author so
    # that nMSE did not increase).
    # NOTE(review): each PCA is fitted on the full data set before the CV
    # split, so the held-out folds influence the projection.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction)
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth: only the first '::::' field of every line is used
    # (per the lab notes this is the number of loops). The file is opened in
    # a context manager so the handle is closed as soon as it has been read.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
        ground_truth = [float(line.strip().split('::::')[0]) for line in gt_file]
    ground_truth = np.array(ground_truth, dtype=np.float32)

    print("Start training and predict...")
    # 3 learning rates x 4 depths x 3 estimator counts = 36 candidates,
    # each evaluated on 10 folds (360 fits in total).
    tuned_parameters = [{'learning_rate': [0.01,0.05,0.1], 'max_depth': [2,3,4,5],'n_estimators': [100,250,500]}]
    kf = KFold(n_splits=10)
    # NOTE(review): GridSearchCV is not imported in this cell; presumably it
    # is imported earlier in the notebook - confirm.
    # The search optimizes negative MSE, which is not the nMSE metric the
    # other cells report.
    grid_search = GridSearchCV(ensemble.GradientBoostingRegressor(), tuned_parameters, cv=kf, scoring='neg_mean_squared_error',verbose=2)
    grid_search.fit(concat_feature, ground_truth)

    print("Best parameters set found on development set:",grid_search.best_params_)
    # best_score_ is the mean cross-validated negative MSE of the best
    # candidate, hence a (large) negative number.
    print("Grid scores on development set:",grid_search.best_score_)

# Run the grid search only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading data...
The input data dimension is: (10000, 29)
Start training and predict...
Fitting 10 folds for each of 36 candidates, totalling 360 fits
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.1s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.1s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=2, n_estimators=100, total=   1.0s
[CV] learning_rate=0.01, max_depth=2, n_estimators=100 ...............
[CV]  

[CV]  learning_rate=0.01, max_depth=3, n_estimators=500, total=   8.4s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.2s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.2s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.2s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.4s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.7s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.01, max_depth=4, n_estimators=100, total=   2.7s
[CV] learning_rate=0.01, max_depth=4, n_estimators=100 ...............
[CV]  

[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, total=  15.4s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, total=  17.9s
[CV] learning_rate=0.01, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.01, max_depth=5, n_estimators=500, total=  17.8s
[CV] learning_rate=0.05, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=2, n_estimators=100, total=   1.3s
[CV] learning_rate=0.05, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=2, n_estimators=100, total=   1.3s
[CV] learning_rate=0.05, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=2, n_estimators=100, total=   1.3s
[CV] learning_rate=0.05, max_depth=2, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=2, n_estimators=100, total=   1.3s
[CV] learning_rate=0.05, max_depth=2, n_estimators=100 ...............
[CV]  

[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=   8.9s
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=   9.5s
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=  10.5s
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=   9.8s
[CV] learning_rate=0.05, max_depth=3, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=3, n_estimators=500, total=   8.9s
[CV] learning_rate=0.05, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=4, n_estimators=100, total=   2.3s
[CV] learning_rate=0.05, max_depth=4, n_estimators=100 ...............
[CV]  learning_rate=0.05, max_depth=4, n_estimators=100, total=   2.3s
[CV] learning_rate=0.05, max_depth=4, n_estimators=100 ...............
[CV]  

[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  19.4s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  18.2s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  18.9s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  18.8s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  16.3s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  16.7s
[CV] learning_rate=0.05, max_depth=5, n_estimators=500 ...............
[CV]  learning_rate=0.05, max_depth=5, n_estimators=500, total=  19.4s
[CV] learning_rate=0.1, max_depth=2, n_estimators=100 ................
[CV] .

[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=   9.6s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=   9.9s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=  10.7s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=  10.0s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=   9.0s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=   9.7s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=3, n_estimators=500, total=  10.4s
[CV] learning_rate=0.1, max_depth=3, n_estimators=500 ................
[CV] .

[CV] . learning_rate=0.1, max_depth=5, n_estimators=250, total=   9.1s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  18.6s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  18.9s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  21.3s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  18.8s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  16.8s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] . learning_rate=0.1, max_depth=5, n_estimators=500, total=  20.1s
[CV] learning_rate=0.1, max_depth=5, n_estimators=500 ................
[CV] .

[Parallel(n_jobs=1)]: Done 360 out of 360 | elapsed: 40.4min finished


Best parameters set found on development set: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 100}
Grid scores on development set: -16126389035.176874


In [34]:
def main():
    """Evaluate the tuned gradient-boosting regressor with 10-fold CV.

    Loads the four precomputed feature matrices and the social features from
    ``./data/``, reduces each precomputed matrix with PCA, concatenates all
    features column-wise and fits a ``GradientBoostingRegressor``
    (learning_rate=0.01, max_depth=4, n_estimators=100) on every fold.
    The per-fold and the average nMSE are printed; nothing is returned.

    This cell defines neither ``load_social_features`` nor ``ensemble``; it
    uses whatever names with those identifiers are already in scope.
    """
    data_dir = './data/'

    # load data
    print("Loading data...")
    hist_feature = np.load(data_dir + 'histogram_feature.npz')['arr_0']
    imgNet_feature = np.load(data_dir + 'imageNet_feature.npz')['arr_0']
    vSenti_feature = np.load(data_dir + 'visual_senti_feature.npz')['arr_0']
    sen2vec_feature = np.load(data_dir + 'text_sentence2vec_feature.npz')['arr_0']
    social_feature = load_social_features(
        data_dir + 'video_id.txt',
        data_dir + 'video_user.txt',
        data_dir + 'user_details.txt',
        data_dir + 'user_des_scores.txt',
        data_dir + 'video_des_scores.txt',
    )

    # Feature dimension reduction (component counts chosen by the author so
    # that nMSE did not increase).
    # NOTE(review): each PCA is fitted on the full data set before the CV
    # split, so the held-out folds influence the projection.
    hist_feature = PCA(n_components=3).fit_transform(hist_feature)
    imgNet_feature = PCA(n_components=5).fit_transform(imgNet_feature)
    vSenti_feature = PCA(n_components=10).fit_transform(vSenti_feature)
    sen2vec_feature = PCA(n_components=3).fit_transform(sen2vec_feature)

    # concatenate all the features (after dimension reduction)
    concat_feature = np.concatenate(
        [hist_feature, imgNet_feature, vSenti_feature, sen2vec_feature, social_feature],
        axis=1,
    )
    print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

    # Load ground truth: only the first '::::' field of every line is used
    # (per the lab notes this is the number of loops). The file is opened in
    # a context manager so the handle is closed as soon as it has been read.
    with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
        ground_truth = [float(line.strip().split('::::')[0]) for line in gt_file]
    ground_truth = np.array(ground_truth, dtype=np.float32)

    print("Start training and predict...")
    kf = KFold(n_splits=10)
    nMSEs = []
    for train, test in kf.split(concat_feature):
        # A fresh model per fold, using the hyper-parameters hard-coded here.
        model = ensemble.GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=100)
        # train
        model.fit(concat_feature[train], ground_truth[train])
        # predict
        predicts = model.predict(concat_feature[test])
        # nMSE (normalized Mean Squared Error): MSE divided by the mean of
        # the squared ground-truth values of the test fold.
        nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
        nMSEs.append(nMSE)

        print("This round of nMSE is: %f" %(nMSE))

    print('Average nMSE is %f.' %(np.mean(nMSEs)))


# Run the gradient-boosting evaluation only when this code is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading data...
The input data dimension is: (10000, 29)
Start training and predict...
This round of nMSE is: 0.735866
This round of nMSE is: 0.864782
This round of nMSE is: 2.592335
This round of nMSE is: 0.635268
This round of nMSE is: 0.995217
This round of nMSE is: 3.512050
This round of nMSE is: 1.023056
This round of nMSE is: 1.213288
This round of nMSE is: 0.710224
This round of nMSE is: 0.104740
Average nMSE is 1.238683.
