# CS4242 Lab 3: Viral Item Prediction in Social Networks

In [1]:
import os

import numpy as np
from sklearn import ensemble
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.svm import SVR

## Single feature regressions
### 1. Image color histogram feature

In [2]:

data_dir = './data/'

# load data
print("Loading data...")
# Open the .npz archive as a context manager so its file handle is released
# as soon as the array has been read into memory.
with np.load(data_dir + 'histogram_feature.npz') as npz_file:
    hist_feature = npz_file['arr_0']

# feature dimension reduction: the number of components is a free choice; the
# main purpose is to reduce the computation cost of the SVR below.
# NOTE(review): PCA and scale() are fitted on all rows before the CV split, so
# the held-out folds influence the preprocessing.
pca = PCA(n_components=3)
hist_feature = pca.fit_transform(hist_feature)

# concatenate all the features (after dimension reduction)
concat_feature = np.concatenate([hist_feature], axis=1)
concat_feature = scale(concat_feature)
print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

# load ground-truth: only the first '::::'-separated field of every line is
# used as the regression target (the assignment notes describe it as the
# number of loops/views; further popularity indexes could be averaged in).
ground_truth = []
with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
    for line in gt_file:
        ground_truth.append(float(line.strip().split('::::')[0]))
ground_truth = np.array(ground_truth, dtype=np.float32)


print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Held-out predictions of every fold, stacked in fold order. KFold is not
# shuffled here, so the rows end up in the original sample order.
pred_hist = np.empty(shape=[0,1])
for train, test in kf.split(concat_feature):
    # model initialize: the SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead
    model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
    # train
    model.fit(concat_feature[train], ground_truth[train])
    # predict
    predicts = model.predict(concat_feature[test])
    # reshape(-1, 1) instead of a hard-coded (1000, 1): the fold size depends
    # on the number of samples and on n_splits.
    pred_hist = np.concatenate((pred_hist, predicts.reshape(-1, 1)), axis=0)
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Loading data...
The input data dimension is: (10000, 3)
Start training and predict...




This round of nMSE is: 0.997819
This round of nMSE is: 0.997361
This round of nMSE is: 0.995921
This round of nMSE is: 0.999888
This round of nMSE is: 0.999960
This round of nMSE is: 0.998815
This round of nMSE is: 0.997827
This round of nMSE is: 0.999911
This round of nMSE is: 0.997693
This round of nMSE is: 0.999816
Average nMSE is 0.998501.


### 2. imgNet_feature

In [3]:

data_dir = './data/'

# load data
print("Loading data...")
# Open the .npz archive as a context manager so its file handle is released
# as soon as the array has been read into memory.
with np.load(data_dir + 'imageNet_feature.npz') as npz_file:
    imgNet_feature = npz_file['arr_0']

# feature dimension reduction: the component count was chosen by the author to
# reduce dimensions without increasing nMSE.
# NOTE(review): PCA and scale() are fitted on all rows before the CV split, so
# the held-out folds influence the preprocessing.
pca = PCA(n_components=5)
imgNet_feature = pca.fit_transform(imgNet_feature)

# concatenate all the features (after dimension reduction)
concat_feature = np.concatenate([imgNet_feature], axis=1)
concat_feature = scale(concat_feature)
print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

# load ground-truth: only the first '::::'-separated field of every line is
# used as the regression target (the assignment notes describe it as the
# number of loops/views; further popularity indexes could be averaged in).
ground_truth = []
with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
    for line in gt_file:
        ground_truth.append(float(line.strip().split('::::')[0]))
ground_truth = np.array(ground_truth, dtype=np.float32)


print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Held-out predictions of every fold, stacked in fold order. KFold is not
# shuffled here, so the rows end up in the original sample order.
pred_imageNet = np.empty(shape=[0,1])
for train, test in kf.split(concat_feature):
    # model initialize: the SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead
    model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
    # train
    model.fit(concat_feature[train], ground_truth[train])
    # predict
    predicts = model.predict(concat_feature[test])
    # reshape(-1, 1) instead of a hard-coded (1000, 1): the fold size depends
    # on the number of samples and on n_splits.
    pred_imageNet = np.concatenate((pred_imageNet, predicts.reshape(-1, 1)))
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Loading data...




The input data dimension is: (10000, 5)
Start training and predict...
This round of nMSE is: 0.997906
This round of nMSE is: 0.997687
This round of nMSE is: 0.995760
This round of nMSE is: 0.999906
This round of nMSE is: 0.999957
This round of nMSE is: 0.998760
This round of nMSE is: 0.997861
This round of nMSE is: 0.999908
This round of nMSE is: 0.997613
This round of nMSE is: 0.999818
Average nMSE is 0.998518.


### 3. vSenti_feature

In [4]:

data_dir = './data/'

# load data
print("Loading data...")
# Open the .npz archive as a context manager so its file handle is released
# as soon as the array has been read into memory.
with np.load(data_dir + 'visual_senti_feature.npz') as npz_file:
    vSenti_feature = npz_file['arr_0']

# feature dimension reduction: the component count was chosen by the author to
# reduce dimensions without increasing nMSE.
# NOTE(review): PCA and scale() are fitted on all rows before the CV split, so
# the held-out folds influence the preprocessing.
pca = PCA(n_components=10)
vSenti_feature = pca.fit_transform(vSenti_feature)

# concatenate all the features (after dimension reduction)
concat_feature = np.concatenate([vSenti_feature], axis=1)
concat_feature = scale(concat_feature)
print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

# load ground-truth: only the first '::::'-separated field of every line is
# used as the regression target (the assignment notes describe it as the
# number of loops/views; further popularity indexes could be averaged in).
ground_truth = []
with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
    for line in gt_file:
        ground_truth.append(float(line.strip().split('::::')[0]))
ground_truth = np.array(ground_truth, dtype=np.float32)


print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Held-out predictions of every fold, stacked in fold order. KFold is not
# shuffled here, so the rows end up in the original sample order.
pred_vSenti = np.empty(shape=[0,1])
for train, test in kf.split(concat_feature):
    # model initialize: the SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead
    model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
    # train
    model.fit(concat_feature[train], ground_truth[train])
    # predict
    predicts = model.predict(concat_feature[test])
    # reshape(-1, 1) instead of a hard-coded (1000, 1): the fold size depends
    # on the number of samples and on n_splits.
    pred_vSenti = np.concatenate((pred_vSenti, predicts.reshape(-1, 1)))
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Loading data...




The input data dimension is: (10000, 10)
Start training and predict...
This round of nMSE is: 0.997626
This round of nMSE is: 0.996819
This round of nMSE is: 0.994465
This round of nMSE is: 0.999845
This round of nMSE is: 0.999956
This round of nMSE is: 0.998602
This round of nMSE is: 0.997636
This round of nMSE is: 0.999909
This round of nMSE is: 0.997714
This round of nMSE is: 0.999687
Average nMSE is 0.998226.


### 4. text_sentence2vec_feature

In [5]:

data_dir = './data/'

# load data
print("Loading data...")
# Open the .npz archive as a context manager so its file handle is released
# as soon as the array has been read into memory.
with np.load(data_dir + 'text_sentence2vec_feature.npz') as npz_file:
    sen2vec_feature = npz_file['arr_0']

# feature dimension reduction: the component count was chosen by the author to
# reduce dimensions without increasing nMSE.
# NOTE(review): PCA and scale() are fitted on all rows before the CV split, so
# the held-out folds influence the preprocessing.
pca = PCA(n_components=3)
sen2vec_feature = pca.fit_transform(sen2vec_feature)

# concatenate all the features (after dimension reduction)
concat_feature = np.concatenate([sen2vec_feature], axis=1)
concat_feature = scale(concat_feature)
print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

# load ground-truth: only the first '::::'-separated field of every line is
# used as the regression target (the assignment notes describe it as the
# number of loops/views; further popularity indexes could be averaged in).
ground_truth = []
with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
    for line in gt_file:
        ground_truth.append(float(line.strip().split('::::')[0]))
ground_truth = np.array(ground_truth, dtype=np.float32)


print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Held-out predictions of every fold, stacked in fold order. KFold is not
# shuffled here, so the rows end up in the original sample order.
pred_s2v = np.empty(shape=[0,1])
for train, test in kf.split(concat_feature):
    # model initialize: the SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead
    model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
    # train
    model.fit(concat_feature[train], ground_truth[train])
    # predict
    predicts = model.predict(concat_feature[test])
    # reshape(-1, 1) instead of a hard-coded (1000, 1): the fold size depends
    # on the number of samples and on n_splits.
    pred_s2v = np.concatenate((pred_s2v, predicts.reshape(-1, 1)))
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Loading data...
The input data dimension is: (10000, 3)
Start training and predict...




This round of nMSE is: 0.997000
This round of nMSE is: 0.997235
This round of nMSE is: 0.994146
This round of nMSE is: 0.999898
This round of nMSE is: 0.999970
This round of nMSE is: 0.998909
This round of nMSE is: 0.997712
This round of nMSE is: 0.999912
This round of nMSE is: 0.997701
This round of nMSE is: 0.999820
Average nMSE is 0.998230.


### 5. social features (with user description and video description sentiment scores)

In [6]:

def load_social_features(video_id, video_user, user_details, user_des_score, video_des_score):
    """Build one social-feature vector per video, in the order of ``video_id``.

    Args:
        video_id: path to a text file with one video id per line.
        video_user: path to a text file whose lines are ``vid::::uid``.
        user_details: path to a UTF-8 text file whose lines are
            ``uid::::f1::::f2::::...``; fields 1 to 6 are parsed as floats.
        user_des_score: path to a UTF-8 text file with one float per line,
            read in parallel with ``user_details`` (line i goes to user i).
        video_des_score: path to a UTF-8 text file with one float per line,
            also indexed by the row number of ``user_details``.

    Returns:
        A float32 numpy array with one row per video: the six numeric user
        fields followed by the two scores. Videos whose id has no user
        mapping, or whose user has no details, get a row of eight zeros.

    Raises:
        ValueError: if a numeric field or score cannot be parsed as a float.
        IndexError: if a score file has fewer lines than ``user_details``.
    """
    # video id list, in file order
    with open(video_id) as vid_file:
        vid = [line.strip() for line in vid_file]

    # vid -> uid mapping
    vid_uid_dict = {}
    with open(video_user) as map_file:
        for line in map_file:
            data = line.strip().split('::::')
            vid_uid_dict[data[0]] = data[1]

    # uid -> social feature mapping. The original author describes the user
    # columns as following count, like count, post count and a twitter-verified
    # flag, followed by the user- and video-description sentiment scores.
    # NOTE(review): the video-description score is looked up by the *user* row
    # index i, not by video; confirm video_des_score really is aligned with
    # user_details, otherwise this feature is attached to the wrong rows.
    social_features = {}
    with open(user_details, encoding='utf-8') as f1, \
            open(user_des_score, encoding='utf-8') as f2, \
            open(video_des_score, encoding='utf-8') as f3:
        user_rows = [line.strip().split("::::") for line in f1]
        scores1 = [line.strip() for line in f2]
        scores2 = [line.strip() for line in f3]
    for i, row in enumerate(user_rows):
        s1 = float(scores1[i])
        s2 = float(scores2[i])
        features = [float(n) for n in row[1:7]]
        features.append(s1)
        features.append(s2)
        social_features[row[0]] = features

    # social feature vector for each video
    res = []
    for v in vid:
        try:
            res.append(social_features[vid_uid_dict[v]])
        except KeyError:
            # Some videos/users have no social features: assign a zero vector.
            # Only a missing key is expected here, so other errors propagate.
            res.append([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])

    return np.array(res, dtype=np.float32)


data_dir = './data/'

# load data
print("Loading data...")
social_feature = load_social_features(
    data_dir + 'video_id.txt',
    data_dir + 'video_user.txt',
    data_dir + 'user_details.txt',
    data_dir + 'user_des_scores.txt',
    data_dir + 'video_des_scores.txt',
)

# concatenate all the features (no dimension reduction is applied to the
# social features)
# NOTE(review): scale() is fitted on all rows before the CV split, so the
# held-out folds influence the preprocessing.
concat_feature = np.concatenate([social_feature], axis=1)
concat_feature = scale(concat_feature)
print("The input data dimension is: (%d, %d)" %(concat_feature.shape))

# load ground-truth: only the first '::::'-separated field of every line is
# used as the regression target (the assignment notes describe it as the
# number of loops/views; further popularity indexes could be averaged in).
ground_truth = []
with open(os.path.join(data_dir, 'ground_truth.txt')) as gt_file:
    for line in gt_file:
        ground_truth.append(float(line.strip().split('::::')[0]))
ground_truth = np.array(ground_truth, dtype=np.float32)


print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Held-out predictions of every fold, stacked in fold order. KFold is not
# shuffled here, so the rows end up in the original sample order.
pred_social = np.empty(shape=[0,1])
for train, test in kf.split(concat_feature):
    # model initialize: the SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead
    model = SVR(C=5000000, gamma=1e-05, epsilon=0.01, kernel='rbf')
    # train
    model.fit(concat_feature[train], ground_truth[train])
    # predict
    predicts = model.predict(concat_feature[test])
    # reshape(-1, 1) instead of a hard-coded (1000, 1): the fold size depends
    # on the number of samples and on n_splits.
    pred_social = np.concatenate((pred_social, predicts.reshape(-1, 1)))
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Loading data...
The input data dimension is: (10000, 8)
Start training and predict...




This round of nMSE is: 0.801686
This round of nMSE is: 0.937534
This round of nMSE is: 0.696242
This round of nMSE is: 0.951435
This round of nMSE is: 0.997843
This round of nMSE is: 0.986784
This round of nMSE is: 0.922323
This round of nMSE is: 0.998810
This round of nMSE is: 0.789873
This round of nMSE is: 0.972943
Average nMSE is 0.905547.


In [7]:
# Place the five per-feature held-out prediction columns side by side: one row
# per sample, one column per single-feature model. These are the inputs of the
# late-fusion models below.
single_feature_preds = [pred_hist, pred_imageNet, pred_vSenti, pred_s2v, pred_social]
pred_final = np.hstack(single_feature_preds)
print(pred_final.shape)

(10000, 5)


## Late fusion
### Linear regression

In [8]:
# Late fusion with ridge regression: sweep the regularisation strength over
# 0.1, 0.2, ..., 9.9 and report the 10-fold average nMSE for each value.
# KFold without shuffling is deterministic, so one splitter serves every alpha.
kf = KFold(n_splits=10)
best_alpha, best_avg_nMSE = None, np.inf
for alpha in range(1,100):
    ridge_alpha = alpha/10
    nMSEs = []
    for train, test in kf.split(pred_final):
        # Ridge(normalize=True) was removed in scikit-learn 1.2. The documented
        # replacement is to standardise in a pipeline and multiply alpha by the
        # number of training samples, which gives the same fitted model.
        model = make_pipeline(
            StandardScaler(with_mean=False),
            linear_model.Ridge(alpha=ridge_alpha * len(train)),
        )
        # train
        model.fit(pred_final[train], ground_truth[train])
        # predict
        predicts = model.predict(pred_final[test])
        # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
        nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
        nMSEs.append(nMSE)

    avg_nMSE = np.mean(nMSEs)
    print('Average nMSE is %f.' %(avg_nMSE))
    # Track the winner so the sweep's result is readable without counting lines.
    if avg_nMSE < best_avg_nMSE:
        best_alpha, best_avg_nMSE = ridge_alpha, avg_nMSE

print('Best alpha is %.1f with average nMSE %f.' %(best_alpha, best_avg_nMSE))

Average nMSE is 1.802463.
Average nMSE is 1.613501.
Average nMSE is 1.471380.
Average nMSE is 1.362437.
Average nMSE is 1.277583.
Average nMSE is 1.210591.
Average nMSE is 1.157088.
Average nMSE is 1.113933.
Average nMSE is 1.078828.
Average nMSE is 1.050061.
Average nMSE is 1.026341.
Average nMSE is 1.006676.
Average nMSE is 0.990301.
Average nMSE is 0.976614.
Average nMSE is 0.965141.
Average nMSE is 0.955500.
Average nMSE is 0.947388.
Average nMSE is 0.940555.
Average nMSE is 0.934800.
Average nMSE is 0.929956.
Average nMSE is 0.925886.
Average nMSE is 0.922476.
Average nMSE is 0.919629.
Average nMSE is 0.917265.
Average nMSE is 0.915317.
Average nMSE is 0.913728.
Average nMSE is 0.912448.
Average nMSE is 0.911436.
Average nMSE is 0.910656.
Average nMSE is 0.910079.
Average nMSE is 0.909676.
Average nMSE is 0.909427.
Average nMSE is 0.909310.
Average nMSE is 0.909309.
Average nMSE is 0.909409.
Average nMSE is 0.909597.
Average nMSE is 0.909861.
Average nMSE is 0.910192.
Average nMSE

### SVR

In [29]:
# Late fusion with an RBF SVR. The stacked predictions are standardised first;
# note that this rebinds pred_final, so later cells see the scaled matrix.
pred_final = scale(pred_final)
kf = KFold(n_splits=10)
nMSEs = []
for train, test in kf.split(pred_final):
    fit_inputs, fit_targets = pred_final[train], ground_truth[train]
    held_out_inputs, held_out_targets = pred_final[test], ground_truth[test]

    # The SVR parameters can be tuned
    # (http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html),
    # or another regression model can be used instead.
    model = SVR(kernel='rbf', C=100000000, gamma=1e-05, epsilon=0.001)
    model.fit(fit_inputs, fit_targets)
    predicts = model.predict(held_out_inputs)

    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    fold_mse = mean_squared_error(held_out_targets, predicts)
    target_power = np.mean(np.square(held_out_targets))
    nMSE = fold_mse / target_power
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))

This round of nMSE is: 0.793356
This round of nMSE is: 0.935642
This round of nMSE is: 0.690246
This round of nMSE is: 0.950290
This round of nMSE is: 0.997754
This round of nMSE is: 0.985787
This round of nMSE is: 0.922504
This round of nMSE is: 0.999042
This round of nMSE is: 0.789533
This round of nMSE is: 0.973909
Average nMSE is 0.903806.


### Random forest

In [15]:

print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Generate the folds from pred_final, the matrix that is indexed below.
# Splitting on concat_feature (left over from the social-feature cell) only
# worked because both arrays happen to have the same number of rows.
for train, test in kf.split(pred_final):
    # model initialize: random forest on the stacked single-feature predictions;
    # random_state is fixed so the run is reproducible
    model = ensemble.RandomForestRegressor(n_estimators=100,max_features=3,random_state=1)
    # train
    model.fit(pred_final[train], ground_truth[train])
    # predict
    predicts = model.predict(pred_final[test])
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Start training and predict...
This round of nMSE is: 0.828973
This round of nMSE is: 1.017093
This round of nMSE is: 13.315153
This round of nMSE is: 0.926004
This round of nMSE is: 0.997865
This round of nMSE is: 2.518203
This round of nMSE is: 5.756129
This round of nMSE is: 1.417853
This round of nMSE is: 0.775454
This round of nMSE is: 0.665593
Average nMSE is 2.821832.


### Gradient boosting

In [16]:

print("Start training and predict...")
kf = KFold(n_splits=10)
nMSEs = []
# Generate the folds from pred_final, the matrix that is indexed below.
# Splitting on concat_feature (left over from the social-feature cell) only
# worked because both arrays happen to have the same number of rows.
for train, test in kf.split(pred_final):
    # model initialize: gradient boosting on the stacked single-feature
    # predictions; random_state is fixed (as in the random forest cell) so
    # repeated runs give the same numbers
    model = ensemble.GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=100, random_state=1)
    # train
    model.fit(pred_final[train], ground_truth[train])
    # predict
    predicts = model.predict(pred_final[test])
    # nMSE (normalized Mean Squared Error): MSE divided by the mean squared target
    nMSE = mean_squared_error(ground_truth[test], predicts) / np.mean(np.square(ground_truth[test]))
    nMSEs.append(nMSE)

    print("This round of nMSE is: %f" %(nMSE))

print('Average nMSE is %f.' %(np.mean(nMSEs)))


Start training and predict...
This round of nMSE is: 0.866383
This round of nMSE is: 0.951472
This round of nMSE is: 2.755952
This round of nMSE is: 0.921553
This round of nMSE is: 0.996998
This round of nMSE is: 1.768380
This round of nMSE is: 0.969684
This round of nMSE is: 1.519276
This round of nMSE is: 0.792986
This round of nMSE is: 0.946074
Average nMSE is 1.248876.
