In [2]:
import csv
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_approximation import Nystroem

### data pre-precess

In [3]:
# Load the training data
train_data = pd.read_csv("data/train_transformed.csv")

In [6]:
train_data.head()

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14
1,1,0,309,51,202,3.884615,0,0,0,0,...,0.01206801,0.002715082,0.01318767,0.03157979,0.02662453,0.01621955,-0.02204838,0.00931604,0.00328826,0.07966161
2,2,0,3241,1675,2325,1.387232,0,0,0,0,...,0.05095493,-0.001131896,0.03852378,0.116344,0.15423,0.2938599,0.4018045,0.190963,0.3218782,-0.04933187
3,3,0,32327,667,304,0.45509,0,0,0,0,...,0.008873053,0.001403727,0.003399734,0.01401861,0.009761796,0.009215406,0.006693422,0.008483338,-0.002502589,0.01261141
4,4,0,581,42,127,2.953488,0,0,0,0,...,0.02774364,-0.001756079,0.00824523,0.03155745,0.0138588,0.005119571,0.003617241,0.01908667,-0.01212654,0.01883319


In [7]:
features_number=np.append([-12,-13,-14,-15],np.arange(-10,0,1))
print(features_number)

[-12 -13 -14 -15 -10  -9  -8  -7  -6  -5  -4  -3  -2  -1]


In [1]:
def get_features_set(X_train, features_number):
    features = X_train.columns.values.tolist()
    cols = []
    for i in features_number:
        cols.append(features[i])
    return X_train[cols]

In [8]:
# scsplit method is used in order to split our regression data in a stratisfied way and keep a similar distribution of retweet counts between the two sets
X_train_all, X_test_all, y_train, y_test = scsplit(train_data, train_data['retweet_count'], stratify=train_data['retweet_count'], train_size=0.7, test_size=0.3)


In [9]:
print(train_data.columns.values.tolist())

['id', 'user_verified', 'user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_exist', 'mention_count', 'url_exist', 'url_count', 'hashtag_exist', 'hashtag_count', 'timeseg', 'weekend', 'day_of_week', 'text_length', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'sentiment_comp', 'retweet_count', 'tf_idf_0', 'tf_idf_1', 'tf_idf_2', 'tf_idf_3', 'tf_idf_4', 'tf_idf_5', 'tf_idf_6', 'tf_idf_7', 'tf_idf_8', 'tf_idf_9']


In [10]:
X_train = get_features_set(X_train_all, features_number)
X_test =  get_features_set(X_test_all, features_number )
X_train.head()

Unnamed: 0,sentiment_comp,sentiment_neu,sentiment_neg,sentiment_pos,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
138941,-0.431,0.701,0.199,0.1,0.032565,0.010268,0.010578,0.034722,0.012313,0.019881,0.033847,0.049984,0.05101,-0.030254
36838,0.9561,0.424,0.0,0.576,0.021194,0.002777,0.008542,0.029635,0.016377,0.018611,-0.000308,0.047276,0.034843,0.04482
416454,0.875,0.744,0.0,0.256,0.36671,-0.098684,-0.27772,0.231829,-0.117851,-0.035987,-0.018876,-0.073731,0.051102,-0.034062
656956,-0.5574,0.455,0.545,0.0,0.034345,-0.011252,0.036042,0.038673,0.058022,0.117585,0.050072,0.570273,0.761369,-0.170455
639505,0.4939,0.556,0.0,0.444,0.292932,-0.054718,-0.119719,-0.132251,0.042396,0.014522,0.001158,-0.001084,0.000531,0.007751


## MODEL

###  1) SVR

####   1.1 kernel approximation + LinearSVR

In [11]:
def ka_linearsvr_predict(X_train,y_train,X_test):
    # kernel approximation 
    feature_map_nystroem = Nystroem()
    X_train_transformed = feature_map_nystroem.fit_transform(X_train)
    X_test_transformed = feature_map_nystroem.fit_transform(X_test)
    
    # svr
    clf = LinearSVR()
    clf.fit(X_train_transformed, y_train)
    
    return clf.predict(X_test_transformed)

In [12]:
y_test_pre =ka_linearsvr_predict(X_train,y_train,X_test)
print("Prediction error rbf:", mean_absolute_error(y_true=y_test, y_pred=y_test_pre))

Prediction error rbf: 148.85877112174524


####  1.2 LinearSVR

In [43]:
# import random
# random.seed( 10 )

In [13]:
def linearsvr_predict(X_train, y_train,X_test):
    clf = LinearSVR()
    clf.fit(X_train, y_train)
    return clf.predict(X_test)

In [14]:
y_test_pre=linearsvr_predict(X_train, y_train, X_test)
print("Prediction error rbf:", mean_absolute_error(y_true=y_test, y_pred=y_test_pre))

Prediction error rbf: 145.98676045500395


####  1.3 kernel approximation + SGDRegressor

In [15]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [16]:
def kasgd(X_train_data,X_test_data, y_train,y_test):
    feature_map_nystroem = Nystroem()
    X_train_transformed = feature_map_nystroem.fit_transform(X_train_data)
    X_test_transformed = feature_map_nystroem.transform(X_test_data)
    reg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
    reg.fit(X_train_transformed, y_train)
    y_pre=reg.predict(X_test_transformed)
    print("Prediction error rbf:", mean_absolute_error(y_true=y_test, y_pred=y_pre))

In [18]:
kasgd(X_train,X_test, y_train,y_test)

Prediction error rbf: 331.30064846683445


####  1.4 SGDRegressor

In [19]:
def sgd(X_train_data,X_test_data, y_train,y_test):
    reg = make_pipeline(StandardScaler(),SGDRegressor(max_iter=1000, tol=1e-3))
    reg.fit(X_train_data, y_train)
    y_pre=reg.predict(X_test_data)
    print("Prediction error rbf:", mean_absolute_error(y_true=y_test, y_pred=y_pre))

In [23]:
sgd(X_train_data,X_test_data, y_train,y_test)

Prediction error rbf: 274.69940515841137


### prediction

In [87]:
X_train=get_features_set(train_data, features_number)
y_train = train_data['retweet_count']
eval_data = pd.read_csv("data/evaluation_transformed.csv")
X_val = get_features_set(eval_data, features_number)



y_pred = ka_linearsvr_predict(X_train,y_train,X_val) #################

# Dump the results into a file that follows the required Kaggle template
with open("prediction/svr_predictions.csv", 'w') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    for index, prediction in enumerate(y_pred):
        writer.writerow([str(eval_data['id'].iloc[index]) , str(int(prediction))])