In [49]:
import csv
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error
from verstack.stratified_continuous_split import scsplit # pip install verstack
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_approximation import Nystroem
from sklearn import preprocessing

### data pre-process

In [3]:
# Read the pre-transformed training set from disk into a DataFrame.
train_data = pd.read_csv(filepath_or_buffer="data/train_transformed.csv")

In [6]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0,id,user_verified,user_statuses_count,user_followers_count,user_friends_count,ratio_friends_followers,mention_exist,mention_count,url_exist,url_count,...,tf_idf_0,tf_idf_1,tf_idf_2,tf_idf_3,tf_idf_4,tf_idf_5,tf_idf_6,tf_idf_7,tf_idf_8,tf_idf_9
0,0,0,68460,1101,1226,1.112523,0,0,0,0,...,-1.606137e-15,-1.992248e-13,-6.077335e-13,-1.726278e-12,7.402787e-13,2.37598e-14,2.692723e-13,5.657295e-13,-2.124336e-14,9.444088e-14
1,1,0,309,51,202,3.884615,0,0,0,0,...,0.01206801,0.002715082,0.01318767,0.03157979,0.02662453,0.01621955,-0.02204838,0.00931604,0.00328826,0.07966161
2,2,0,3241,1675,2325,1.387232,0,0,0,0,...,0.05095493,-0.001131896,0.03852378,0.116344,0.15423,0.2938599,0.4018045,0.190963,0.3218782,-0.04933187
3,3,0,32327,667,304,0.45509,0,0,0,0,...,0.008873053,0.001403727,0.003399734,0.01401861,0.009761796,0.009215406,0.006693422,0.008483338,-0.002502589,0.01261141
4,4,0,581,42,127,2.953488,0,0,0,0,...,0.02774364,-0.001756079,0.00824523,0.03155745,0.0138588,0.005119571,0.003617241,0.01908667,-0.01212654,0.01883319


In [77]:
# Positional indices (into the DataFrame's column list) of the features to use.
# Negative values count from the end of the column list.
# Alternative selection kept for reference:
#features_number=np.append([-12,-13,-14,-15],np.arange(-10,0,1))
tail_positions = [-10, -9, -8]   # first three tf_idf_* columns
head_positions = [2, 3, 4]       # user_statuses_count, user_followers_count, user_friends_count
features_number = tail_positions + head_positions
print(features_number)

[-10, -9, -8, 2, 3, 4]


In [1]:
def get_features_set(X_train, features_number):
    """Return the columns of ``X_train`` found at the given integer positions.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to select columns from.
    features_number : iterable of int
        Positions into the frame's column list; negative values count from
        the end. The output keeps the order given here.

    Returns
    -------
    pandas.DataFrame
        ``X_train`` restricted to the selected columns.
    """
    column_names = X_train.columns.values.tolist()
    selected = [column_names[position] for position in features_number]
    return X_train[selected]

In [8]:
# scsplit performs a stratified split for a continuous target, so the train and
# test sets keep a similar distribution of retweet counts (70% / 30%).
# The whole frame is passed as X, so X_train_all / X_test_all still contain the
# 'retweet_count' column; the model features are picked from them afterwards.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs - confirm whether a fixed seed is wanted.
X_train_all, X_test_all, y_train, y_test = scsplit(
    train_data,
    train_data['retweet_count'],
    stratify=train_data['retweet_count'],
    train_size=0.7,
    test_size=0.3,
)


In [9]:
# List every column name, to look up the positions used in features_number.
print(list(train_data.columns))

['id', 'user_verified', 'user_statuses_count', 'user_followers_count', 'user_friends_count', 'ratio_friends_followers', 'mention_exist', 'mention_count', 'url_exist', 'url_count', 'hashtag_exist', 'hashtag_count', 'timeseg', 'weekend', 'day_of_week', 'text_length', 'sentiment_pos', 'sentiment_neg', 'sentiment_neu', 'sentiment_comp', 'retweet_count', 'tf_idf_0', 'tf_idf_1', 'tf_idf_2', 'tf_idf_3', 'tf_idf_4', 'tf_idf_5', 'tf_idf_6', 'tf_idf_7', 'tf_idf_8', 'tf_idf_9']


In [82]:
# Apply the same positional feature selection to both splits.
X_train, X_test = (
    get_features_set(split, features_number)
    for split in (X_train_all, X_test_all)
)
X_train.head()

Unnamed: 0,tf_idf_0,tf_idf_1,tf_idf_2,user_statuses_count,user_followers_count,user_friends_count
138941,0.032565,0.010268,0.010578,4969,683,905
36838,0.021194,0.002777,0.008542,13654,1395,1716
416454,0.36671,-0.098684,-0.27772,1137,1242,1142
656956,0.034345,-0.011252,0.036042,3169,153,278
639505,0.292932,-0.054718,-0.119719,17500,2170,3654


## MODEL

###  1) SVR


####  1.1 LinearSVR

In [43]:
# import random
# random.seed( 10 )

In [43]:
def linearsvr_predict(X_train, y_train,X_test):
    """Fit a default ``LinearSVR`` on the training data and predict ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets passed to ``LinearSVR.fit``.
    X_test
        Features to predict on.

    Returns
    -------
    The output of ``LinearSVR.predict`` for ``X_test``.
    """
    model = LinearSVR().fit(X_train, y_train)
    return model.predict(X_test)

In [70]:
# Inspect the first ten rows of the test features before scaling.
X_test.head(n=10)

Unnamed: 0,tf_idf_0,tf_idf_1,tf_idf_2,user_statuses_count
560815,0.007523,0.00098,0.004954,4147
232971,0.023046,0.000395,-0.008143,13127
143188,0.220172,0.483013,0.160172,10052
103456,0.549239,-0.262622,0.287493,331769
262947,0.011707,0.000347,0.007341,822
141169,0.006097,0.001632,0.007479,23034
324891,0.035267,0.006007,0.041262,7387
124429,0.034262,0.002831,0.024745,5356
503872,0.0,-0.0,-0.0,233281
316368,0.242582,-0.110074,0.126596,11965


In [80]:
# Standardise the three user count columns. The scaler is fitted on the training
# split only and the test split reuses those statistics. StandardScaler treats
# each column independently, so one scaler over all three columns gives the same
# values as one scaler per column.
count_columns = ["user_statuses_count", "user_followers_count", "user_friends_count"]

# Work on explicit copies: X_train / X_test were derived from X_train_all /
# X_test_all by column selection, and assigning into such frames can trigger
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = preprocessing.StandardScaler()
X_train[count_columns] = scaler.fit_transform(X_train[count_columns])
X_test[count_columns] = scaler.transform(X_test[count_columns])
X_test.head(10)

Unnamed: 0,tf_idf_0,tf_idf_1,tf_idf_2,user_statuses_count,user_followers_count,user_friends_count
560815,0.007523,0.00098,0.004954,-0.380154,-0.09516,-0.100657
232971,0.023046,0.000395,-0.008143,-0.289175,-0.09525,-0.106499
143188,0.220172,0.483013,0.160172,-0.320329,-0.095137,-0.121594
103456,0.549239,-0.262622,0.287493,2.93908,0.039672,-0.07521
262947,0.011707,0.000347,0.007341,-0.413841,-0.095387,-0.146059
141169,0.006097,0.001632,0.007479,-0.188804,-0.088916,0.020104
324891,0.035267,0.006007,0.041262,-0.347329,-0.095284,-0.146753
124429,0.034262,0.002831,0.024745,-0.367905,-0.095276,-0.133855
503872,0.0,-0.0,-0.0,1.94127,-0.094108,-0.09765
316368,0.242582,-0.110074,0.126596,-0.300948,-0.092834,0.168279


In [None]:
# Mean absolute error of the LinearSVR predictions on the held-out split.
# NOTE(review): the printed label says "rbf" although linearsvr_predict fits a
# LinearSVR - confirm whether the label should be changed.
y_test_pre = linearsvr_predict(X_train, y_train, X_test)
print(
    "Prediction error rbf:",
    mean_absolute_error(y_test, y_test_pre),
)

####   1.2 kernel approximation + LinearSVR

In [11]:
def ka_linearsvr_predict(X_train,y_train,X_test):
    """Predict ``X_test`` with a LinearSVR trained on Nystroem-mapped features.

    The Nystroem feature map is fitted on ``X_train`` only, and the same fitted
    map is then applied to ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    X_test
        Features to predict on.

    Returns
    -------
    The output of ``LinearSVR.predict`` on the mapped ``X_test``.
    """
    # kernel approximation
    feature_map_nystroem = Nystroem()
    X_train_transformed = feature_map_nystroem.fit_transform(X_train)
    # Use transform (not fit_transform) here: refitting on the test data would
    # build a different feature map, so the test features would no longer live
    # in the space the regressor was trained on.
    X_test_transformed = feature_map_nystroem.transform(X_test)

    # svr
    clf = LinearSVR()
    clf.fit(X_train_transformed, y_train)

    return clf.predict(X_test_transformed)

In [59]:
# Mean absolute error of the kernel-approximation + LinearSVR model on the held-out split.
y_test_pre = ka_linearsvr_predict(X_train, y_train, X_test)
print(
    "Prediction error rbf:",
    mean_absolute_error(y_test, y_test_pre),
)

KeyboardInterrupt: 

####  1.3 kernel approximation + SGDRegressor

In [15]:
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [16]:
def kasgd(X_train_data,X_test_data, y_train,y_test):
    """Train Nystroem features + (StandardScaler -> SGDRegressor) and print the test MAE.

    The Nystroem map is fitted on ``X_train_data`` and reused for
    ``X_test_data``. Nothing is returned; the mean absolute error against
    ``y_test`` is printed.
    """
    kernel_map = Nystroem()
    train_features = kernel_map.fit_transform(X_train_data)
    test_features = kernel_map.transform(X_test_data)

    regressor = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
    regressor.fit(train_features, y_train)

    mae = mean_absolute_error(y_true=y_test, y_pred=regressor.predict(test_features))
    print("Prediction error rbf:", mae)

In [18]:
# Evaluate kernel approximation + SGDRegressor on the held-out split.
kasgd(X_train_data=X_train, X_test_data=X_test, y_train=y_train, y_test=y_test)

Prediction error rbf: 331.30064846683445


####  1.4 SGDRegressor

In [19]:
def sgd(X_train_data,X_test_data, y_train,y_test):
    """Train a StandardScaler -> SGDRegressor pipeline and print the test MAE.

    The pipeline is fitted on ``X_train_data`` / ``y_train``. Nothing is
    returned; the mean absolute error of its predictions for ``X_test_data``
    against ``y_test`` is printed.
    """
    regressor = make_pipeline(StandardScaler(), SGDRegressor(max_iter=1000, tol=1e-3))
    regressor.fit(X_train_data, y_train)

    mae = mean_absolute_error(y_true=y_test, y_pred=regressor.predict(X_test_data))
    print("Prediction error rbf:", mae)

In [23]:
# Evaluate the plain SGDRegressor pipeline on the held-out split.
# X_train_data / X_test_data are only parameter names inside sgd(); the notebook-level
# frames are X_train / X_test, so pass those (otherwise a fresh kernel raises NameError).
sgd(X_train, X_test, y_train, y_test)

Prediction error rbf: 274.69940515841137


### prediction

In [87]:
# Refit on the full training set and predict the evaluation set for submission.
X_train=get_features_set(train_data, features_number)
y_train = train_data['retweet_count']
eval_data = pd.read_csv("data/evaluation_transformed.csv")
X_val = get_features_set(eval_data, features_number)

# features_number selects columns by position, so fail loudly if the evaluation
# file's layout yields different columns than the training file.
assert X_val.columns.tolist() == X_train.columns.tolist(), (
    "evaluation features do not match training features: "
    f"{X_val.columns.tolist()} vs {X_train.columns.tolist()}"
)

# NOTE(review): unlike the validation run above, the user count columns are not
# standardised here before fitting - confirm whether that is intended.
y_pred = ka_linearsvr_predict(X_train,y_train,X_val) #################

# Dump the results into a file that follows the required Kaggle template.
# newline='' is what the csv module expects for files it writes; without it,
# extra blank rows can appear on platforms that translate '\n' to '\r\n'.
with open("prediction/svr_predictions.csv", 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["TweetID", "NoRetweets"])
    # Pair each tweet id with its prediction; predictions are truncated to int.
    for tweet_id, prediction in zip(eval_data['id'], y_pred):
        writer.writerow([str(tweet_id), str(int(prediction))])