### Installing Dependencies

In [16]:
import os, sys

import matplotlib.pyplot as plt 

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

from sklearn.linear_model import LinearRegression

In [17]:
train_data = pd.read_csv('blogData_train.csv')

#### Feature Descriptions

In [18]:
# 1...50: 
#       Average, standard deviation, min, max and median of the 
#       Attributes 51...60 for the source of the current blog post
#       With source we mean the blog on which the post appeared. 
#       For example, myblog.blog.org would be the source of 
#       the post myblog.blog.org/post_2010_09_10 
# 51:   Total number of comments before basetime
# 52:   Number of comments in the last 24 hours before the 
#       basetime
# 53:   Let T1 denote the datetime 48 hours before basetime,
#       Let T2 denote the datetime 24 hours before basetime.
#       This attribute is the number of comments in the time period 
#       between T1 and T2
# 54:   Number of comments in the first 24 hours after the 
#       publication of the blog post, but before basetime
# 55:   The difference of Attribute 52 and Attribute 53
# 56...60: 
#       The same features as the attributes 51...55, but  
#       features 56...60 refer to the number of links (trackbacks), 
#       while features 51...55 refer to the number of comments.
# 61:   The length of time between the publication of the blog post 
#       and basetime
# 62:   The length of the blog post
# 63...262: 
#       The 200 bag of words features for 200 frequent words of the 
#       text of the blog post
# 263...269: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the basetime
# 270...276: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the date of publication of the blog
#       post
# 277:  Number of parent pages: we consider a blog post P as a
#       parent of blog post B, if B is a reply (trackback) to 
#       blog post P.
# 278...280:  
#       Minimum, maximum, average number of comments that the 
#       parents received
# 281:  The target: the number of comments in the next 24 hours
#       (relative to basetime)

### Feature Normalisation

In [19]:
features_to_normalise = train_data.columns[50:60]
'.append(train_data.columns[-1:])' ; features_to_normalise

Index(['base_comments', 'comments_1d', 'comments_2d', 'comments_firstDay',
       'comments_diff', 'base_links', 'links_1d', 'links_2d', 'links_firstDay',
       'links_diff'],
      dtype='object')

In [59]:
train_data = train_data.groupby(list(train_data.columns[:50])).agg({'target': ['mean', 'std']}).reset_index()

# target_avg = train_data.groupby(list(train_data.columns[:50]))[['target']].mean().reset_index()
# target_stdev = train_data.groupby(list(train_data.columns[:50]))[['target']].std().reset_index()


# train_data[['target_avg']] = target_avg[['target']]
# train_data[['target_stdev']] = target_stdev[['target']]

train_data

Unnamed: 0_level_0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,links_firstDay_min,links_firstDay_max,links_firstDay_med,links_diff_avg,links_diff_stdev,links_diff_min,links_diff_max,links_diff_med,target,target
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,...,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,mean,std
0,0.00000,0.00000,0,0,0.0,0.000000,0.00000,0,0,0.0,...,0,0,0.0,0.000000,0.000000,0,0,0.0,0.008228,0.306244
1,0.00000,0.00000,0,0,0.0,0.000000,0.00000,0,0,0.0,...,0,0,0.0,0.033333,0.179506,0,1,0.0,0.000000,0.000000
2,0.00000,0.00000,0,0,0.0,0.000000,0.00000,0,0,0.0,...,0,1,0.0,0.000000,0.235702,-1,1,0.0,0.000000,0.000000
3,0.00000,0.00000,0,0,0.0,0.000000,0.00000,0,0,0.0,...,0,0,0.0,0.000000,0.254000,-1,1,0.0,0.000000,0.000000
4,0.00000,0.00000,0,0,0.0,0.000000,0.00000,0,0,0.0,...,0,1,0.0,0.000000,0.333333,-1,1,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
549,326.16666,138.23102,119,571,308.5,116.416664,142.69983,0,492,65.5,...,1,8,3.5,0.000000,4.020779,-8,8,0.0,22.833333,33.555472
550,336.62130,348.24158,0,1531,140.0,131.751480,200.89548,0,940,59.0,...,0,19,8.0,0.153846,7.725314,-18,19,0.0,54.875740,63.090373
551,546.62990,355.35034,0,2044,509.0,231.590620,227.52120,0,1370,186.0,...,0,26,3.0,0.074778,4.258532,-19,23,0.0,153.803549,199.071450
552,803.33330,559.43260,191,1815,612.0,326.111100,359.53006,0,1062,191.0,...,0,1,0.0,0.111111,0.566558,-1,1,0.0,172.444444,360.492063


In [22]:
normalised_df = train_data.assign(**{
    feat + '_in_site_stdevs': np.where(
        train_data[feat + '_stdev'] == 0, 0,
        (train_data[feat] - train_data[feat + '_avg']) / train_data[feat + '_stdev']
    )
    for feat in features_to_normalise
})

In [23]:
normalised_df

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,base_comments_in_site_stdevs,comments_1d_in_site_stdevs,comments_2d_in_site_stdevs,comments_firstDay_in_site_stdevs,comments_diff_in_site_stdevs,base_links_in_site_stdevs,links_1d_in_site_stdevs,links_2d_in_site_stdevs,links_firstDay_in_site_stdevs,links_diff_in_site_stdevs
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.711379,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.637093,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.637093,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.711379,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.692807,-0.447698,-0.369280,-0.671840,-0.053693,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0.000000,-0.707107,1.414214,0.000000,-1.224745,0.000000,0.000000,0.000000,0.000000,0.000000
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0.000000,-0.707107,-0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.414214,-0.707107,0.000000,1.224745
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.707107,1.414214,0.000000,-1.224745


#### Building Model

In [45]:
#normalised features, base weekday, published weekday, normalised parent page
feature_names_normalised = ['baseDay_Mon', 'baseDay_Tue', 'baseDay_Wed', 'baseDay_Thu', 'baseDay_Fri', 'baseDay_Sat', 'baseDay_Sun', 'pubDay_Mon', 'pubDay_Tue', 'pubDay_Wed', 'pubDay_Thu', 'pubDay_Fri', 'pubDay_Sat', 'pubDay_Sun', 'base_comments_in_site_stdevs','comments_1d_in_site_stdevs','comments_2d_in_site_stdevs','comments_firstDay_in_site_stdevs','comments_diff_in_site_stdevs','base_links_in_site_stdevs','links_1d_in_site_stdevs','links_2d_in_site_stdevs','links_firstDay_in_site_stdevs','links_diff_in_site_stdevs', 'num_parents', 'base_comments_max']
features_normalised = normalised_df[feature_names_normalised]

feature_names = ['baseDay_Mon', 'baseDay_Tue', 'baseDay_Wed', 'baseDay_Thu', 'baseDay_Fri', 'baseDay_Sat', 'baseDay_Sun', 'pubDay_Mon', 'pubDay_Tue', 'pubDay_Wed', 'pubDay_Thu', 'pubDay_Fri', 'pubDay_Sat', 'pubDay_Sun', 'base_comments', 'comments_1d', 'comments_2d', 'comments_firstDay', 'comments_diff', 'base_links', 'links_1d', 'links_2d', 'links_firstDay', 'links_diff', 'num_parents', 'base_comments_max']
features = train_data[list(train_data.columns[:-1])]

#target
labels = normalised_df[['target']]

features

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,pubDay_Tue,pubDay_Wed,pubDay_Thu,pubDay_Fri,pubDay_Sat,pubDay_Sun,num_parents,parent_coments_min,parent_comments_max,parent_comments_avg
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,1,0,0,0,0,0,0,0.0
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,1,0,0,0,0,0,0,0,0.0
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,1,0,0,0,0,0,0,0,0.0
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,1,0,0,0,0,0,0,0.0
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,1,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0,0,0,0.0
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0,0,0,0.0
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,0,1,0,0,0,0,0,0,0.0
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,0,1,0,0,0,0,0,0,0.0


In [46]:
model_n = LinearRegression(fit_intercept=False)
model_n.fit(features_normalised, labels)

In [47]:
pd.Series(model_n.coef_[0], index=feature_names_normalised).sort_values()

links_diff_in_site_stdevs          -12.528009
links_2d_in_site_stdevs             -8.618510
comments_diff_in_site_stdevs        -8.126446
comments_2d_in_site_stdevs          -5.641393
pubDay_Mon                          -3.278011
baseDay_Sat                         -2.978064
base_links_in_site_stdevs           -2.887830
base_comments_in_site_stdevs        -2.836968
baseDay_Fri                         -2.816295
pubDay_Tue                          -2.308533
baseDay_Sun                         -2.197331
pubDay_Thu                          -2.030992
baseDay_Mon                         -1.992444
pubDay_Wed                          -1.779789
baseDay_Thu                         -1.489701
pubDay_Sun                          -1.088971
pubDay_Sat                          -0.841662
pubDay_Fri                          -0.689644
baseDay_Wed                         -0.669921
comments_firstDay_in_site_stdevs    -0.158861
base_comments_max                    0.030577
num_parents                       

In [48]:
normalised_df['pubDay_Sun'].value_counts()

pubDay_Sun
0    47738
1     4659
Name: count, dtype: int64

In [49]:
model_n.score(features_normalised, labels)

0.15397371900133028

In [50]:
model_n.intercept_

0.0

In [51]:
model = LinearRegression(fit_intercept=False)
model.fit(features, labels)
model.score(features, labels)

0.3647604526437357

In [57]:
pd.Series(model.coef_[0], index=list(train_data.columns[:-1])).sort_values()

links_1d_avg            -2.244295e+07
comments_1d_avg         -6.905322e+05
base_comments_min       -3.280702e+02
links_firstDay_min      -5.315072e+01
word155                 -2.949411e+01
                             ...     
comments_firstDay_min    3.280596e+02
comments_2d_avg          6.905334e+05
comments_diff_avg        6.905347e+05
links_diff_avg           2.244294e+07
links_2d_avg             2.244296e+07
Length: 280, dtype: float64