### Installing Dependencies

In [279]:
import os, sys

import matplotlib.pyplot as plt 

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [280]:
train_data = pd.read_csv('blogData_train.csv')
data_no_words = train_data.drop(train_data.loc[:, 'word1':'word200'].columns, axis=1)

train_data

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,pubDay_Wed,pubDay_Thu,pubDay_Fri,pubDay_Sat,pubDay_Sun,num_parents,parent_coments_min,parent_comments_max,parent_comments_avg,target
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,1,0,0,0,0,0,0,0.0,1
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,1,0,0,0,0,0,0,0,0.0,0
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,1,0,0,0,0,0,0,0,0.0,0
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,1,0,0,0,0,0,0,0.0,1
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,1,0,0,0,0,0,0,0.0,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0,0,0.0,0
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0,0,0.0,0
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,1,0,0,0,0,0,0,0.0,0
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,1,0,0,0,0,0,0,0.0,0


#### Feature Descriptions

In [281]:
# 1...50: 
#       Average, standard deviation, min, max and median of the 
#       Attributes 51...60 for the source of the current blog post
#       With source we mean the blog on which the post appeared. 
#       For example, myblog.blog.org would be the source of 
#       the post myblog.blog.org/post_2010_09_10 
# 51:   Total number of comments before basetime
# 52:   Number of comments in the last 24 hours before the 
#       basetime
# 53:   Let T1 denote the datetime 48 hours before basetime,
#       Let T2 denote the datetime 24 hours before basetime.
#       This attribute is the number of comments in the time period 
#       between T1 and T2
# 54:   Number of comments in the first 24 hours after the 
#       publication of the blog post, but before basetime
# 55:   The difference of Attribute 52 and Attribute 53
# 56...60: 
#       The same features as the attributes 51...55, but  
#       features 56...60 refer to the number of links (trackbacks), 
#       while features 51...55 refer to the number of comments.
# 61:   The length of time between the publication of the blog post 
#       and basetime
# 62:   The length of the blog post
# 63...262: 
#       The 200 bag of words features for 200 frequent words of the 
#       text of the blog post
# 263...269: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the basetime
# 270...276: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the date of publication of the blog
#       post
# 277:  Number of parent pages: we consider a blog post P as a
#       parent of blog post B, if B is a reply (trackback) to 
#       blog post P.
# 278...280:  
#       Minimum, maximum, average number of comments that the 
#       parents received
# 281:  The target: the number of comments in the next 24 hours
#       (relative to basetime)

### Feature Normalisation

In [282]:
features_to_normalise = train_data.columns[50:60].append(train_data.columns[-1:]); features_to_normalise

Index(['base_comments', 'comments_1d', 'comments_2d', 'comments_firstDay',
       'comments_diff', 'base_links', 'links_1d', 'links_2d', 'links_firstDay',
       'links_diff', 'target'],
      dtype='object')

In [297]:
target_means = train_data.groupby(list(train_data.columns[:50]))[['target']].mean()
target_stdev = train_data.groupby(list(train_data.columns[:50]))[['target']].std()


In [284]:
train_data = pd.merge(train_data, target_means, how = "left", left_on = list(train_data.columns[:50]), right_on= list(train_data.columns[:50])).rename(columns={"target_x": "target", "target_y" : "target_avg"})
train_data = pd.merge(train_data, target_stdev, how = "left", left_on = list(train_data.columns[:50]), right_on= list(train_data.columns[:50])).rename(columns={"target_x": "target", "target_y" : "target_stdev"})
train_data

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,pubDay_Fri,pubDay_Sat,pubDay_Sun,num_parents,parent_coments_min,parent_comments_max,parent_comments_avg,target,target_avg,target_stdev
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,0,0,0,0,0.0,1,5.660115,13.875971
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,0,0,0,0,0.0,0,5.660115,13.875971
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,0,0,0,0,0.0,0,5.660115,13.875971
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,0,0,0,0,0.0,1,5.660115,13.875971
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,0,0,0,0,0,0,0.0,27,5.660115,13.875971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0.0,0,0.000000,0.000000
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0,0,0,0,0,0,0.0,0,0.000000,0.000000
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,0,0,0,0,0,0.0,0,0.000000,0.000000
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0,0,0,0,0,0,0.0,0,0.000000,0.000000


In [285]:
normalised_df = train_data.assign(**{
    feat + '_in_site_stdevs': np.where(
        train_data[feat + '_stdev'] == 0, 0,
        (train_data[feat] - train_data[feat + '_avg']) / train_data[feat + '_stdev']
    )
    for feat in features_to_normalise
})

normalised_df

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,comments_1d_in_site_stdevs,comments_2d_in_site_stdevs,comments_firstDay_in_site_stdevs,comments_diff_in_site_stdevs,base_links_in_site_stdevs,links_1d_in_site_stdevs,links_2d_in_site_stdevs,links_firstDay_in_site_stdevs,links_diff_in_site_stdevs,target_in_site_stdevs
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075,-0.335841
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075,-0.407908
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075,-0.407908
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075,-0.335841
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.447698,-0.369280,-0.671840,-0.053693,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075,1.537902
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,-0.707107,1.414214,0.000000,-1.224745,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,-0.707107,-0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,1.414214,-0.707107,0.000000,1.224745,0.000000
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,-0.707107,1.414214,0.000000,-1.224745,0.000000


### Feature Selection

In [303]:
#normalised features, base weekday, published weekday, normalised parent page
feature_names = ['baseDay_Mon', 'baseDay_Tue', 'baseDay_Wed', 'baseDay_Thu', 'baseDay_Fri', 'baseDay_Sat', 'baseDay_Sun', 'pubDay_Mon', 'pubDay_Tue', 'pubDay_Wed', 'pubDay_Thu', 'pubDay_Fri', 'pubDay_Sat', 'pubDay_Sun', 'base_comments_in_site_stdevs','comments_1d_in_site_stdevs','comments_2d_in_site_stdevs','comments_firstDay_in_site_stdevs','comments_diff_in_site_stdevs','base_links_in_site_stdevs','links_1d_in_site_stdevs','links_2d_in_site_stdevs','links_firstDay_in_site_stdevs','links_diff_in_site_stdevs']
features = normalised_df[feature_names]

#target
labels = normalised_df[['target_in_site_stdevs']]


In [304]:
tree_model = DecisionTreeClassifier(max_depth=100).fit(features, labels)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
feature_importances = tree_model.feature_importances_

imp_threshold = 0.03
selected_names = []

idx = 0
for imp_value in feature_importances:
    if imp_value >= imp_threshold:
        selected_names.append(features.columns[idx])
    idx += 1

print(len(selected_names))
selected_names

8


['base_comments_in_site_stdevs',
 'comments_1d_in_site_stdevs',
 'comments_2d_in_site_stdevs',
 'comments_firstDay_in_site_stdevs',
 'comments_diff_in_site_stdevs',
 'links_1d_in_site_stdevs',
 'links_2d_in_site_stdevs',
 'links_firstDay_in_site_stdevs']

In [None]:
selected_features = normalised_df[selected_names]

#### Linear Regression Model

In [305]:
model = LinearRegression(fit_intercept=False)
model.fit(features, labels)

In [307]:
pd.Series(model.coef_[0], index=feature_names).sort_values()

pubDay_Mon                         -1.094886e+11
pubDay_Thu                         -1.094886e+11
pubDay_Tue                         -1.094886e+11
pubDay_Wed                         -1.094886e+11
pubDay_Sun                         -1.094886e+11
pubDay_Fri                         -1.094886e+11
pubDay_Sat                         -1.094886e+11
comments_1d_in_site_stdevs         -2.080223e-01
base_comments_in_site_stdevs       -1.960512e-01
base_links_in_site_stdevs          -7.268000e-02
links_1d_in_site_stdevs            -6.844330e-02
links_firstDay_in_site_stdevs       5.880833e-02
links_2d_in_site_stdevs             8.823976e-02
comments_firstDay_in_site_stdevs    1.261516e-01
links_diff_in_site_stdevs           1.474857e-01
comments_2d_in_site_stdevs          5.571954e-01
comments_diff_in_site_stdevs        7.465429e-01
baseDay_Sat                         1.094886e+11
baseDay_Fri                         1.094886e+11
baseDay_Sun                         1.094886e+11
baseDay_Thu         

In [308]:
normalised_df['pubDay_Sun'].value_counts()

pubDay_Sun
0    47738
1     4659
Name: count, dtype: int64

In [309]:
model.score(features, labels)

0.1088349132407268

In [310]:
model.intercept_

0.0

### Logistic Regression Model

In [311]:
log_reg_model = LogisticRegression().fit(selected_features, labels)

  y = column_or_1d(y, warn=True)


ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
log_reg_model.score(features, labels)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- baseDay_Fri
- baseDay_Mon
- baseDay_Sat
- baseDay_Sun
- baseDay_Thu
- ...
