### Installing Dependencies

In [55]:
import os, sys

import matplotlib.pyplot as plt 

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [56]:
train_data = pd.read_csv('blogData_train.csv')
data_no_words = train_data.drop(train_data.loc[:, 'word1':'word200'].columns, axis=1)

#### Feature Descriptions

In [57]:
# 1...50: 
#       Average, standard deviation, min, max and median of the 
#       Attributes 51...60 for the source of the current blog post
#       With source we mean the blog on which the post appeared. 
#       For example, myblog.blog.org would be the source of 
#       the post myblog.blog.org/post_2010_09_10 
# 51:   Total number of comments before basetime
# 52:   Number of comments in the last 24 hours before the 
#       basetime
# 53:   Let T1 denote the datetime 48 hours before basetime,
#       Let T2 denote the datetime 24 hours before basetime.
#       This attribute is the number of comments in the time period 
#       between T1 and T2
# 54:   Number of comments in the first 24 hours after the 
#       publication of the blog post, but before basetime
# 55:   The difference of Attribute 52 and Attribute 53
# 56...60: 
#       The same features as the attributes 51...55, but  
#       features 56...60 refer to the number of links (trackbacks), 
#       while features 51...55 refer to the number of comments.
# 61:   The length of time between the publication of the blog post 
#       and basetime
# 62:   The length of the blog post
# 63...262: 
#       The 200 bag of words features for 200 frequent words of the 
#       text of the blog post
# 263...269: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the basetime
# 270...276: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the date of publication of the blog
#       post
# 277:  Number of parent pages: we consider a blog post P as a
#       parent of blog post B, if B is a reply (trackback) to 
#       blog post P.
# 278...280:  
#       Minimum, maximum, average number of comments that the 
#       parents received
# 281:  The target: the number of comments in the next 24 hours
#       (relative to basetime)

### Feature Normalisation

In [58]:
features_to_normalise = train_data.columns[50:60]; features_to_normalise

Index(['base_comments', 'comments_1d', 'comments_2d', 'comments_firstDay',
       'comments_diff', 'base_links', 'links_1d', 'links_2d', 'links_firstDay',
       'links_diff'],
      dtype='object')

In [82]:
normalised_df = train_data.assign(**{
    feat + '_in_site_stdevs': np.where(
        train_data[feat + '_stdev'] == 0, 0,
        (train_data[feat] - train_data[feat + '_avg']) / train_data[feat + '_stdev']
    )
    for feat in features_to_normalise
})

In [83]:
normalised_df

Unnamed: 0,base_comments_avg,base_comments_stdev,base_comments_min,base_comments_max,base_comments_med,comments_1d_avg,comments_1d_stdev,comments_1d_min,comments_1d_max,comments_1d_med,...,base_comments_in_site_stdevs,comments_1d_in_site_stdevs,comments_2d_in_site_stdevs,comments_firstDay_in_site_stdevs,comments_diff_in_site_stdevs,base_links_in_site_stdevs,links_1d_in_site_stdevs,links_2d_in_site_stdevs,links_firstDay_in_site_stdevs,links_diff_in_site_stdevs
0,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.711379,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
1,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.637093,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
2,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.637093,-0.416873,-0.307959,-0.609953,-0.075345,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
3,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.711379,-0.416873,-0.430601,-0.671840,0.011260,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
4,40.30467,53.845657,0,401,15.0,15.52416,32.441880,0,377,3.0,...,-0.692807,-0.447698,-0.369280,-0.671840,-0.053693,-0.599397,-0.371480,-0.351476,-0.570769,-0.015075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52392,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0.000000,-0.707107,1.414214,0.000000,-1.224745,0.000000,0.000000,0.000000,0.000000,0.000000
52393,33.00000,0.000000,33,33,33.0,11.00000,15.556349,0,33,0.0,...,0.000000,-0.707107,-0.707107,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
52394,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.414214,-0.707107,0.000000,1.224745
52395,0.00000,0.000000,0,0,0.0,0.00000,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.707107,1.414214,0.000000,-1.224745


### Feature Selection

In [132]:
#normalised features, base weekday, published weekday, normalised parent page
feature_names = ['baseDay_Mon', 'baseDay_Tue', 'baseDay_Wed', 'baseDay_Thu', 'baseDay_Fri', 'baseDay_Sat', 'baseDay_Sun', 'pubDay_Mon', 'pubDay_Tue', 'pubDay_Wed', 'pubDay_Thu', 'pubDay_Fri', 'pubDay_Sat', 'pubDay_Sun', 'base_comments_in_site_stdevs','comments_1d_in_site_stdevs','comments_2d_in_site_stdevs','comments_firstDay_in_site_stdevs','comments_diff_in_site_stdevs','base_links_in_site_stdevs','links_1d_in_site_stdevs','links_2d_in_site_stdevs','links_firstDay_in_site_stdevs','links_diff_in_site_stdevs']
features = normalised_df[feature_names]

#target
labels = normalised_df[['target']]


In [133]:
tree_model = DecisionTreeClassifier(max_depth=100).fit(features, labels)

In [143]:
feature_importances = tree_model.feature_importances_

imp_threshold = 0.03
selected_features = []

idx = 0
for imp_value in feature_importances:
    if imp_value >= imp_threshold:
        selected_features.append(features.columns[idx])
    idx += 1

print(len(selected_features))
selected_features

9


['base_comments_in_site_stdevs',
 'comments_1d_in_site_stdevs',
 'comments_2d_in_site_stdevs',
 'comments_firstDay_in_site_stdevs',
 'comments_diff_in_site_stdevs',
 'base_links_in_site_stdevs',
 'links_1d_in_site_stdevs',
 'links_2d_in_site_stdevs',
 'links_firstDay_in_site_stdevs']

In [146]:
selected_features = normalised_df[selected_features]

#### Linear Regression Model

In [75]:
model = LinearRegression(fit_intercept=False)
model.fit(features, labels)

In [76]:
pd.Series(model.coef_[0], index=feature_names).sort_values()

pubDay_Sun                         -2.075010e+12
pubDay_Mon                         -2.075010e+12
pubDay_Sat                         -2.075010e+12
pubDay_Tue                         -2.075010e+12
pubDay_Wed                         -2.075010e+12
pubDay_Thu                         -2.075010e+12
pubDay_Fri                         -2.075010e+12
links_diff_in_site_stdevs          -1.251196e+01
links_2d_in_site_stdevs            -8.615964e+00
comments_diff_in_site_stdevs       -8.196388e+00
comments_2d_in_site_stdevs         -5.671956e+00
base_links_in_site_stdevs          -2.840683e+00
base_comments_in_site_stdevs       -2.821695e+00
comments_firstDay_in_site_stdevs   -1.903839e-01
links_firstDay_in_site_stdevs       1.135773e+00
links_1d_in_site_stdevs             1.160925e+01
comments_1d_in_site_stdevs          1.240195e+01
baseDay_Sat                         2.075010e+12
baseDay_Fri                         2.075010e+12
baseDay_Sun                         2.075010e+12
baseDay_Mon         

In [77]:
normalised_df['pubDay_Sun'].value_counts()

pubDay_Sun
0    47738
1     4659
Name: count, dtype: int64

In [78]:
model.score(features, labels)

0.026427011272592282

In [79]:
model.intercept_

0.0

### Logistic Regression Model

In [147]:
log_reg_model = LogisticRegression().fit(selected_features, labels)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
log_reg_model.score(features, labels)

0.63816630723133