### Importing Dependencies

In [48]:
import os, sys, glob

import matplotlib.pyplot as plt 

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor

from sklearn.linear_model import LinearRegression

In [49]:
# Held-out evaluation files. glob returns paths in an arbitrary, filesystem-dependent
# order, so the list is sorted to keep the order of `test_sets` (and therefore any
# positional split of it) the same on every machine and every run.
all_files = sorted(glob.glob(os.path.join('blogfeedback', '*.csv')))
test_sets = [pd.read_csv(filename) for filename in all_files]

# Training data is read from a single CSV in the working directory.
train_data = pd.read_csv('blogData_train.csv')

#### Feature Descriptions

In [50]:
# 1...50: 
#       Average, standard deviation, min, max and median of the 
#       Attributes 51...60 for the source of the current blog post
#       By "source" we mean the blog on which the post appeared.
#       For example, myblog.blog.org would be the source of 
#       the post myblog.blog.org/post_2010_09_10 
# 51:   Total number of comments before basetime
# 52:   Number of comments in the last 24 hours before the 
#       basetime
# 53:   Let T1 denote the datetime 48 hours before basetime,
#       Let T2 denote the datetime 24 hours before basetime.
#       This attribute is the number of comments in the time period 
#       between T1 and T2
# 54:   Number of comments in the first 24 hours after the 
#       publication of the blog post, but before basetime
# 55:   The difference of Attribute 52 and Attribute 53
# 56...60: 
#       The same features as the attributes 51...55, but  
#       features 56...60 refer to the number of links (trackbacks), 
#       while features 51...55 refer to the number of comments.
# 61:   The length of time between the publication of the blog post 
#       and basetime
# 62:   The length of the blog post
# 63...262: 
#       The 200 bag of words features for 200 frequent words of the 
#       text of the blog post
# 263...269: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the basetime
# 270...276: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the date of publication of the blog
#       post
# 277:  Number of parent pages: we consider a blog post P as a
#       parent of blog post B, if B is a reply (trackback) to 
#       blog post P.
# 278...280:  
#       Minimum, maximum, average number of comments that the 
#       parents received
# 281:  The target: the number of comments in the next 24 hours
#       (relative to basetime)

### Feature Normalisation

In [51]:
# Attributes 51-60 (zero-based column positions 50-59) plus the target column.
# The target is referred to by name rather than as "the last column": a later cell
# reassigns train_data with extra trailing columns, so columns[-1:] would stop being
# the target if this cell were re-run afterwards.
features_to_normalise = train_data.columns[50:60].append(pd.Index(['target']))

In [52]:
def find_target_stats(df, target_col='target', n_group_cols=50):
    """Attach the per-group mean and standard deviation of the target to every row.

    Rows are grouped by the values of the first ``n_group_cols`` columns (per the
    feature description these are presumably the per-source statistics, so a group
    would correspond to one blog -- confirm against the data).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target_col : str, default 'target'
        Column whose group mean / standard deviation is computed.
    n_group_cols : int, default 50
        Number of leading columns used as the grouping key.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh RangeIndex, the original columns in their original
        order, followed by ``<target_col>_avg`` and ``<target_col>_stdev``. Every NaN
        in the frame (including the standard deviation of single-row groups) is
        replaced by 0.
    """
    avg_col = target_col + '_avg'
    stdev_col = target_col + '_stdev'

    # Drop statistics left over from a previous call so that running the function
    # on its own output does not create duplicated / suffixed columns.
    df = df.drop(columns=[avg_col, stdev_col], errors='ignore')
    group_cols = list(df.columns[:n_group_cols])

    # One grouped pass yields both statistics, followed by a single left merge.
    group_stats = (
        df.groupby(group_cols)[target_col]
        .agg(['mean', 'std'])
        .rename(columns={'mean': avg_col, 'std': stdev_col})
        .reset_index()
    )
    merged = df.merge(group_stats, how='left', on=group_cols)

    # std() is NaN for groups with a single row; fillna also zeroes any other NaN in the frame.
    return merged.fillna(0)

def normalise_features(features, df):
    """Express each listed feature as a number of standard deviations from its mean.

    For every name ``feat`` in ``features`` a column ``feat + '_in_site_stdevs'`` is
    added, equal to ``(df[feat] - df[feat + '_avg']) / df[feat + '_stdev']``, or 0
    wherever the standard deviation column is 0.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to normalise. For each name, ``df`` must also contain
        the companion columns ``<name>_avg`` and ``<name>_stdev``.
    df : pandas.DataFrame
        Frame holding the features and their companion columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra ``*_in_site_stdevs`` column per feature.

    Raises
    ------
    KeyError
        If a feature or one of its companion columns is missing from ``df``.
    """
    new_columns = {}
    # Iterate the argument, not a notebook-level global, so the function normalises
    # exactly the features the caller asked for.
    for feat in features:
        spread = df[feat + '_stdev']
        offset = df[feat] - df[feat + '_avg']
        # A zero spread would give inf/NaN from the division; report 0 for those rows.
        new_columns[feat + '_in_site_stdevs'] = np.where(spread == 0, 0, offset / spread)

    return df.assign(**new_columns)

In [53]:
# train_data is replaced by its augmented version. The guard keeps this cell safe to
# re-run: without it the already-augmented frame would be fed back into
# find_target_stats a second time.
if 'target_avg' not in train_data.columns:
    train_data = find_target_stats(train_data)

# Adds one '<feature>_in_site_stdevs' column per entry of features_to_normalise.
normalised_df = normalise_features(features_to_normalise, train_data)

### Feature Selection

In [55]:

# The first 280 columns are the candidate features.
all_feature_names = train_data.columns[:280].tolist()
all_features = train_data.loc[:, all_feature_names]

# Labels are kept as a one-column DataFrame rather than a Series.
labels = train_data.loc[:, ['target']]

In [56]:
# Deep tree fitted on every candidate feature; its importances drive feature selection.
# random_state is fixed because the tree permutes features at each split, so an
# unseeded fit can select a different feature set from run to run.
tree_model = DecisionTreeRegressor(max_depth=20, random_state=0).fit(all_features, labels)

In [68]:
def get_features(tree, feature_names=None, imp_threshold=0.01):
    """Return the importances of the features a fitted tree relies on most.

    Parameters
    ----------
    tree : object
        Fitted estimator exposing ``feature_importances_``.
    feature_names : sequence of str, optional
        Names aligned with ``tree.feature_importances_``. When omitted, the names
        are taken from ``tree.feature_names_in_`` if the estimator has that
        attribute, and otherwise from the notebook-level ``all_features.columns``.
    imp_threshold : float, default 0.01
        Minimum importance (inclusive) for a feature to be kept.

    Returns
    -------
    pandas.Series
        Importance values indexed by feature name, in the original feature order.

    Raises
    ------
    ValueError
        If the number of names differs from the number of importances.
    """
    importances = np.asarray(tree.feature_importances_)

    # Prefer the names the tree itself was fitted with, so that a tree trained on a
    # subset of columns is not labelled with the names of the full feature matrix.
    if feature_names is None:
        feature_names = getattr(tree, 'feature_names_in_', None)
    if feature_names is None:
        feature_names = all_features.columns
    feature_names = list(feature_names)

    if len(feature_names) != len(importances):
        raise ValueError(
            f"got {len(feature_names)} feature names for {len(importances)} importances"
        )

    keep = importances >= imp_threshold
    selected_names = [name for name, kept in zip(feature_names, keep) if kept]

    return pd.Series(importances[keep], index=selected_names)

In [69]:

# Compute the importances once and reuse them for both the selection and the display.
feature_importances = get_features(tree_model)
selected_names = feature_importances.index
selected_features = train_data[selected_names]

# Shallower tree refitted on the selected columns only; seeded so the score is reproducible.
tree_model2 = DecisionTreeRegressor(max_depth=10, random_state=0).fit(selected_features, labels)

# Note: this is the score on the training rows themselves, not on held-out data.
print(tree_model2.score(selected_features, labels))

feature_importances.sort_values()




0.8281481689918144


links_1d             0.014003
base_comments        0.015458
comments_diff_avg    0.018061
comments_firstDay    0.021273
comments_diff        0.028433
post_length          0.042299
comments_1d          0.123592
time_since_posted    0.228554
comments_2d_stdev    0.243899
dtype: float64

#### Linear Regression Model

In [70]:
# Ordinary least squares without an intercept term, fitted on the tree-selected columns.
model = LinearRegression(fit_intercept=False)
model.fit(X=selected_features, y=labels)

In [76]:
# The first half of the loaded test files serves as the validation split.
n_validation = len(test_sets) // 2
val_sets = test_sets[:n_validation]

# Each validation frame is scored as loaded: it is not passed through
# find_target_stats or normalise_features before scoring.
scores = [
    model.score(frame[selected_names], frame[['target']])
    for frame in val_sets
]

np.mean(scores)

0.16854412014007922