### Importing Dependencies

In [65]:
import os, sys, glob

import matplotlib.pyplot as plt 

import numpy as np 
import pandas as pd
import statsmodels.api as sm
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

In [66]:
# Load every test-set CSV found under 'blogfeedback'. glob returns paths in
# arbitrary, OS-dependent order, so they are sorted: entry i of test_sets (and
# of any per-set result list built from it) then always refers to the same
# file, all_files[i].
all_files = sorted(glob.glob(os.path.join('blogfeedback', '*.csv')))
test_sets = [pd.read_csv(filename) for filename in all_files]

# The training data is read from the working directory, not from 'blogfeedback'.
train_data = pd.read_csv('blogData_train.csv')

#### Feature Descriptions

In [67]:
# 1...50: 
#       Average, standard deviation, min, max and median of the 
#       Attributes 51...60 for the source of the current blog post
#       By "source" we mean the blog on which the post appeared.
#       For example, myblog.blog.org would be the source of 
#       the post myblog.blog.org/post_2010_09_10 
# 51:   Total number of comments before basetime
# 52:   Number of comments in the last 24 hours before the 
#       basetime
# 53:   Let T1 denote the datetime 48 hours before basetime,
#       Let T2 denote the datetime 24 hours before basetime.
#       This attribute is the number of comments in the time period 
#       between T1 and T2
# 54:   Number of comments in the first 24 hours after the 
#       publication of the blog post, but before basetime
# 55:   The difference of Attribute 52 and Attribute 53
# 56...60: 
#       The same features as the attributes 51...55, but  
#       features 56...60 refer to the number of links (trackbacks), 
#       while features 51...55 refer to the number of comments.
# 61:   The length of time between the publication of the blog post 
#       and basetime
# 62:   The length of the blog post
# 63...262: 
#       The 200 bag of words features for 200 frequent words of the 
#       text of the blog post
# 263...269: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the basetime
# 270...276: binary indicator features (0 or 1) for the weekday
#       (Monday...Sunday) of the date of publication of the blog
#       post
# 277:  Number of parent pages: we consider a blog post P as a
#       parent of blog post B, if B is a reply (trackback) to 
#       blog post P.
# 278...280:  
#       Minimum, maximum, average number of comments that the 
#       parents received
# 281:  The target: the number of comments in the next 24 hours
#       (relative to basetime)

### Feature Normalisation

In [68]:
# Columns to express in per-group standard deviations: positional columns
# 50..59 (attributes 51...60 in the 1-based feature list) plus the target.
# The target is selected by name rather than as "the last column" because
# train_data later gains extra trailing columns (target_avg, target_stdev),
# so a positional lookup would pick the wrong column if this cell is re-run.
features_to_normalise = train_data.columns[50:60].append(pd.Index(['target']))

In [69]:
def find_target_stats(df, n_site_cols=50):
    """Attach the per-group mean and standard deviation of ``target``.

    Rows are grouped by the values of the first ``n_site_cols`` columns
    (the source-level statistics, per the feature description), and every
    row receives its group's target mean and sample standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``target`` column located after the first
        ``n_site_cols`` columns.
    n_site_cols : int, default 50
        Number of leading columns that identify a group.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``target_avg`` and ``target_stdev`` columns, and
        with every remaining NaN in the frame replaced by 0 (this includes
        the undefined standard deviation of single-row groups).

    Notes
    -----
    The statistics are written with ``assign``, so calling the function on
    its own output overwrites the two columns instead of appending
    duplicates; the input frame is never modified.
    """
    site_cols = list(df.columns[:n_site_cols])

    # One grouping serves both statistics; transform() broadcasts each
    # group's value back onto its rows, so no join on the key columns is
    # needed.
    target_by_site = df.groupby(site_cols)['target']

    with_stats = df.assign(
        target_avg=target_by_site.transform('mean'),
        target_stdev=target_by_site.transform('std'),
    )

    return with_stats.fillna(0)

def normalise_features(features, df):
    """Express each requested feature as a number of standard deviations.

    For every name ``feat`` in ``features`` the frame must already contain
    the columns ``feat``, ``feat + '_avg'`` and ``feat + '_stdev'``. A new
    column ``feat + '_in_site_stdevs'`` is added, holding
    ``(feat - feat_avg) / feat_stdev``, or 0 where ``feat_stdev`` is 0.

    Parameters
    ----------
    features : iterable of str
        Names of the columns to normalise.
    df : pandas.DataFrame
        Frame holding the raw, ``_avg`` and ``_stdev`` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one ``*_in_site_stdevs`` column per feature;
        ``df`` itself is left unchanged.
    """
    # Iterate over the argument, not a module-level list, so the caller
    # controls exactly which columns are normalised.
    normalised_df = df.assign(**{
        feat + '_in_site_stdevs': np.where(
            df[feat + '_stdev'] == 0, 0,
            (df[feat] - df[feat + '_avg']) / df[feat + '_stdev']
        )
        for feat in features
    })

    return normalised_df

In [70]:
# Add the per-group target statistics only once, so that re-running this cell
# does not feed an already-augmented frame back into find_target_stats.
if 'target_avg' not in train_data.columns:
    train_data = find_target_stats(train_data)

normalised_df = normalise_features(features_to_normalise, train_data)

### Feature Selection

In [71]:

# Training target: the comment count expressed in per-group standard
# deviations, kept as a one-column DataFrame.
labels = normalised_df.loc[:, ['target_in_site_stdevs']]

# Candidate predictors: the first 280 columns of the frame.
all_feature_names = normalised_df.columns[:280].tolist()
all_features = normalised_df[all_feature_names]

In [72]:
# The tree is fitted only to obtain feature importances for selection.
# random_state is pinned because the estimator can break ties between equally
# good splits at random, which would otherwise let the selected feature set
# change from one run to the next.
tree_model = DecisionTreeRegressor(max_depth=100, random_state=0).fit(all_features, labels)

In [73]:
feature_importances = tree_model.feature_importances_

imp_threshold = 0.0001

# Keep the name of every column whose importance reaches the threshold.
# Importances are matched to all_features.columns by position.
selected_names = [
    column
    for column, importance in zip(all_features.columns, feature_importances)
    if importance >= imp_threshold
]

In [74]:
# Restrict the training frame to the columns that passed the importance cut.
selected_features = normalised_df.loc[:, selected_names]


#### Linear Regression Model

In [75]:
# Linear regression without an intercept term, fitted on the selected
# features; fit() returns the estimator itself, so it can be bound directly.
model = LinearRegression(fit_intercept=False).fit(selected_features, labels)
model

In [76]:
# Score the fitted model on each test set, one entry per element of test_sets.
# Each set goes through the same statistics + normalisation steps as the
# training frame. Test-specific names are used so the training `labels`
# (needed to re-fit the model) are not overwritten by this loop.
scores = []

for test_set in test_sets:
    test_with_stats = find_target_stats(test_set)
    test_normalised = normalise_features(features_to_normalise, test_with_stats)

    test_features = test_normalised[selected_names]
    test_labels = test_normalised[['target_in_site_stdevs']]

    # For a scikit-learn regressor, score() is the R^2 of the prediction.
    scores.append(model.score(test_features, test_labels))

scores

[0.14489533801176524,
 -0.21981286005189937,
 0.06096158790265349,
 -0.13869174003220097,
 -0.012397359323764734,
 -0.02694332469849936,
 -0.022003685350096447,
 0.026643228796562002,
 0.06864840667470562,
 0.0548814227469282,
 -0.22952689859765796,
 0.12672787124937113,
 -0.04261238183584792,
 0.11727289275139008,
 0.170224335905377,
 -0.11494777839264891,
 -0.029585457489375244,
 -0.02987092106621181,
 0.14678308175102717,
 0.02545689012025365,
 -0.28577523985720754,
 -0.007672961032793024,
 0.14589010030103733,
 0.045223501749205086,
 0.009443466450714366,
 0.04901977732749607,
 0.0874022061144979,
 0.20816499420943568,
 -0.10180705342716379,
 -0.1900787358445275,
 0.07726232786996101,
 0.05156246316645119,
 0.06250462829181402,
 0.08008374610855573,
 0.24703026699099406,
 0.07772116561526854,
 0.15152977161315573,
 0.16213042536856204,
 -0.19339250056029478,
 -0.011289243444313257,
 -0.1853119475158247,
 -0.4317446992555849,
 -0.005488947690429624,
 -0.12984625265181338,
 0.0096963