# Imports

In [1]:
import pathlib

In [4]:
import numpy as np
import pandas as pd

# Constants

In [2]:
# Root of the working tree, resolved from the current user's home directory
PROJECT_DIR = pathlib.Path('~/work').expanduser()
# Every dataset used by the notebook lives under this folder
DATA_DIR = PROJECT_DIR.joinpath('data')

In [3]:
# Folder holding the Online News Popularity files.
# Source: https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity
ONPD_DATA_DIR = DATA_DIR.joinpath('OnlineNewsPopularity')

# Example 2-17

Example of interaction features in prediction

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

In [22]:
# Load the Online News Popularity dataset.
# The file separates fields with a comma followed by a space. Splitting on
# ',' and skipping the space after each delimiter gives the same clean column
# labels as the multi-character separator ', ', which pandas interprets as a
# regular expression: that forces the slower Python parsing engine and is
# prone to ignoring quoted fields.
df = pd.read_csv(
    filepath_or_buffer=ONPD_DATA_DIR / 'OnlineNewsPopularity.csv',
    sep=',',
    skipinitialspace=True,
)

In [23]:
# Display the column labels of the frame returned by pd.read_csv
df.columns

Index(['url', 'timedelta', 'n_tokens_title', 'n_tokens_content',
       'n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens',
       'num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos',
       'average_token_length', 'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
     

In [24]:
# Columns used as the singleton (un-crossed) model inputs: the content-based
# measurements only, leaving the derived columns out.
features = [
    # token statistics
    'n_tokens_title',
    'n_tokens_content',
    'n_unique_tokens',
    'n_non_stop_words',
    'n_non_stop_unique_tokens',
    # links and media
    'num_hrefs',
    'num_self_hrefs',
    'num_imgs',
    'num_videos',
    # remaining content measurements
    'average_token_length',
    'num_keywords',
    # data-channel indicator columns
    'data_channel_is_lifestyle',
    'data_channel_is_entertainment',
    'data_channel_is_bus',
    'data_channel_is_socmed',
    'data_channel_is_tech',
    'data_channel_is_world',
]

In [26]:
# Design matrix: every row, restricted to the selected feature columns
X = df.loc[:, features]
# Regression target: the 'shares' column
y = df.loc[:, 'shares']

In [27]:
# Expand the singleton features into every term up to degree 2, leaving out
# the constant bias column. Besides the pairwise products this also keeps the
# original columns and their squares: 17 + 17 + 136 = 170 columns.
X2 = PolynomialFeatures(
    degree=2,
    include_bias=False,
).fit_transform(X)
X2.shape

(39644, 170)

In [28]:
# Split the singleton features, the expanded features and the target in one
# call (30% held out for testing, fixed random_state) to get the train/test
# pieces for both feature sets.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X, X2, y,
    test_size=0.3,
    random_state=123,
)

In [29]:
# Size of the held-out target returned by train_test_split (test_size=0.3)
y_test.shape

(11894,)

In [30]:
def evaluate_feature(X_train, X_test, y_train, y_test):
    '''
    Train a LinearRegression on one feature set and score it on held-out data.

    Parameters
    ----------
    X_train, y_train
        Features and target used to fit the model.
    X_test, y_test
        Features and target passed to the fitted model's ``score`` method.

    Returns
    -------
    tuple
        ``(fitted_model, test_score)``; callers in this notebook report the
        score as R-squared.
    '''
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    test_score = regressor.score(X_test, y_test)
    return regressor, test_score

In [31]:
# Baseline: fit and score using only the singleton feature set
m1, r1 = evaluate_feature(
    X_train=X1_train,
    X_test=X1_test,
    y_train=y_train,
    y_test=y_test,
)
print("R-squared score with singleton features: %0.5f" % r1)

R-squared score with singleton features: 0.00924


In [32]:
# Same evaluation on the expanded degree-2 feature set, against the same target split
m2, r2 = evaluate_feature(
    X_train=X2_train,
    X_test=X2_test,
    y_train=y_train,
    y_test=y_test,
)
print("R-squared score with pairwise features: %0.10f" % r2)

R-squared score with pairwise features: 0.0113301610
