We are going to use a sample of the [Mashable Online News Dataset](https://archive.ics.uci.edu/ml/datasets/Online+News+Popularity). This dataset summarizes a heterogeneous set of features about articles published by Mashable in a period of two years. 

The goal is to predict the number of shares in social networks (**target variable is "shares"**).

In [32]:
from IPython.display import Image
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm
import warnings
warnings.simplefilter("ignore")
%matplotlib inline

matplotlib.rcParams['figure.figsize'] = [14, 14]
import pandas as pd
news = pd.read_csv("./data/news.csv")

In [2]:
news.shape

(10000, 61)

In [3]:
news.head()

Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares
0,http://mashable.com/2014/09/08/safest-cabbies-...,121.0,12.0,1015.0,0.422018,1.0,0.545031,10.0,6.0,33.0,...,0.1,0.8,-0.160714,-0.5,-0.071429,0.0,0.0,0.5,0.0,2900
1,http://mashable.com/2013/07/25/3d-printed-rifle/,532.0,9.0,503.0,0.569697,1.0,0.737542,9.0,0.0,1.0,...,0.136364,1.0,-0.1575,-0.25,-0.1,0.0,0.0,0.5,0.0,1300
2,http://mashable.com/2013/10/30/digital-dinosau...,435.0,9.0,232.0,0.646018,1.0,0.748428,12.0,3.0,4.0,...,0.375,0.5,-0.4275,-1.0,-0.1875,0.0,0.0,0.5,0.0,17700
3,http://mashable.com/2014/08/27/homer-simpson-i...,134.0,12.0,171.0,0.722892,1.0,0.867925,9.0,5.0,0.0,...,0.5,0.5,-0.216667,-0.25,-0.166667,0.4,-0.25,0.1,0.25,1500
4,http://mashable.com/2013/01/10/creepy-robotic-...,728.0,11.0,286.0,0.652632,1.0,0.8,5.0,2.0,0.0,...,0.1,0.6,-0.251786,-0.5,-0.1,0.2,-0.1,0.3,0.1,1400


This dataset has a lot of features, we will try to find a way to reduce model complexity and make sure that we are not overfitting.

## Train a Support Vector Machine and a Random Forest Regressor with the target variable "shares" and evaluate their performance in the train and the test set by using the function `cross_validate`. Do any of them overfit?

**hint**: you can use the test score / train score ratio as a  benchmark to check if how model is overfitting.

In [4]:
news = news.drop('url', axis = 1)
target_variable = 'shares'
independent_variables = news.drop('shares', axis = 1).columns

news_X = news[independent_variables]
news_y = news[target_variable]

In [5]:
from sklearn.model_selection import train_test_split

news_X_train, news_X_test, news_y_train, news_y_test = train_test_split(
    news_X, news_y, test_size=0.2)

In [6]:
##SVM 
from sklearn.svm import SVC, SVR

In [7]:
estimator_svm =  SVR()

In [8]:
estimator_svm.fit(news_X_train, news_y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [9]:
estimator_svm.predict(news_X_test)[:10]

array([1400.62786885, 1400.62786885, 1400.62786885, 1400.62786885,
       1400.62786885, 1400.62786885, 1400.62786885, 1400.62786885,
       1400.62786885, 1400.62786885])

In [10]:
##Random Forest
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

In [11]:
estimator_randomforest = RandomForestRegressor(n_estimators=100)

estimator_randomforest.fit(news_X_train, news_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [12]:
estimator_randomforest.predict(news_X_test)[:10]

array([2295.32, 2709.48, 3429.32, 1090.18, 5777.9 , 3064.05, 3676.64,
       1346.63, 4059.43, 5443.89])

In [13]:
## Cross Validation
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import mean_squared_error

In [14]:
def evaluate_model(estimator, X, y):
    return cross_validate(estimator, X, y,
                     scoring="neg_mean_absolute_error", cv=3,
                     return_train_score=True)

In [15]:
evaluate_model(estimator_svm, news_X, news_y)

{'fit_time': array([3.17171097, 3.15403199, 3.16774893]),
 'score_time': array([1.21556234, 1.22620201, 1.20456123]),
 'test_score': array([-2382.82543133, -2510.31345977, -2384.36421056]),
 'train_score': array([-2446.36275195, -2382.61577037, -2445.58302949])}

In [16]:
evaluate_model(estimator_randomforest, news_X, news_y)

{'fit_time': array([24.32005382, 23.43662167, 24.55087781]),
 'score_time': array([0.09212017, 0.08886409, 0.09134507]),
 'test_score': array([-3573.65161968, -3410.36673267, -3324.79274227]),
 'train_score': array([-1274.66237474, -1262.16902355, -1291.87248838])}

## Use Feature Selection to reduce the fit time to train a Support Vector Machine while keeping its performance.


In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

In [21]:
scores_selection_kbest10 = zip(news_X.columns, 
                                     selector_kbest10.scores_,
                                     selector_kbest10.get_support())
evaluation_kbest10 = sorted(
                filter(lambda c: c[2]==True,
                       scores_selection_kbest10),
                key=lambda c: c[1], reverse=True
)

In [22]:
list(evaluation_kbest10)

[('kw_avg_avg', 201.42559765766723, True),
 ('self_reference_avg_sharess', 96.84998477251476, True),
 ('self_reference_min_shares', 89.06927387244505, True),
 ('kw_max_avg', 85.54348730306499, True),
 ('LDA_03', 71.20914228157332, True),
 ('self_reference_max_shares', 64.6013802107081, True),
 ('kw_avg_max', 35.7738608270794, True),
 ('LDA_02', 31.932397837693053, True),
 ('avg_negative_polarity', 31.23316044186978, True),
 ('kw_min_avg', 26.02983654419138, True)]

## Using Nested Cross Validation, find the best estimator that you can, choosing between an SVR an a RandomForestRegressor.

In [26]:
selector_kbest50 = SelectKBest(f_regression, k=50)
news_X_kbest50 = selector_kbest50.fit_transform(news_X, news_y)

In [33]:
RESULTS = {}

def display_results(results):
    results_df  = pd.DataFrame(results).T
    results_cols = results_df.columns
    for col in results_df:
        results_df[col] = results_df[col].apply(np.mean)
        results_df[col+"_idx"] = results_df[col] / results_df[col].min()
    return results_df

In [29]:
RESULTS["rf_kbest_50"] = evaluate_model(RandomForestRegressor(), news_X_kbest50, news_y)
RESULTS["svr_kbest_50"] = evaluate_model(SVR(), news_X_kbest50, news_y)

In [35]:
RESULTS["rf_kbest_10"] = evaluate_model(RandomForestRegressor(),
                                           news_X_kbest10, news_y)
RESULTS["svr_kbest_10"] = evaluate_model(SVR(), 
                                            news_X_kbest10, news_y)

In [36]:
display_results(RESULTS)

Unnamed: 0,fit_time,score_time,test_score,train_score,fit_time_idx,score_time_idx,test_score_idx,train_score_idx
rf_kbest_10,1.276139,0.02108,-3424.273608,-1350.605067,1.0,1.0,1.0,0.556984
svr_kbest_10,2.613067,0.681751,-2425.834733,-2424.852252,2.047636,32.340981,0.708423,1.0


## For the best SVR you find, which points in the dataset are the hardest ones to classify?