In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PowerTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error ,r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the cleaned data set from the processed-data folder (path is relative
# to the notebook's working directory).
path = r'./../data/processed/online_news_popularity_clean.csv'
df_clean = pd.read_csv(filepath_or_buffer=path)

Splitting the data set into training and test sets

In [3]:
# The target is used on the scale stored in the CSV. A previous experiment
# (now disabled) mapped it back with 2**shares before splitting.
# NOTE(review): later cells apply 2**... to predictions, which suggests the
# stored target is log2(shares) — confirm against the cleaning notebook.

# Target vector: `shares`; feature matrix: every remaining column.
y = df_clean['shares']
X = df_clean.drop(columns='shares')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

Se aplican transformaciones y escalamiento

In [4]:
# Feature transformations, applied identically to the train and test splits.
# The column groups are declared once so the two splits cannot drift apart.
LOG1P_COLS = ['n_tokens_content', 'num_hrefs', 'global_subjectivity', 'avg_positive_polarity']
SQRT_COLS = ['kw_avg_max', 'global_rate_positive_words']
YEO_JOHNSON_COLS = ['global_rate_negative_words', 'rate_positive_words', 'rate_negative_words']


def _transform_features(features, power_transformer, fit):
    """Return a transformed copy of ``features``; the input frame is not modified.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature matrix containing the columns listed in LOG1P_COLS, SQRT_COLS
        and YEO_JOHNSON_COLS.
    power_transformer : PowerTransformer
        Transformer used for YEO_JOHNSON_COLS.
    fit : bool
        True to fit ``power_transformer`` on ``features`` before transforming
        (training split); False to reuse the already-fitted parameters
        (test split), so nothing is learned from held-out rows.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``features``.
    """
    # Work on an explicit copy: assigning into the frames returned by
    # train_test_split can trigger pandas' SettingWithCopyWarning.
    out = features.copy()
    out[LOG1P_COLS] = np.log1p(out[LOG1P_COLS])
    out[SQRT_COLS] = np.sqrt(out[SQRT_COLS])
    if fit:
        out[YEO_JOHNSON_COLS] = power_transformer.fit_transform(out[YEO_JOHNSON_COLS])
    else:
        out[YEO_JOHNSON_COLS] = power_transformer.transform(out[YEO_JOHNSON_COLS])
    return out


pt = PowerTransformer(method='yeo-johnson')

# NOTE: X_train / X_test are rebound to their transformed versions, so re-running
# this cell without re-running the split cell would apply the transforms twice.
X_train = _transform_features(X_train, pt, fit=True)
X_test = _transform_features(X_test, pt, fit=False)


In [5]:
# Standardise the feature columns. The scaler is fitted on the training split
# only and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [6]:
# Preview the first five rows of the loaded frame. The transformations in the
# earlier cells are assigned to X_train / X_test, not to df_clean.
df_clean.head(n=5)

Unnamed: 0,n_tokens_content,num_hrefs,average_token_length,kw_avg_max,global_subjectivity,global_sentiment_polarity,global_rate_positive_words,global_rate_negative_words,rate_positive_words,rate_negative_words,...,weekday_is_thursday,weekday_is_friday,weekday_is_saturday,weekday_is_sunday,LDA_00,LDA_01,LDA_02,LDA_03,LDA_04,shares
0,219.0,4.0,4.680365,0.0,0.521617,0.092562,0.045662,0.013699,0.769231,0.230769,...,0.0,0.0,0.0,0.0,0.500331,0.378279,0.040005,0.041263,0.040123,9.211888
1,255.0,3.0,4.913725,0.0,0.341246,0.148948,0.043137,0.015686,0.733333,0.266667,...,0.0,0.0,0.0,0.0,0.799756,0.050047,0.050096,0.050101,0.050001,9.473706
2,211.0,3.0,4.393365,0.0,0.702222,0.323333,0.056872,0.009479,0.857143,0.142857,...,0.0,0.0,0.0,0.0,0.217792,0.033334,0.033351,0.033334,0.682188,10.550747
3,531.0,9.0,4.404896,0.0,0.42985,0.100705,0.041431,0.020716,0.666667,0.333333,...,0.0,0.0,0.0,0.0,0.028573,0.4193,0.494651,0.028905,0.028572,10.228819
4,1072.0,19.0,4.682836,0.0,0.513502,0.281003,0.074627,0.012127,0.860215,0.139785,...,0.0,0.0,0.0,0.0,0.028633,0.028794,0.028575,0.028572,0.885427,8.98014


Model training

In [7]:
# Grid search: check whether a tuned decision tree beats the default hyper-parameters.
parameters = {
    "max_depth": [5, 10, 30],
    "min_samples_leaf": [1, 10],
    "max_leaf_nodes": [2, 8, 16],
}

# Base estimator handed to the search; its seed is fixed at 42.
reg_tree = DecisionTreeRegressor(random_state=42)

# 5-fold cross-validated search over every combination in `parameters`,
# ranked by negated mean squared error.
grid_search = GridSearchCV(
    reg_tree,
    parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=0,
)

grid_search.fit(X_train_scaled, y_train)

In [8]:
# Hyper-parameter combination with the best cross-validated score in the search.
grid_search.best_params_

{'max_depth': 10, 'max_leaf_nodes': 16, 'min_samples_leaf': 10}

In [9]:
# Estimator built with the best parameters; with GridSearchCV's default
# refit=True it has been refit on the full data passed to fit().
grid_search.best_estimator_

In [10]:
# Mean cross-validated score of the best combination. The search uses
# scoring='neg_mean_squared_error', so this is a negated MSE (closer to 0 is better).
grid_search.best_score_

-1.6580221508969806

In [11]:

# Reuse the tree that GridSearchCV already refit on the training data with the
# best hyper-parameters, instead of re-typing those values by hand: hand-copied
# values silently go stale whenever the grid or the data changes, and fitting
# the same configuration again is redundant work.
tree_Reg = grid_search.best_estimator_

# Training-set error, kept as a reference point for the test-set error.
preds = tree_Reg.predict(X_train_scaled)
mse = mean_squared_error(y_train, preds)
print(f"Error Cuadrático Medio: {mse}")

Error Cuadrático Medio: 1.6473395849486057


In [12]:
# Held-out evaluation: score the model currently bound to `tree_Reg` on the test split.
preds = tree_Reg.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=preds)
print(f"Error Cuadrático Medio: {mse}")

Error Cuadrático Medio: 1.6965219715495186


In [13]:
# Display the test-split predictions computed in the preceding cell.
preds

array([11.14686195, 11.18692628, 10.51281926, ..., 10.75574398,
       10.72794748, 11.18692628])

In [14]:
# Side-by-side view of predictions and true targets for the test split.
# Built column-wise with explicit labels so the reader can tell which column is
# which; `.to_numpy()` drops y_test's row labels so values are paired by position.
pd.DataFrame({"predicted": preds, "actual": y_test.to_numpy()})

Unnamed: 0,0,1
0,11.146862,10.103288
1,11.186926,10.451211
2,10.512819,9.659996
3,11.146862,10.344296
4,10.755744,10.965784
...,...,...
7924,11.146862,11.853310
7925,10.755744,11.643856
7926,10.755744,11.287712
7927,10.727947,9.105909


In [15]:
# Same comparison after raising 2 to each value.
# NOTE(review): assumes the target is stored as log2(shares), so 2**x recovers
# raw share counts — confirm against the cleaning step.
# Columns are labelled explicitly and paired by position via `.to_numpy()`.
pd.DataFrame({"predicted_shares": 2**preds, "actual_shares": 2**y_test.to_numpy()})

Unnamed: 0,0,1
0,2267.460881,1100.0
1,2331.311833,1400.0
2,1461.079798,809.0
3,2267.460881,1300.0
4,1729.026163,2000.0
...,...,...
7924,2267.460881,3700.0
7925,1729.026163,3200.0
7926,1729.026163,2500.0
7927,1696.031771,551.0


In [16]:

# Random forest with default hyper-parameters as a comparison model.
# random_state is pinned (42, matching the decision tree cells) so the forest's
# randomness, and therefore the reported errors, are reproducible across runs.
# NOTE(review): the name `tree_Reg` is kept because later cells read it, even
# though it now holds a forest rather than a single tree.
tree_Reg = RandomForestRegressor(random_state=42)
tree_Reg.fit(X_train_scaled, y_train)

# Training-set error, kept as a reference point for the test-set error.
preds = tree_Reg.predict(X_train_scaled)
mse = mean_squared_error(y_train, preds)
print(f"Error Cuadrático Medio: {mse}")

Error Cuadrático Medio: 0.22702768615784408


In [17]:
# Held-out evaluation: score the model currently bound to `tree_Reg` on the test split.
preds = tree_Reg.predict(X_test_scaled)
mse = mean_squared_error(y_true=y_test, y_pred=preds)
print(f"Error Cuadrático Medio: {mse}")

Error Cuadrático Medio: 1.6618155539436172


In [18]:
# Side-by-side view of predictions and true targets for the test split.
# Built column-wise with explicit labels so the reader can tell which column is
# which; `.to_numpy()` drops y_test's row labels so values are paired by position.
pd.DataFrame({"predicted": preds, "actual": y_test.to_numpy()})

Unnamed: 0,0,1
0,11.416786,10.103288
1,11.272103,10.451211
2,10.488684,9.659996
3,11.414671,10.344296
4,11.230351,10.965784
...,...,...
7924,10.983233,11.853310
7925,11.098595,11.643856
7926,10.898671,11.287712
7927,11.102587,9.105909


In [19]:
# Same comparison after raising 2 to each value.
# NOTE(review): assumes the target is stored as log2(shares), so 2**x recovers
# raw share counts — confirm against the cleaning step.
# Columns are labelled explicitly and paired by position via `.to_numpy()`.
pd.DataFrame({"predicted_shares": 2**preds, "actual_shares": 2**y_test.to_numpy()})

Unnamed: 0,0,1
0,2733.978273,1100.0
1,2473.097635,1400.0
2,1436.840212,809.0
3,2729.973735,1300.0
4,2402.550946,2000.0
...,...,...
7924,2024.336201,3700.0
7925,2192.855085,3200.0
7926,1909.091788,2500.0
7927,2198.931657,551.0
