To use these functions in another notebook, run the following line in a cell:
%run preprocess.ipynb

In [2]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages/')

from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
# Fetch dataset id 332 through ucimlrepo and keep its feature and target
# tables as the module-level X and y that get_raw_data() hands out.
online_news_popularity = fetch_ucirepo(id=332)
X, y = (
    online_news_popularity.data.features,
    online_news_popularity.data.targets,
)

In [4]:
# Preview the first rows of the feature table (shown as the cell output).
X.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


In [5]:
def get_raw_data():
    """Return the module-level ``X`` (features) and ``y`` (targets) as a pair.

    No copy is made: every call hands back the same two objects, so an
    in-place change made by one caller is visible to all later callers.
    """
    raw_features, raw_targets = X, y
    return raw_features, raw_targets

In [6]:
def get_scaler(df):
    """Fit a ``StandardScaler`` on ``df`` and return the fitted scaler.

    Kept as its own helper because the fitted scaler is also needed later,
    when a single article is processed as input to the model.
    """
    # fit() returns the estimator itself, so the fitted scaler can be
    # returned straight from the call.
    return StandardScaler().fit(df)

In [7]:
def normalize_one(df, verbose=False):
    """Standardize ``df`` with a scaler that is fitted on ``df`` itself.

    Args:
        df: Tabular data accepted by ``get_scaler`` (for example a pandas
            DataFrame). Only ``mean(axis=0)`` is called on it directly, and
            only when ``verbose`` is true.
        verbose: When true, print the input type and per-column means before
            scaling, then the per-column means and output type after scaling.
            When false, nothing is printed.

    Returns:
        Whatever ``scaler.transform(df)`` returns. With scikit-learn's default
        output configuration this is a NumPy array, so DataFrame column
        labels and index are not carried over -- confirm if callers need them.
    """
    scaler = get_scaler(df)
    if verbose:
        print(type(df))
        print('before:')
        print(df.mean(axis=0))
    scaled = scaler.transform(df)
    if verbose:
        print('after:')
        print(scaled.mean(axis=0))
        print(type(scaled))

    return scaled

In [8]:
def normalize(X, y, verbose=False):
    """Standardize features and targets independently of each other.

    ``X`` and ``y`` are each passed through ``normalize_one`` (``X`` first),
    so each one is scaled with statistics computed from itself.

    Returns:
        The pair ``(scaled_X, scaled_y)``.
    """
    return normalize_one(X, verbose), normalize_one(y, verbose)


In [9]:
def split(X, y, test_size, val_size, random_state=1):
    """Split ``X``/``y`` into train, validation and test partitions.

    Args:
        X: Features, in any form ``train_test_split`` accepts.
        y: Targets aligned with ``X``.
        test_size: Fraction of the full dataset reserved for testing.
        val_size: Fraction of the full dataset reserved for validation.
        random_state: Seed passed to both ``train_test_split`` calls. The
            default of 1 reproduces the previously hard-coded behaviour.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
        ValueError: If ``test_size + val_size`` is 1 or more, which would
            leave no rows for training.
    """
    # Fail early with a readable message; otherwise the second
    # train_test_split call would reject the rescaled fraction.
    if test_size + val_size >= 1:
        raise ValueError(
            f"test_size ({test_size}) + val_size ({val_size}) must be less "
            "than 1 so that some data is left for training"
        )

    # First carve off the test set.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # val_size is a fraction of the full dataset, so rescale it to a
    # fraction of what is left once the test rows are gone.
    val_fraction = val_size / (1 - test_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=val_fraction, random_state=random_state
    )

    return X_train, y_train, X_val, y_val, X_test, y_test

In [19]:
# Feature columns kept when get_processed_data() is called without an
# explicit ``features`` argument.
_DEFAULT_FEATURES = (
    'n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
    'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs', 'num_imgs',
    'num_videos', 'average_token_length', 'weekday_is_monday',
    'weekday_is_tuesday', 'weekday_is_wednesday', 'weekday_is_thursday',
    'weekday_is_friday', 'weekday_is_saturday', 'weekday_is_sunday',
    'is_weekend', 'global_subjectivity', 'global_sentiment_polarity',
    'global_rate_positive_words', 'global_rate_negative_words',
    'rate_positive_words', 'rate_negative_words', 'avg_positive_polarity',
    'max_positive_polarity', 'avg_negative_polarity',
    'min_negative_polarity', 'title_subjectivity',
    'title_sentiment_polarity', 'abs_title_sentiment_polarity',
    "LDA_00", "LDA_01", "LDA_02", "LDA_03", "LDA_04",
)


def get_processed_data(test_size=0.1, val_size=0.1, verbose=False, features=None):
    """Load the raw data, select features, standardize, and split.

    Args:
        test_size: Fraction of the full dataset used for the test split.
        val_size: Fraction of the full dataset used for the validation split.
        verbose: Forwarded to ``normalize`` to control its diagnostic output.
        features: Optional iterable of column names to keep. ``None`` (the
            default) selects the built-in ``_DEFAULT_FEATURES`` list.

    Returns:
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``.
    """
    X, y = get_raw_data()
    # Strip surrounding whitespace from the column labels. rename() returns a
    # new frame, so the shared frame handed out by get_raw_data() is not
    # modified as a side effect of calling this function.
    X = X.rename(columns=str.strip)

    selected = list(_DEFAULT_FEATURES if features is None else features)
    X, y = filter_features(X, y, selected)

    # NOTE(review): normalize() runs before split(), so the scaling statistics
    # appear to be computed over all rows, including those that end up in the
    # validation and test sets. Confirm whether this is intended.
    X, y = normalize(X, y, verbose)
    X_train, y_train, X_val, y_val, X_test, y_test = split(X, y, test_size, val_size)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [20]:
def filter_features(X, y, features):
    """Keep only the ``features`` columns of ``X``; ``y`` passes through unchanged.

    Args:
        X: Object supporting ``X.columns`` and ``X[features]`` (for example a
            pandas DataFrame).
        y: Targets; returned as-is.
        features: List of column labels to select, in the desired order.

    Returns:
        ``(X[features], y)``.

    Both the available columns and the requested features are printed before
    the selection is made, so a failed lookup can be diagnosed from the output.
    """
    print(f"columns: {X.columns}")
    print(f"features: {features}")
    return X[features], y

In [21]:
# Run the whole pipeline once as a smoke test. test_size and val_size are both
# fractions of the full dataset (split() rescales val_size internally).
X_train, y_train, X_val, y_val, X_test, y_test = get_processed_data(test_size=0.2, val_size=0.3)
# Shapes of the three feature partitions.
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
# Dump the training targets and features for a visual check.
print(f"targets: {y_train}")
print(f"data: {X_train}")

colums: Index(['n_tokens_title', 'n_tokens_content', 'n_unique_tokens',
       'n_non_stop_words', 'n_non_stop_unique_tokens', 'num_hrefs',
       'num_self_hrefs', 'num_imgs', 'num_videos', 'average_token_length',
       'num_keywords', 'data_channel_is_lifestyle',
       'data_channel_is_entertainment', 'data_channel_is_bus',
       'data_channel_is_socmed', 'data_channel_is_tech',
       'data_channel_is_world', 'kw_min_min', 'kw_max_min', 'kw_avg_min',
       'kw_min_max', 'kw_max_max', 'kw_avg_max', 'kw_min_avg', 'kw_max_avg',
       'kw_avg_avg', 'self_reference_min_shares', 'self_reference_max_shares',
       'self_reference_avg_sharess', 'weekday_is_monday', 'weekday_is_tuesday',
       'weekday_is_wednesday', 'weekday_is_thursday', 'weekday_is_friday',
       'weekday_is_saturday', 'weekday_is_sunday', 'is_weekend', 'LDA_00',
       'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'global_subjectivity',
       'global_sentiment_polarity', 'global_rate_positive_words',
       'global_ra