To use these functions in another file, add the following:
%run preprocess.ipynb

In [31]:
import sys
sys.path.append('/usr/local/lib/python3.9/site-packages/')

from ucimlrepo import fetch_ucirepo
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [32]:
online_news_popularity = fetch_ucirepo(id=332)
X = online_news_popularity.data.features
y = online_news_popularity.data.targets

In [33]:
X.head()

Unnamed: 0,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,num_videos,average_token_length,...,avg_positive_polarity,min_positive_polarity,max_positive_polarity,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity
0,12.0,219.0,0.663594,1.0,0.815385,4.0,2.0,1.0,0.0,4.680365,...,0.378636,0.1,0.7,-0.35,-0.6,-0.2,0.5,-0.1875,0.0,0.1875
1,9.0,255.0,0.604743,1.0,0.791946,3.0,1.0,1.0,0.0,4.913725,...,0.286915,0.033333,0.7,-0.11875,-0.125,-0.1,0.0,0.0,0.5,0.0
2,9.0,211.0,0.57513,1.0,0.663866,3.0,1.0,1.0,0.0,4.393365,...,0.495833,0.1,1.0,-0.466667,-0.8,-0.133333,0.0,0.0,0.5,0.0
3,9.0,531.0,0.503788,1.0,0.665635,9.0,0.0,1.0,0.0,4.404896,...,0.385965,0.136364,0.8,-0.369697,-0.6,-0.166667,0.0,0.0,0.5,0.0
4,13.0,1072.0,0.415646,1.0,0.54089,19.0,19.0,20.0,0.0,4.682836,...,0.411127,0.033333,1.0,-0.220192,-0.5,-0.05,0.454545,0.136364,0.045455,0.136364


In [34]:
def get_raw_data():
    return X, y

In [35]:
# We're going to need this function when we process an article to input to the model
def get_scaler(df):
    scaler = StandardScaler()
    scaler.fit(df)

    return scaler

In [36]:
def normalize_one(df, verbose=False):
    print(type(df))
    columns = df.columns
    scaler = get_scaler(df)
    if verbose:
        print('before:')
        print(df.mean(axis=0))
    df = scaler.transform(df)
    if verbose:
        print('after:')
        print(df.mean(axis=0))
    print(type(df))

    
    return df

In [37]:
def normalize(X, y, verbose=False):
    X = normalize_one(X, verbose)
    y = normalize_one(y, verbose)

    return X, y


In [38]:
def split(X, y, test_size, val_size):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=val_size/(1-test_size), random_state=1)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [39]:
def get_processed_data(test_size=0.1, val_size=0.1, verbose=False):
    X, y = get_raw_data()
    X, y = normalize(X, y, verbose)
    X_train, y_train, X_val, y_val, X_test, y_test = split(X, y, test_size, val_size)

    return X_train, y_train, X_val, y_val, X_test, y_test

In [40]:
X_train, y_train, X_val, y_val, X_test, y_test = get_processed_data(test_size=0.2, val_size=0.3)
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(f"targets: {y_train}")
print(f"data: {X_train}")

<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
<class 'pandas.core.frame.DataFrame'>
<class 'numpy.ndarray'>
(19821, 58)
(11894, 58)
(7929, 58)
targets: [[-0.14581653]
 [-0.15441734]
 [ 0.11220795]
 ...
 [ 0.09500632]
 [-0.25023043]
 [-0.1888206 ]]
data: [[-1.60772590e+00 -4.74451259e-01  1.78983723e-03 ...  5.47157435e-01
  -1.19274310e+00  2.67809642e-01]
 [ 1.23048186e+00  9.19437671e+00 -7.39034343e-02 ...  2.44637285e-01
  -1.56994907e+00 -8.70560322e-02]
 [ 7.57447233e-01 -3.51335557e-01  3.25676620e-03 ...  1.23781665e+00
  -2.21638357e-01  1.07797467e+00]
 ...
 [ 1.23048186e+00 -7.27050716e-01  2.79063632e-02 ...  1.23781665e+00
  -2.21638357e-01  1.07797467e+00]
 [ 2.84412606e-01  3.28703405e-02 -9.31336282e-03 ... -3.05226384e-01
  -1.13121305e+00 -6.47252789e-01]
 [-1.88622020e-01 -6.59124812e-01  1.32395104e-02 ... -2.69076177e-01
   8.37748634e-01 -6.89658121e-01]]
