In [2]:
import pandas as pd

In [3]:
# create a series of datetime with a frequency of 10 hours
s = pd.date_range('2020-01-06', '2020-01-10', freq='10H').to_series()

In [4]:
s.shape

(10,)

In [5]:
# create some features based on datetime
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    "weekofyear": s.dt.weekofyear.values
    }

  


In [6]:
features

{'dayofweek': array([0, 0, 0, 1, 1, 2, 2, 2, 3, 3], dtype=int64),
 'dayofyear': array([6, 6, 6, 7, 7, 8, 8, 8, 9, 9], dtype=int64),
 'hour': array([ 0, 10, 20,  6, 16,  2, 12, 22,  8, 18], dtype=int64),
 'is_leap_year': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]),
 'quarter': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int64),
 'weekofyear': array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)}

### create a bunch of polynomial features

In [3]:
import numpy as np
# generate a random dataframe with
# 2 columns and 100 rows
df = pd.DataFrame(
            np.random.rand(100, 2),
            columns=[f"f_{i}" for i in range(1, 3)]
            )
df.head()

Unnamed: 0,f_1,f_2
0,0.104757,0.940899
1,0.178987,0.767265
2,0.073073,0.597716
3,0.748365,0.192566
4,0.771275,0.488062


In [4]:
from sklearn import preprocessing

# initialize polynomial features class object
# for two-degree polynomial features
pf = preprocessing.PolynomialFeatures(
                                degree=2,
                                interaction_only=False,
                                include_bias=False
                                )

In [5]:
# fit to the features
pf.fit(df)

PolynomialFeatures(degree=2, include_bias=False, interaction_only=False,
                   order='C')

In [6]:
# create polynomial features
poly_feats = pf.transform(df)

In [7]:
# create a dataframe with all the features
num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(
                        poly_feats,
                        columns=[f"f_{i}" for i in range(1, num_feats + 1)]
                        )
df_transformed.head()

Unnamed: 0,f_1,f_2,f_3,f_4,f_5
0,0.104757,0.940899,0.010974,0.098566,0.885291
1,0.178987,0.767265,0.032036,0.137331,0.588695
2,0.073073,0.597716,0.00534,0.043677,0.357265
3,0.748365,0.192566,0.56005,0.14411,0.037082
4,0.771275,0.488062,0.594865,0.37643,0.238205


### converts the numbers to categories. It’s known as binning

In [8]:
# create bins of the numerical columns
# 10 bins
df["f_bin_10"] = pd.cut(df["f_1"], bins=10, labels=False)
# 100 bins
df["f_bin_100"] = pd.cut(df["f_1"], bins=100, labels=False)

df.head()

Unnamed: 0,f_1,f_2,f_bin_10,f_bin_100
0,0.104757,0.940899,1,10
1,0.178987,0.767265,1,18
2,0.073073,0.597716,0,7
3,0.748365,0.192566,7,75
4,0.771275,0.488062,7,77


### KNNImputer

In [9]:
import numpy as np
from sklearn import impute

In [10]:
# create a random numpy array with 10 samples
# and 6 features and values ranging from 1 to 15
X = np.random.randint(1, 15, (10, 6))
X

array([[ 6,  1,  7,  8,  5, 12],
       [ 1,  8,  5,  2, 14,  6],
       [10, 10,  9,  7, 14,  6],
       [11, 11,  6,  5,  7, 12],
       [10, 12,  6, 13,  4, 12],
       [ 5, 10, 13,  2, 10,  7],
       [13, 14,  1,  3,  4,  2],
       [11, 10, 14,  2, 11,  4],
       [10, 10,  8,  6,  6,  2],
       [ 6,  3,  5,  9,  6,  6]])

In [12]:
# convert the array to float
X = X.astype(float)
X

array([[ 6.,  1.,  7.,  8.,  5., 12.],
       [ 1.,  8.,  5.,  2., 14.,  6.],
       [10., 10.,  9.,  7., 14.,  6.],
       [11., 11.,  6.,  5.,  7., 12.],
       [10., 12.,  6., 13.,  4., 12.],
       [ 5., 10., 13.,  2., 10.,  7.],
       [13., 14.,  1.,  3.,  4.,  2.],
       [11., 10., 14.,  2., 11.,  4.],
       [10., 10.,  8.,  6.,  6.,  2.],
       [ 6.,  3.,  5.,  9.,  6.,  6.]])

In [14]:
# randomly assign 10 elements to NaN (missing)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan
X

array([[ 6., nan,  7.,  8.,  5., nan],
       [ 1., nan, nan,  2., nan,  6.],
       [nan, 10.,  9., nan, 14.,  6.],
       [11., 11.,  6., nan,  7., 12.],
       [10., nan,  6., 13.,  4., 12.],
       [nan, 10., nan,  2., 10.,  7.],
       [13., 14.,  1.,  3., nan,  2.],
       [11., 10., nan, nan, 11., nan],
       [10., 10.,  8., nan,  6.,  2.],
       [nan,  3., nan, nan, nan,  6.]])

In [15]:
# use 2 nearest neighbours to fill na values
knn_imputer = impute.KNNImputer(n_neighbors=2)
knn_imputer.fit_transform(X)

array([[ 6. , 10.5,  7. ,  8. ,  5. ,  7. ],
       [ 1. ,  6.5,  8. ,  2. , 12. ,  6. ],
       [ 6. , 10. ,  9. ,  2. , 14. ,  6. ],
       [11. , 11. ,  6. , 10.5,  7. , 12. ],
       [10. , 10.5,  6. , 13. ,  4. , 12. ],
       [ 6. , 10. ,  7.5,  2. , 10. ,  7. ],
       [13. , 14. ,  1. ,  3. , 10.5,  2. ],
       [11. , 10. ,  7.5,  2.5, 11. ,  6.5],
       [10. , 10. ,  8. ,  5. ,  6. ,  2. ],
       [ 5.5,  3. ,  8.5,  2. , 12. ,  6. ]])