In [1]:
import pandas as pd

# create a series of datatime with a frequency of 10 hours
s = pd.date_range('2020-01-06', '2020-01-10', freq='10H').to_series()

# create some features based on datatime
features = {
    "dayofweek": s.dt.dayofweek.values,
    "dayofyear": s.dt.dayofyear.values,
    "hour": s.dt.hour.values,
    "is_leap_year": s.dt.is_leap_year.values,
    "quarter": s.dt.quarter.values,
    "weekofyear": s.dt.weekofyear.values
}

  "weekofyear": s.dt.weekofyear.values


In [2]:
print(features)

{'dayofweek': array([0, 0, 0, 1, 1, 2, 2, 2, 3, 3]), 'dayofyear': array([6, 6, 6, 7, 7, 8, 8, 8, 9, 9]), 'hour': array([ 0, 10, 20,  6, 16,  2, 12, 22,  8, 18]), 'is_leap_year': array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True]), 'quarter': array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), 'weekofyear': array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2])}


In [3]:
def generate_features(df):
    # create a bunch of features using the data column
    df.loc[:, 'year'] = df['date'].dt.year
    df.loc[:, 'weekofyear'] = df['date'].dt.weekofyear
    df.loc[:, 'month'] = df['date'].dt.month
    df.loc[:, 'dayofweek'] = df['date'].dt.dayofweek
    df.loc[:, 'weekend'] = (df['date'].dt.weekday >=5).astype(int)

    # create an aggregate dictionary
    aggs = {}
    # for aggregation by month, we calculate the 
    # number of unique month values and also the mean
    aggs['month'] = ['nunique', 'mean']
    aggs['weekofyear'] = ['nunique', 'mean']
    # we aggregate by num1 and calculate sum, max, min
    # and mean values of this column
    aggs['num1'] = ['sum', 'max', 'min', 'mean']
    # for customer_id, we calculate the total count
    aggs['customer_id'] = ['size']
    # again for customer_id, we calculate the total unique
    aggs['customer_id'] = ['nunique']

    # we group by customer_id and calculate the aggregates
    agg_df = df.groupby('customer_id').agg(aggs)
    agg_df = agg_df.reset_index()
    return agg_df

In [4]:
import numpy as np

x = 0

feature_dict = {}

# calculate mean
feature_dict['mean'] = np.mean(x)

# calculate max
feature_dict['max'] = np.max(x)

# calculate min 
feature_dict['min'] = np.min(x)

# calculate standard dviation 
feature_dict['std'] = np.std(x)

# calculate variance 
feature_dict['var'] = np.var(x)

# peak-to-peak 
feature_dict['ptp'] = np.ptp(x)

# percentile features
feature_dict['percentile_10'] = np.percentile(x, 10)
feature_dict['percentile_60'] = np.percentile(x, 60)
feature_dict['percentile_90'] = np.percentile(x, 90)

# quantile features
feature_dict['quantile_5'] = np.quantile(x, 0.05)
feature_dict['quantile_95'] = np.quantile(x, 0.95)
feature_dict['quantile_99'] = np.quantile(x, 0.99)



In [13]:
from tsfresh.feature_extraction import feature_calculators as fc
# tsfresh based features 
feature_dict['abs_energy'] = fc.abs_energy(x)
feature_dict['count_above_mean'] = fc.count_above_mean(x)
feature_dict['count_below_mean'] = fc.count_below_mean(x)
feature_dict['mean_abs_change'] = fc.mean_abs_change(x)
feature_dict['mean_change'] = fc.mean_change(x)

ValueError: diff requires input that is at least one dimensional

In [19]:
# A random dataframe with two numerical features
import numpy as np

# generate a random dataframe with
# 2 columns and 100 rows
df = pd.DataFrame(
    np.random.rand(100,2),
    columns=[f"f_{i}" for i in range(1,3)]
)
df.shape

(100, 2)

In [5]:
# A sample dataframe with polynomial features

from sklearn import preprocessing

# initialize polynomial features class object
# for two-degree polynomial features
pf = preprocessing.PolynomialFeatures(
    degree=2,
    interaction_only=False,
    include_bias=False
)

# fit to the features
pf.fit(df)

# create polynomial features
poly_feats = pf.transform(df)

# create a dataframe with all the features
num_feats = poly_feats.shape[1]
df_transformed = pd.DataFrame(
    poly_feats,
    columns=[f"f_{i}" for i in range(1, num_feats + 1)]
)

NameError: name 'df' is not defined

In [6]:
from sklearn import preprocessing

In [24]:
df = df_transformed.head()

In [25]:
# create bins of the numerical columns
# 10 bins
df["f_bin_10"] = pd.cut(df["f_1"], bins=10, labels=False)
# 100 bins
df["f_bin_100"] = pd.cut(df["f_1"],bins=100, labels=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["f_bin_10"] = pd.cut(df["f_1"], bins=10, labels=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["f_bin_100"] = pd.cut(df["f_1"],bins=100, labels=False)


In [26]:
df["f_3"]


0    0.282115
1    0.473067
2    0.191579
3    0.533392
4    0.863071
Name: f_3, dtype: float64

In [27]:
df.f_3.var()

0.06783568766142445

In [28]:
df.f_3.apply(lambda x: np.log(1+x)).var()

0.029968843877698933

### KNNImputer for handling missing values

In [7]:
import numpy as np
from sklearn import impute

# create a random numpy array with 10 samples
# and 6 features and values ranging from 1 to 15
X = np.random.randint(1, 15, (10,6))

# convert the array to float
X = X.astype(float)

print(X, "\n")

# randomly assign 10 elements to NaN (missing)
X.ravel()[np.random.choice(X.size, 10, replace=False)] = np.nan

print(X,"\n")
# use 2 nearest neighbours to fill na values
knn_imputer = impute.KNNImputer(n_neighbors=2)

X = knn_imputer.fit_transform(X)

print(X)

[[ 6.  4.  5.  9. 13. 14.]
 [ 4.  6.  7.  6.  5.  4.]
 [ 7. 14. 13.  1.  3.  8.]
 [ 1.  3.  4.  4.  5.  1.]
 [ 6.  6.  6.  8. 12.  1.]
 [11.  7. 14.  8. 12. 10.]
 [ 2.  9. 11. 14.  7.  4.]
 [ 7.  9. 13.  2.  4. 10.]
 [ 5.  1.  2.  5. 14.  8.]
 [ 1. 14.  4. 14.  6.  7.]] 

[[ 6.  4.  5.  9. 13. 14.]
 [ 4.  6.  7.  6.  5. nan]
 [ 7. 14. 13. nan  3.  8.]
 [ 1.  3.  4.  4.  5.  1.]
 [nan  6.  6.  8. 12.  1.]
 [11.  7. nan  8. 12. 10.]
 [ 2.  9. 11. nan  7. nan]
 [nan  9. 13.  2.  4. 10.]
 [ 5.  1.  2. nan 14. nan]
 [ 1. nan  4. 14.  6.  7.]] 

[[ 6.   4.   5.   9.  13.  14. ]
 [ 4.   6.   7.   6.   5.   1. ]
 [ 7.  14.  13.   4.   3.   8. ]
 [ 1.   3.   4.   4.   5.   1. ]
 [ 4.5  6.   6.   8.  12.   1. ]
 [11.   7.   5.5  8.  12.  10. ]
 [ 2.   9.  11.   4.   7.   8.5]
 [ 4.5  9.  13.   2.   4.  10. ]
 [ 5.   1.   2.   8.5 14.   7.5]
 [ 1.   7.5  4.  14.   6.   7. ]]


In [36]:
knn_imputer.fit_transform(X)

array([[ 1. ,  6. ,  2. ,  8. , 12. ,  6.5],
       [12. , 11. ,  6. , 13. ,  8.5,  7. ],
       [ 7. ,  7.5, 14. ,  7. ,  6. ,  3. ],
       [13. ,  1. ,  8. ,  2. ,  5. , 14. ],
       [ 3. ,  7.5,  9. ,  4. ,  3. , 10.5],
       [14. ,  5. ,  1. ,  8. ,  6. ,  7. ],
       [ 5. , 10. , 10. ,  8. ,  6. , 12. ],
       [ 5. , 10.5, 11. ,  9. , 11. ,  7.5],
       [ 5. ,  5. , 14. ,  3. ,  1. ,  9. ],
       [12. ,  2. ,  1. ,  8. ,  7. ,  1. ]])