Numeric features

In [None]:
import pandas as pd
import numpy as np

# Feature scaling: each scaler is fitted on X and applied to it in one call.
from sklearn.preprocessing import StandardScaler, MinMaxScaler

standard_scaler = StandardScaler()
X_standard = standard_scaler.fit_transform(X)  # per-column zero mean, unit variance

minmax_scaler = MinMaxScaler()
X_minmax = minmax_scaler.fit_transform(X)  # per-column rescale to the default [0, 1] range

# Winsorization for outliers: clamp everything outside the 1st-99th percentile band.
# np.percentile returns its results in the order the percentiles were requested,
# so the lower bound comes first. Without an axis argument the percentiles are
# taken over all values of X at once, not per column.
LB, UB = np.percentile(X, [1, 99]) # 1st and 99th percentile
y = np.clip(X, LB, UB)

# rank transformation: each value is replaced by its 1-based rank, so an
# extreme value ends up only one step away from its neighbour.
from scipy.stats import rankdata

sample_values = [-100, 0, 1e5]
rankdata(sample_values) # [1., 2., 3.]
 
# log transform
# np.log1p(X) evaluates log(1 + X) without first forming 1 + X, so values of X
# close to zero keep their precision instead of being rounded away.
np.log1p(X)

# raising to power
np.sqrt(X + 2/3)
np.power(X + 2/3, 1/2) # same as above

# feature generation
# Price per unit of area.
df['unit_price'] = df['price'] / df['area']
# Fractional part of the amount, expressed in cents. Round before casting:
# astype(int) truncates, and binary floating point stores e.g. 2.55 as
# 2.5499999..., which would otherwise come out as 54 instead of 55.
df['decimal'] = (np.modf(df['dollars'])[0] * 100).round().astype(int)

Categorical and ordinal features

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

## label encoding
# Learn the category -> integer mapping from the column, then overwrite the
# column with the integer codes. `lab` keeps the fitted mapping.
lab = LabelEncoder()
lab.fit(df['column'])
df['column'] = lab.transform(df['column'])


## frequency encoding
# Share of rows that each category accounts for. The counts are taken from df,
# so the denominator must be len(df) as well (the cell never defines `train`).
encoding = df.groupby('column').size() / len(df)
# Series.map looks every value up in `encoding` in one vectorised pass; values
# with no entry become NaN instead of raising KeyError.
df['encoding'] = df['column'].map(encoding)


## feature generation
# Build the interaction feature first: get_dummies below replaces col1 and col2
# with indicator columns, so the originals are gone once it has run.
df['col1_col2'] = df['col1'] + df['col2']

## one-hot encoding
# col1_col2 is not listed in `columns`, so it is carried through unchanged.
df = pd.get_dummies(df, columns = ['col1', 'col2'])

Datetime and coordinates

In [None]:
# pip install holidays
import holidays
import datetime
us_holidays = holidays.US()

# Calendar components of df.Date (a datetime64 column).
# ex: 2016-01-02 01:00:00
df['weekday'] = df.Date.dt.weekday
df['year'] = df.Date.dt.year
df['quarter'] = df.Date.dt.quarter
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement and yields the ISO 8601 week number (nullable UInt32 dtype).
df['weekofyear'] = df.Date.dt.isocalendar().week
df['dayofweek'] = df.Date.dt.dayofweek  # same values as 'weekday': Monday=0 ... Sunday=6
# Series.dt.weekday_name was removed in pandas 1.0; day_name() returns the
# same English day names.
df['dayofweek_name'] = df.Date.dt.day_name()
df['month'] = df.Date.dt.month
df['day'] = df.Date.dt.day
df['hour'] = df.Date.dt.hour
df['second'] = df.Date.dt.second
df['minute'] = df.Date.dt.minute

# holidays.US() populates itself lazily, one year at a time, when a date is
# looked up. Series.isin(us_holidays) only iterates the keys loaded so far
# (none on a fresh object), so it would flag nothing; test membership per date.
# Missing dates (NaT) are reported as non-holidays.
df["is_holiday"] = df.Date.dt.date.map(lambda day: pd.notna(day) and day in us_holidays)
# dt.dayofweek counts Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
df['is_weekend'] = np.where(df.Date.dt.dayofweek >= 5, 1, 0)

# time since today: elapsed time between the moment this cell runs and each Date
now = datetime.datetime.today()
df['time_since'] = now - df['Date']

# difference between two dates
date_delta = df['date_1'] - df['date_2']  # timedelta Series, computed once
df['diff_time'] = date_delta
df['diff_days'] = date_delta / np.timedelta64(1, 'D')
df['diff_weeks'] = date_delta / np.timedelta64(1, 'W')
# Months and years have no fixed length, and current pandas rejects the
# ambiguous 'M' / 'Y' timedelta units. Convert from days instead, using the
# mean Gregorian year (365.2425 days) and one twelfth of it for a month;
# these are the same averages numpy applied for those units.
DAYS_PER_YEAR = 365.2425
df['diff_months'] = df['diff_days'] / (DAYS_PER_YEAR / 12)
df['diff_years'] = df['diff_days'] / DAYS_PER_YEAR

Missing values

In [None]:
import numpy as np

# replace with nan: "-" and "?" are placeholder strings standing in for missing data
missing_markers = {"-": np.nan, "?": np.nan}
df.replace(missing_markers, inplace=True)

# replace with -999
# Assign the result back rather than calling fillna(inplace=True) on
# df["column"]: that form acts on a temporary Series, which under pandas
# Copy-on-Write never reaches df (and emits a warning in pandas 2.2).
df["column"] = df["column"].fillna(-999)

# replace with mean/median
# These are alternatives, so pick one; once the mean fill has run there is
# nothing left for the median fill to replace.
df["age"] = df["age"].fillna(df["age"].mean())
df["age"] = df["age"].fillna(df["age"].median())

# other methods
# Each call returns a new frame and leaves df untouched. fillna(method=...) is
# deprecated since pandas 2.1; ffill() / bfill() are the direct replacements.
df.ffill() # forward fill
df.bfill() # backward fill
df.interpolate(method ='linear') # linear interpolation

Text data

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

# bag of words
# CountVectorizer tokenises every document in `corpus` and returns a sparse
# document-term matrix of n-gram counts.
vect = CountVectorizer()
word_counts = vect.fit_transform(corpus)

# tfidf
# TfidfTransformer re-weights an existing count matrix into a normalized tf or
# tf-idf representation; smooth_idf=False turns off the default idf smoothing.
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(word_counts)
tfidf

# BOW + TFIDF
# Convert a collection of raw documents to a matrix of TF-IDF features in one
# step. The vectorizer gets its own name so the call below resolves and the
# fitted CountVectorizer held in `vect` is not replaced.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# ngrams
# nltk.ngrams yields tuples of n consecutive tokens; both results are lazy
# generators, so wrap them in list() to inspect the tuples.
from nltk import ngrams

sentence = "this is a sentence"
tokens = sentence.split()
bigrams = ngrams(tokens, 2)
trigrams = ngrams(tokens, 3)

# word2vec
import gensim
from gensim.models import Word2Vec

# Word2Vec expects an iterable of token lists. `corpus` holds raw document
# strings (it is fed to CountVectorizer above); iterating a string yields
# characters, so passing it directly would train character embeddings.
tokenized_corpus = [document.split() for document in corpus]

# CBOW approach by default (sg=0)
cbow_model = Word2Vec(tokenized_corpus, min_count = 1,
                      vector_size = 100, window = 5, sg=0)

# skipgram (sg=1)
skipgram_model = Word2Vec(tokenized_corpus, min_count = 1,
                          vector_size = 100, window = 5, sg=1)

# `model` still ends up bound to the skip-gram model, as before; the CBOW
# model is no longer lost by being overwritten.
model = skipgram_model