<div id="reminder" style="border-radius: 5px; background-color:#f5f5f5; padding: 15px 5px; " >
<p>Use this notebook to follow along with the lab tutorial.</p>
</div>

# <font color="blue">Lesson 4 Feature Engineering and Selection</font>

## Section 1. Handling Categorical Variables

### Read Data from CSV File

In [None]:
import pandas as pd
# Remote location of the Tennis example dataset (CSV).
tennis_csv_url = "https://library.startlearninglabs.uw.edu/DATASCI420/Datasets/Tennis.csv"
# The first row of the file (row 0) supplies the column names.
data = pd.read_csv(tennis_csv_url, header=0)
# Preview the first rows as the cell output.
data.head()

### One Hot Encoding

In [None]:
import category_encoders as ce
# Categorical predictors to be one-hot encoded.
X = data[["outlook","temp","humidity","windy"]]

# category_encoders 2.x removed the `impute_missing` argument and replaced the
# old handle_unknown choices with 'error' / 'return_nan' / 'value' / 'indicator',
# so the 1.x-style call raises TypeError on current releases.
# Try the 2.x API first ('value' encodes unseen/missing levels as all-zero dummy
# columns without adding an extra column) and fall back to the 1.x arguments.
try:
    le = ce.OneHotEncoder(return_df=False, handle_missing="value", handle_unknown="value")
except TypeError:
    # category_encoders 1.x: `handle_missing` is not a known keyword there.
    le = ce.OneHotEncoder(return_df=False, impute_missing=False, handle_unknown="ignore")

# return_df=False makes fit_transform return a NumPy array rather than a DataFrame.
X_encoded = le.fit_transform(X)
# Show the first five encoded rows.
X_encoded[0:5,:]

In [None]:
# Display the category mapping stored on the fitted encoder `le`
# (as the last expression of the cell, its value is shown as the cell output).
le.category_mapping

### One-hot Encoding by Pandas

In [None]:
import pandas as pd
# One-hot encode the categorical predictors with pandas.
# The same list is used for `columns` and `prefix`, so every dummy column is
# named "<original column>_<level>" and the names cannot drift apart
# (the previous hand-typed prefix list misspelled "humidity" as "humdity").
categorical_columns = ["outlook","temp","humidity","windy"]
data_encoded = pd.get_dummies(data, columns=categorical_columns, prefix=categorical_columns)
data_encoded.head()

### References for More Complete List of One-hot Encoding Methodologies

Moffitt, C. (2017) <a href="http://pbpython.com/categorical-encoding.html">Guide to Encoding Categorical Values in Python</a>, Practical Business Python.

### Risk Values of Categorical Variables

In [None]:
import numpy as np
# Overall proportion of "yes" outcomes in the target column `play`.
# It doubles as the smoothing term in the risk formula below.
General_Prob = (data["play"] == "yes").mean()

# variable_risks[variable][level] -> smoothed log-odds of play == "yes" for that level.
variable_risks = {}
for variable in ['outlook', 'temp', 'humidity', 'windy']:
    # Contingency table: rows are the levels of `variable`, columns are the `play` outcomes.
    tab = pd.crosstab(data[variable], data["play"])
    print(tab)
    # Smoothed log-odds per level:
    #   log((#yes + p) / (#no + 1 - p)),  p = General_Prob.
    # The counts are looked up by column label, so the result does not depend
    # on the order of the table's columns.
    log_odds = np.log((tab["yes"] + General_Prob) / (tab["no"] + 1 - General_Prob))
    # Boolean levels are stored under the string keys 'True' / 'False';
    # every other level is used as the key unchanged.
    variable_risks[variable] = {
        (str(lev) if isinstance(lev, (bool, np.bool_)) else lev): risk
        for lev, risk in log_odds.items()
    }
print(variable_risks)

### Replace the Original Categorical Values with Risks

In [None]:
import numpy as np

# Work on a copy so the original categorical frame `data` stays untouched.
data_risks = data.copy()
for variable in ['outlook', 'temp', 'humidity', 'windy']:
    risk_by_level = variable_risks[variable]
    # Replace the whole column in one assignment instead of the chained
    # `data_risks[variable][i] = ...` writes, which raise SettingWithCopyWarning
    # and silently do nothing when pandas copy-on-write is enabled.
    # Boolean values are looked up under the string keys 'True' / 'False',
    # matching how `variable_risks` stores them; an unknown level raises KeyError.
    data_risks[variable] = data[variable].map(
        lambda value: risk_by_level[
            str(value) if isinstance(value, (bool, np.bool_)) else value
        ]
    )
data_risks.head()

## Section 2. Recency, Frequency, and Monetary (RFM)

### Read Example Data

In [None]:
import pandas as pd
# Remote location of the retail churn transactions (comma-separated CSV).
retail_csv_url = "https://library.startlearninglabs.uw.edu/DATASCI420/Datasets/Retail_Churn_Data.csv"
# The first row of the file (row 0) supplies the column names.
data = pd.read_csv(retail_csv_url, sep=",", header=0)
# Preview the first rows as the cell output.
data.head()

### Convert Timestamp to Datetime Object

In [None]:
import datetime as dt
import pandas as pd
import numpy as np
from collections import OrderedDict

# Parse the raw "month/day/year hour:minute" strings into datetime values.
data["Timestamp"] = pd.to_datetime(data["Timestamp"], format='%m/%d/%Y %H:%M')

# Report the time span covered by the data. Series.min()/max() are used
# instead of the builtins, which iterate the column element by element.
earliest, latest = data["Timestamp"].min(), data["Timestamp"].max()
print("Minimal Date=%s, Maximal Date=%s"%(earliest.strftime("%Y-%m-%d %H:%M"),
                                          latest.strftime("%Y-%m-%d %H:%M")))



### Calculate RFM Features for Users at Each Checkpoint

In [None]:
# Daily checkpoints run from Start_Date_Obj to End_Date_Obj (both inclusive).
Start_Date_Obj = dt.datetime.strptime("1/1/2001", "%m/%d/%Y")
End_Date_Obj = dt.datetime.strptime("1/10/2001", "%m/%d/%Y")
Time_Window = 60  # days; only customers with activity in the 60 days before a checkpoint are considered
FM_Window = 7     # days; look-back window for the frequency and monetary features

# One list per output column; each (checkpoint, user) pair appends one entry to every list.
RFM_Dict = OrderedDict(
    (column, [])
    for column in ('UserID', 'Checkpoint', 'Recency', 'Frequency', 'Value', 'Quantity')
)

n_checkpoints = (End_Date_Obj - Start_Date_Obj).days + 1
for day_offset in range(n_checkpoints):
    check_point_date = Start_Date_Obj + dt.timedelta(days=day_offset)

    # Transactions in the half-open window [checkpoint - Time_Window days, checkpoint).
    activity_window_start = check_point_date - dt.timedelta(days=Time_Window)
    in_activity_window = (data["Timestamp"] < check_point_date) & (data["Timestamp"] >= activity_window_start)
    recent_activity = data.loc[in_activity_window]

    # Users with at least one transaction inside that window.
    active_users = list(set(recent_activity["UserId"]))
    print("There are %d unique users."%(len(active_users)))

    fm_cutoff = check_point_date - dt.timedelta(days=FM_Window)
    for user in active_users:
        user_rows = recent_activity.loc[recent_activity["UserId"] == user]

        # Recency: whole days between the checkpoint and the user's latest transaction.
        days_since_last = (check_point_date - user_rows["Timestamp"].max()).days

        # Frequency / monetary features use only the last FM_Window days before the checkpoint.
        fm_rows = user_rows.loc[user_rows["Timestamp"] >= fm_cutoff]

        RFM_Dict['UserID'].append(user)
        RFM_Dict['Checkpoint'].append(check_point_date)
        RFM_Dict['Recency'].append(days_since_last)
        RFM_Dict['Frequency'].append(fm_rows.shape[0])
        # Columns are addressed by position: 8 is summed as the monetary value,
        # 7 as the monetary quantity.
        RFM_Dict['Value'].append(fm_rows.iloc[:, 8].sum())
        RFM_Dict['Quantity'].append(fm_rows.iloc[:, 7].sum())

# Consolidate all columns into a single data frame.
RFM_Frame = pd.DataFrame.from_dict(RFM_Dict)
RFM_Frame.head()


## Section 3. Filter-based Selection

### Mutual Information

In [None]:
import numpy as np

# 401 evenly spaced points on [-1, 3] and the noise-free quadratic Y = (X - 1)^2.
X = np.linspace(start = -1, stop = 3, num=401, endpoint=True)
Y = X**2 - 2*X + 1
# Spread of the clean signal, printed before any noise is added.
print("Standard Deviation of Y=%.2f"%np.std(Y))

# Add zero-mean Gaussian noise. The second argument of `normal` is the
# standard deviation (not the variance), hence the name `noise_std`.
# A fixed seed makes the cell give the same result on every run.
NOISE_SEED = 0
noise_std = 0.1
rng = np.random.RandomState(NOISE_SEED)
noise = rng.normal(0, noise_std, len(X))
Y += noise
#Y = noise

import matplotlib.pyplot as plt

# Plot Y against X on an explicit Axes and label it so the figure can be
# understood on its own when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(X, Y)
ax.set(xlabel="X", ylabel="Y", title="Y versus X")
plt.show()

# Calculate the correlation: np.corrcoef returns the 2x2 correlation matrix
# of (X, Y); its off-diagonal entry is the correlation between X and Y.
corr_matrix = np.corrcoef(X, Y)
corr = corr_matrix[0, 1]
print("Correlation between X and Y is %.2f"%corr)
# Calculate Mutual Information

from sklearn.metrics import mutual_info_score

def calc_MI(x, y, bins):
    """Estimate the mutual information between two samples by binning.

    Parameters
    ----------
    x, y : array-like
        Paired observations; passed straight to ``np.histogram2d``.
    bins : int or sequence
        Bin specification forwarded to ``np.histogram2d``.

    Returns
    -------
    float
        ``mutual_info_score`` evaluated on the 2-D histogram of counts,
        which is used as the contingency table.
    """
    # histogram2d returns (counts, x_edges, y_edges); only the counts are needed.
    contingency, _x_edges, _y_edges = np.histogram2d(x, y, bins)
    return mutual_info_score(None, None, contingency=contingency)

# Mutual information between X and Y, estimated on a histogram with 20 bins per axis.
mi = calc_MI(X, Y, bins=20)
print("Mutual information=%.2f"%mi)

## Section 4. Stepwise and Embedded Methods

### Stepwise Model Selection

#### Backward model selection

In [None]:
# Recursive Feature Elimination
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE #Recursive Feature Elimination
from sklearn.linear_model import LinearRegression
# Synthetic regression problem: 50 samples, 10 features, fixed seed for reproducibility.
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
print(X[0:10,:]) # print out the first 10 rows
estimator = LinearRegression()
# Keep 5 features; step=1 removes one feature per elimination round.
# `n_features_to_select` is passed by keyword: scikit-learn >= 1.0 no longer
# accepts it positionally, and the keyword form also works on older releases.
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
print(selector.support_) # Boolean mask of the selected features.
print(selector.ranking_) # Selected features have rank 1; among the others,
                         # a larger rank means the feature was eliminated earlier.

### Install Machine Learning Extensions
See <a href="http://rasbt.github.io/mlxtend/">mlxtend's documentation</a>

In [None]:
# Uncomment the following line to run
# !pip install mlxtend

#### Forward Feature Selection

In [None]:
# Forward Stepwise Feature Selection
from sklearn.linear_model import LinearRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs

# `load_boston` was removed in scikit-learn 1.2, where importing it raises
# ImportError. Use it when it is still available; otherwise rebuild the same
# feature matrix and target from the original data source, following the
# replacement snippet given in scikit-learn's removal notice.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
    X, y = boston.data, boston.target
except ImportError:
    import numpy as np
    import pandas as pd
    boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    # Each record spans two physical lines after a 22-line text header.
    boston_raw = pd.read_csv(boston_url, sep=r"\s+", skiprows=22, header=None)
    X = np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]])
    y = boston_raw.values[1::2, 2]

print(X[0:10, :])
lr = LinearRegression()

# Settings for sequential forward selection.
sfs_options = dict(
    k_features=13,                     # must be <= the number of features; when equal, selection
                                       # runs all the way from the intercept to the full model
    forward=True,                      # add features (forward) rather than remove them
    floating=False,
    scoring='neg_mean_squared_error',
    cv=10,
)
sfs = SFS(lr, **sfs_options)

# Fit the selector, then plot the cross-validated score against the number
# of selected features, with standard-error bars.
sfs = sfs.fit(X, y)
metric_by_subset_size = sfs.get_metric_dict()
fig = plot_sfs(metric_by_subset_size, kind='std_err')

plt.title('Sequential Forward Selection (w. StdErr)')
plt.grid()
plt.show()

### Get the Best Model

In [None]:
# Display the entry stored under key 8 of the selector's metric dictionary.
# NOTE(review): the key is presumably the number of selected features, and 8 is
# hard-coded here (likely read off the plot above) -- confirm if the data change.
sfs.get_metric_dict()[8]

### LASSO and Ridge Regression

In [None]:
# LASSO
from sklearn import linear_model

# Penalty strength: increasing alpha can shrink more variable coefficients to 0.
alpha = 0.5
clf = linear_model.Lasso(alpha=alpha)
clf.fit(X, y)

# Print the fitted coefficients, then the intercept.
for fitted_param in (clf.coef_, clf.intercept_):
    print(fitted_param)



In [None]:
# Ridge Regression
from sklearn import linear_model
import numpy as np

# Penalty strength for the ridge fit.
alpha = 10
clf = linear_model.Ridge(alpha=alpha)
clf.fit(X, y)

# Print the fitted coefficients, then the intercept.
for fitted_param in (clf.coef_, clf.intercept_):
    print(fitted_param)

# Increasing alpha can compress the L2 norm of the coefficients towards 0
# (but it does not select variables).
sum_of_squares = np.sum(clf.coef_**2)
print("Sum of square of coefficients = %.2f"%sum_of_squares)

<div id="reminder" style="border-radius: 5px; background-color:#f5f5f5; padding: 15px 5px; " >
<p>For additional practice, please see the Workshop notebooks.</p>
</div>