## <font color="#CC3D3D">The Effects of Feature Engineering</font>

In [None]:
import warnings

# Silence DeprecationWarning messages for every cell that runs after this one.
warnings.simplefilter("ignore", DeprecationWarning)

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Pick the estimator that the later cells train and score through `model_perf`.
#   1 -> decision tree, 2 -> random forest, anything else -> gradient boosting.
# All three are seeded with random_state=0.
raw_choice = input("Model [1=DecisionTree, 2=RandomForest, other=GradientBoosting]: ")
try:
    choice = int(raw_choice)
except ValueError:
    # Blank or non-numeric input takes the same default branch as any other
    # unlisted number, instead of aborting the cell.
    print(f"Unrecognized choice {raw_choice!r}; using GradientBoostingClassifier.")
    choice = 0

if choice == 1:
    model = DecisionTreeClassifier(random_state=0)
elif choice == 2:
    model = RandomForestClassifier(random_state=0)
else:
    model = GradientBoostingClassifier(random_state=0)

In [None]:
def model_perf(X_train, X_test, y_train, y_test, estimator=None):
    """Fit an estimator on the training split and print its score on the test split.

    Parameters
    ----------
    X_train, y_train
        Features and labels passed to ``fit``.
    X_test, y_test
        Features and labels passed to ``score``.
    estimator : optional
        Object exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``score(X, y)``. When omitted, the notebook-level ``model`` is used,
        which is the original behaviour.

    Returns
    -------
    None
        The score is printed, not returned, so a cell that ends with this call
        shows the value exactly once.
    """
    if estimator is None:
        # Fall back to the estimator selected earlier in the notebook.
        estimator = model
    print(estimator.fit(X_train, y_train).score(X_test, y_test))
    
def get_data(ftrain, ftest, freal):
    """Read the three CSV files and split them into features and labels.

    Parameters
    ----------
    ftrain : path of the training CSV; read as cp949 and must contain the
        ``CUS_ID`` and ``GENDER`` columns.
    ftest : path of the test-feature CSV; read as cp949 and must contain
        ``CUS_ID``.
    freal : path of the test-label CSV; only its column at position 1 is
        loaded, and that column must be named ``GENDER``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the X values are the
        frames without the id/label columns and the y values are the
        ``GENDER`` columns.

    NOTE(review): test labels are paired with test rows purely by position;
    this presumably relies on both files listing customers in the same
    order - confirm.
    """
    train_frame = pd.read_csv(ftrain, encoding='cp949')
    test_frame = pd.read_csv(ftest, encoding='cp949')
    label_frame = pd.read_csv(freal, usecols=[1])

    features_train = train_frame.drop(columns=['CUS_ID', 'GENDER'])
    features_test = test_frame.drop(columns=['CUS_ID'])

    return features_train, features_test, train_frame['GENDER'], label_frame['GENDER']

### Modeling with poor features

In [None]:
# Baseline: load the "base" feature files and print the selected model's score
# on the test split.
X_train, X_test, y_train, y_test = get_data('train_base.csv', 'test_base.csv', 'test_y.csv')
model_perf(X_train, X_test, y_train, y_test)

### Modeling with good features

In [None]:
# Same model and same label file as the baseline run, but with the "6964"
# feature files, so the two printed scores can be compared directly.
X_train, X_test, y_train, y_test = get_data('train_6964.csv', 'test_6964.csv', 'test_y.csv')
model_perf(X_train, X_test, y_train, y_test)

### Feature Engineering

##### Univariate Non-linear transformations

In [None]:
# Reload the features so this cell does not depend on what an earlier cell
# left in X_train / X_test.
X_train, X_test, y_train, y_test = get_data('train_6964.csv', 'test_6964.csv', 'test_y.csv')

# Explicit axes (instead of implicit pyplot state) so each panel can be labelled.
fig, (ax_raw, ax_log) = plt.subplots(2, 1, figsize=(8, 6))

# Top panel: the `man` column as loaded.
X_train['man'].hist(bins=20, ax=ax_raw)
ax_raw.set_title('man (raw)')

# Bottom panel: the same column after log(x + 1).
X_train['man'].add(1).apply(np.log).hist(bins=20, ax=ax_log)
ax_log.set_title('log(man + 1)')

# Trailing ';' keeps the notebook from echoing the return value.
fig.tight_layout();

In [None]:
# Start from freshly loaded features. The transform below overwrites
# X_train / X_test, so without this reload a second run of the cell would
# apply the log twice.
X_train, X_test, y_train, y_test = get_data('train_6964.csv', 'test_6964.csv', 'test_y.csv')

# Element-wise log(x + 1) on every feature column of both splits.
# Other univariate transforms to try here: np.exp(), np.sin()
X_train = np.log(X_train + 1)
X_test = np.log(X_test + 1)

model_perf(X_train, X_test, y_train, y_test)

##### Interactions and Polynomials

In [None]:
from sklearn.preprocessing import PolynomialFeatures

X_train, X_test, y_train, y_test = get_data('train_6964.csv', 'test_6964.csv', 'test_y.csv')

# Degree-2 expansion with the bias column included. The expansion is fitted on
# the training features, then the same fitted transformer is applied to the
# test features.
poly = PolynomialFeatures(degree=2, include_bias=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)
# To list the generated terms: poly.get_feature_names_out()
# (poly.get_feature_names() on scikit-learn releases older than 1.0)

model_perf(X_train, X_test, y_train, y_test)

##### All together

In [None]:
X_train, X_test, y_train, y_test = get_data('train_6964.csv', 'test_6964.csv', 'test_y.csv')

# Step 1 - polynomial expansion (degree 2, bias column included), fitted on the
# training features and reused unchanged for the test features.
poly = PolynomialFeatures(degree=2, include_bias=True)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)

# Step 2 - element-wise log(x + 1) applied to the expanded features.
X_train = np.log(X_train + 1)
X_test = np.log(X_test + 1)

model_perf(X_train, X_test, y_train, y_test)

## <font color="#CC3D3D">End</font>