# Feature Engineering

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import  StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import accuracy_score, classification_report

In [2]:
df = pd.read_csv("../Dataset/conversion_data.csv")
df.head(10)

Unnamed: 0,country,age,new_user,source,total_pages_visited,converted
0,UK,25,1,Ads,1,0
1,US,23,1,Seo,5,0
2,US,28,1,Seo,4,0
3,China,39,1,Seo,5,0
4,US,30,1,Seo,6,0
5,US,31,0,Seo,1,0
6,China,27,1,Seo,4,0
7,US,23,0,Ads,4,0
8,UK,29,0,Direct,4,0
9,US,25,0,Ads,2,0


In [5]:
# Separate target variable Y from features X
print("Separating labels from features...")
features_list = ["country", "age", "new_user", "source", "total_pages_visited"]
target_variable = "converted"

X = df.loc[:,features_list]
y = df.loc[:,target_variable]

print("...Done.")
print()

print('y : ')
print(y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

y : 
0    0
1    0
2    0
3    0
4    0
Name: converted, dtype: int64

X :
  country  age  new_user source  total_pages_visited
0      UK   25         1    Ads                    1
1      US   23         1    Seo                    5
2      US   28         1    Seo                    4
3   China   39         1    Seo                    5
4      US   30         1    Seo                    6


In [4]:
df.shape

(316200, 6)

In [6]:
# Search categorical features and numeric features

idx = 0
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
for i,t in X.dtypes.iteritems():
  if ('float' in str(t)) or ('int' in str(t)) :
    numeric_features.append(i)
    numeric_indices.append(idx)
  else :
    categorical_features.append(i)
    categorical_indices.append(idx)

  idx = idx + 1

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['age', 'new_user', 'total_pages_visited']  at positions  [1, 2, 4]
Found categorical features  ['country', 'source']  at positions  [0, 3]


In [7]:
# Divide dataset Train set & Test set 
print("Dividing into train and test sets...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=42, stratify=y)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



In [8]:
# Convert pandas DataFrames to numpy arrays before using scikit-learn
print("Convert pandas DataFrames to numpy arrays...")
X_train = X_train.values
X_test = X_test.values
y_train = y_train.tolist()
y_test = y_test.tolist()
print("...Done")

print(X_train[0:5,:])
print(X_test[0:2,:])
print()
print(y_train[0:5])
print(y_test[0:2])

Convert pandas DataFrames to numpy arrays...
...Done
[['US' 27 1 'Seo' 19]
 ['US' 34 0 'Seo' 1]
 ['China' 23 1 'Seo' 5]
 ['China' 42 0 'Seo' 7]
 ['UK' 24 1 'Direct' 4]]
[['China' 33 1 'Ads' 3]
 ['US' 31 1 'Direct' 4]]

[1, 0, 0, 0, 0]
[0, 0]


# Encoding & Normalizing

In [9]:
# Encoding categorical features and standardizing numerical features
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_train[0:5,:])
print()
print(X_test[0:5,:])

# Normalization
numeric_transformer = StandardScaler()

# OneHotEncoder
categorical_transformer = OneHotEncoder(drop='first')

featureencoder = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_indices),
        ('cat', categorical_transformer, categorical_indices)        
        ]
    )

X_train = featureencoder.fit_transform(X_train)
X_test = featureencoder.transform(X_test)
print("...Done")
print(X_train[0:5,:])
print(X_test[0:5])

Encoding categorical features and standardizing numerical features...

[['US' 27 1 'Seo' 19]
 ['US' 34 0 'Seo' 1]
 ['China' 23 1 'Seo' 5]
 ['China' 42 0 'Seo' 7]
 ['UK' 24 1 'Direct' 4]]

[['China' 33 1 'Ads' 3]
 ['US' 31 1 'Direct' 4]
 ['US' 42 1 'Ads' 2]
 ['China' 40 0 'Seo' 4]
 ['US' 25 1 'Seo' 10]]
...Done
[[-0.43186335  0.67712916  4.22843969  0.          0.          1.
   0.          1.        ]
 [ 0.41465401 -1.47682312 -1.16004158  0.          0.          1.
   0.          1.        ]
 [-0.91558756  0.67712916  0.03739871  0.          0.          0.
   0.          1.        ]
 [ 1.38210242 -1.47682312  0.63611885  0.          0.          0.
   0.          1.        ]
 [-0.79465651  0.67712916 -0.26196136  0.          1.          0.
   1.          0.        ]]
[[ 0.29372296  0.67712916 -0.56132143  0.          0.          0.
   0.          0.        ]
 [ 0.05186085  0.67712916 -0.26196136  0.          0.          1.
   1.          0.        ]
 [ 1.38210242  0.67712916 -0.8606815

In [11]:
print('Encoding labels.....')
print(y_train[0:5])
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
print('....Done')
print(y_train[0:5])

Encoding labels.....
[1, 0, 0, 0, 0]
....Done
[1 0 0 0 0]


# Training model

In [12]:
# Train model
print("Train model...")
classifier = RandomForestClassifier()

params = {
    'n_estimators': [10, 50, 100, 150, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4, 6]
}

best_forest = GridSearchCV(classifier, params, cv=10, n_jobs=-1, verbose=2)
best_forest.fit(X_train, y_train)
print("...Done.")

Train model...
Fitting 10 folds for each of 120 candidates, totalling 1200 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  4.8min


KeyboardInterrupt: 