# Feature Engineering with Model Parameter Tunning  

## Setup 

In [1]:
import pandas as pd
from pandas import Series,DataFrame
import numpy as np

from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

## Generate a simple plot of the test and training learning curve

In [3]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt

## Load titanic `train.cvs` and `test.csv` files

In [4]:
X_train = pd.read_csv("./train.csv")
y_train = X_train.pop("Survived")

X_test_orig = pd.read_csv("./test.csv")

## Viewing the first 5 lines of the data with `df.head()` 

In [5]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Feature engineering

In [7]:
data = X_train.append(X_test_orig, ignore_index=True)

### Pandas Data Selection with `data.iloc`

In [8]:
X_train = data.iloc[:891]
X_test = data.iloc[891:]

In [9]:
data['Title'] = data.Name.apply(lambda name: name.split(',')[1].split('.')[0].strip())
data.Title.value_counts()

Mr              757
Miss            260
Mrs             197
Master           61
Rev               8
Dr                8
Col               4
Ms                2
Mlle              2
Major             2
Jonkheer          1
Don               1
Mme               1
Capt              1
Sir               1
the Countess      1
Lady              1
Dona              1
Name: Title, dtype: int64

## Create `Beings`

In [10]:
normalized_titles = {
    "Capt":       "Officer",
    "Col":        "Officer",
    "Major":      "Officer",
    "Jonkheer":   "Royalty",
    "Don":        "Royalty",
    "Sir" :       "Royalty",
    "Dr":         "Officer",
    "Rev":        "Officer",
    "the Countess":"Royalty",
    "Dona":       "Royalty",
    "Mme":        "Mrs",
    "Mlle":       "Miss",
    "Ms":         "Mrs",
    "Mr" :        "Mr",
    "Mrs" :       "Mrs",
    "Miss" :      "Miss",
    "Master" :    "Master",
    "Lady" :      "Royalty"
}
data.Title = data.Title.map(normalized_titles)
data.Title.value_counts()

Mr         757
Miss       262
Mrs        200
Master      61
Officer     23
Royalty      6
Name: Title, dtype: int64

## count of missing values with `df.isnull().sum()`

In [11]:
data.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Title             0
dtype: int64

## group by Sex, Pclass, and Title with `df.groupby`

In [12]:
grouped = data.groupby(['Sex','Pclass', 'Title'])

## get the median of the label Age for the grouped data with `median()`

In [13]:
grouped.Age.median()

Sex     Pclass  Title  
female  1       Miss       30.0
                Mrs        45.0
                Officer    49.0
                Royalty    39.0
        2       Miss       20.0
                Mrs        30.0
        3       Miss       18.0
                Mrs        31.0
male    1       Master      6.0
                Mr         41.5
                Officer    52.0
                Royalty    40.0
        2       Master      2.0
                Mr         30.0
                Officer    41.5
        3       Master      6.0
                Mr         26.0
Name: Age, dtype: float64

In [14]:
data.Age = grouped.Age.apply(lambda x: x.fillna(x.median()))
data.isnull().sum()

PassengerId       0
Pclass            0
Name              0
Sex               0
Age               0
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Title             0
dtype: int64

## Create a new bin named "CatAge" with `pd.qcut`

In [15]:
data['CatAge'] = pd.qcut(data.Age, q=4, labels=False )

In [16]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,CatAge
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,3
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,2


## Add a new feature named "Family_Size" 

In [17]:
data['Family_Size'] = data['Parch'] + data['SibSp'] + 1

In [18]:
data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title,CatAge,Family_Size
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr,1,2
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs,3,2
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss,1,1
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs,2,2
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr,2,1


## Drop columns won't be useful in analysis and prediction with `data.drop`

In [19]:
data = data.drop(['PassengerId','Name','Ticket', "Cabin"], axis=1)

In [20]:
data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title,CatAge,Family_Size
0,3,male,22.0,1,0,7.25,S,Mr,1,2
1,1,female,38.0,1,0,71.2833,C,Mrs,3,2
2,3,female,26.0,0,0,7.925,S,Miss,1,1
3,1,female,35.0,1,0,53.1,S,Mrs,2,2
4,3,male,35.0,0,0,8.05,S,Mr,2,1


## apply 1-hot encoding to categorical features `Sex`, `Embarked` and `Title`

In [21]:
data.dtypes

Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
Title           object
CatAge           int64
Family_Size      int64
dtype: object

## Convert categorical variable into dummy/indicator variables with `pd.get_dummies`

In [22]:
data = pd.get_dummies(data, columns=['Sex'], prefix = ['Sex'])
data = pd.get_dummies(data, columns=['Embarked'], prefix = ['Embarked'])
data = pd.get_dummies(data, columns=['Title'], prefix = ['Title'])

In [23]:
data.dtypes

Pclass             int64
Age              float64
SibSp              int64
Parch              int64
Fare             float64
CatAge             int64
Family_Size        int64
Sex_female         uint8
Sex_male           uint8
Embarked_C         uint8
Embarked_Q         uint8
Embarked_S         uint8
Title_Master       uint8
Title_Miss         uint8
Title_Mr           uint8
Title_Mrs          uint8
Title_Officer      uint8
Title_Royalty      uint8
dtype: object

In [24]:
data.isnull().sum()

Pclass           0
Age              0
SibSp            0
Parch            0
Fare             1
CatAge           0
Family_Size      0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Title_Master     0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
Title_Royalty    0
dtype: int64

## Fill NA/NaN values using the data.mean( ) with `pd.fillna`

In [26]:
data["Age"].fillna(data.Age.mean(), inplace=True)
data.isnull().sum()

Pclass           0
Age              0
SibSp            0
Parch            0
Fare             1
CatAge           0
Family_Size      0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Title_Master     0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
Title_Royalty    0
dtype: int64

In [27]:
data["Fare"].fillna(data.Fare.mean(), inplace=True)
data.isnull().sum()

Pclass           0
Age              0
SibSp            0
Parch            0
Fare             0
CatAge           0
Family_Size      0
Sex_female       0
Sex_male         0
Embarked_C       0
Embarked_Q       0
Embarked_S       0
Title_Master     0
Title_Miss       0
Title_Mr         0
Title_Mrs        0
Title_Officer    0
Title_Royalty    0
dtype: int64

In [28]:
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [29]:
X_train = data.iloc[:891]
X_test = data.iloc[891:]

## Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier

### Withhold a small amount of data for testing data from the dataset

In [31]:
X_train_small = data.iloc[:800]
X_test_small = data.iloc[800:891]
y_train_small = y_train.iloc[:800]
y_test_small = y_train.iloc[800:891]

In [32]:
dt1 = DecisionTreeClassifier(random_state=42)
dt1.fit(X_train_small, y_train_small)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

In [33]:
from sklearn.metrics import roc_curve, auc
y_pred_small = dt1.predict(X_test_small)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_small, y_pred_small)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

0.7858617131062952

## Gid Search CV to to Fine Tune the Hyper Parameters

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV

clf = RandomForestClassifier()

parameters = {'n_estimators': [4, 6, 9], 
              'max_features': ['log2', 'sqrt','auto'], 
              'criterion': ['entropy', 'gini'],
              'max_depth': [2, 3, 5, 10], 
              'min_samples_split': [2, 3, 5],
              'min_samples_leaf': [1,5,8]
             }


acc_scorer = make_scorer(accuracy_score)

grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer)
grid_obj = grid_obj.fit(X_train, y_train)

clf = grid_obj.best_estimator_

clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [35]:
predictions = clf.predict(X_test)
y_pred = clf.predict(X_test)

In [36]:
clf_scores = cross_val_score(clf, X_train, y_train, cv= 8)
clf_scores.mean()

0.8226886992511993

In [37]:
cross_val_score(clf, X_train, y_train, cv=3, scoring="accuracy")

array([0.82154882, 0.84848485, 0.83838384])

## Hyper-parameters

In [38]:
dt_params = [
   { 'max_depth': [1, 2, 4, 8, 16, 64], 
     'min_samples_leaf' : [1, 2, 3, 4, 5, 6],
   },
]

In [39]:
dt_cv = GridSearchCV(estimator = clf, param_grid=dt_params, cv=4)
dt_cv.fit(X_train, y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=4, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'max_depth': [1, 2, 4, 8, 16, 64], 'min_samples_leaf': [1, 2, 3, 4, 5, 6]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
cross_val_score(dt_cv.best_estimator_, X_train, y_train, cv=3, scoring= "accuracy")

array([0.8013468 , 0.82491582, 0.81818182])

## Random Forest Classifier 

In [42]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(random_state=42)

In [43]:
forest_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

## Determine the importance of each feature

In [45]:
feat_labels = data.columns
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
for f in range(15):
    print("%2d) %-*s %f" % (f + 1, 20, feat_labels[indices[f]], importances[indices[f]]))

 1) Sex_male             0.157813
 2) Fare                 0.147661
 3) Title_Mr             0.141127
 4) Pclass               0.124686
 5) Age                  0.115360
 6) Sex_female           0.095939
 7) Family_Size          0.057634
 8) Title_Miss           0.044031
 9) SibSp                0.034840
10) CatAge               0.022750
11) Embarked_S           0.015741
12) Embarked_C           0.015378
13) Embarked_Q           0.013113
14) Parch                0.009462
15) Title_Master         0.003248


## Adaboost Classifier

In [48]:
from sklearn.ensemble import AdaBoostClassifier

In [49]:
bdt_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=10, min_samples_leaf=1),
                             algorithm= "SAMME",
                             learning_rate=1,
                             n_estimators= 50,
                             random_state=0)

In [58]:
bdt_clf.fit(X_train, y_train)
y_pred = bdt_clf.predict(X_test)

In [59]:
cross_val_score(bdt_clf, X_train, y_train, cv=3, scoring= "accuracy")

array([0.77104377, 0.81144781, 0.78787879])