# Feature Construction
***Feature construction involves transforming a given set of input features to generate a new set of more powerful features, which are then used for prediction. This may be done either to compress the dataset by reducing the number of features or to improve prediction performance.***

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load only the columns needed for the feature-construction demo.
# NOTE(review): the path is absolute and machine-specific; adjust it to
# wherever the CSV lives locally.
# `usecols` only filters columns: the resulting frame keeps the file's column
# order, not the order of this list.
df = pd.read_csv(
    '/home/saad/Downloads/tested.csv',
    usecols=['Age', 'Pclass', 'Parch', 'SibSp', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,34.5,0,0
1,1,3,47.0,1,0
2,0,2,62.0,0,0
3,0,3,27.0,0,0
4,1,3,22.0,1,1


In [3]:
df.shape

(418, 5)

In [4]:
# Impute missing ages with the median age.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['Age']`: that form is chained assignment,
# which may modify a temporary object rather than `df` and has no effect at
# all when pandas Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

In [5]:
# Preview five random rows; a fixed seed makes the preview reproducible
# across kernel restarts.
df.sample(5, random_state=42)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
308,0,1,55.0,1,1
329,0,2,21.0,0,0
237,0,3,20.0,0,0
361,1,2,24.0,1,1
36,1,3,27.0,0,0


In [6]:
# Split into feature matrix and target by column name rather than by position,
# so the split does not depend on the column order of the CSV file.
# `drop` returns a new DataFrame, so columns added to X later are written to an
# independent frame and not to a slice of `df`.
X = df.drop(columns=['Survived'])
y = df['Survived']

In [7]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch
0,3,34.5,0,0
1,3,47.0,1,0
2,2,62.0,0,0
3,3,27.0,0,0
4,3,22.0,1,1


In [8]:
# Baseline: mean accuracy of a default logistic regression over 20-fold
# cross-validation, using the features currently in X.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=20,
).mean()

0.643690476190476

### Applying Feature Construction

In [9]:
# Family size = siblings/spouses + parents/children + 1; the extra 1 counts
# the passenger themself.
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)

In [10]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,34.5,0,0,1
1,3,47.0,1,0,2
2,2,62.0,0,0,1
3,3,27.0,0,0,1
4,3,22.0,1,1,3


In [11]:
def myfunc(num):
    """Encode a family size as a family-type code.

    Returns:
        0 -- travelling alone (``num == 1``)
        1 -- small family (``1 < num <= 4``)
        2 -- large family (every other value)
    """
    if num == 1:
        return 0  # Alone
    if 1 < num <= 4:
        return 1  # Small family
    return 2  # Large family

In [12]:
myfunc(4)

1

In [13]:
# Encode each family size as a family-type code via myfunc
# (0 = alone, 1 = small family, 2 = large family).
X['Family_type'] = X['Family_size'].map(myfunc)

In [14]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,34.5,0,0,1,0
1,3,47.0,1,0,2,1
2,2,62.0,0,0,1,0
3,3,27.0,0,0,1,0
4,3,22.0,1,1,3,1


In [15]:
# Drop the raw columns now summarised by Family_type.
# The result is reassigned instead of using inplace=True, so the cell produces
# a new frame rather than mutating one that earlier cells displayed.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [16]:
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,34.5,0
1,3,47.0,1
2,2,62.0,0
3,3,27.0,0
4,3,22.0,1


In [17]:
# Same evaluation as the baseline (default logistic regression, 20-fold CV,
# mean accuracy), now on the feature set containing Family_type.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=20,
).mean()

0.6361904761904762

# Feature Splitting

In [18]:
# Reload the dataset with all columns; this rebinds `df`, replacing the
# five-column frame loaded earlier in the notebook.
# NOTE(review): the path is absolute and machine-specific; adjust it to
# wherever the CSV lives locally.
df = pd.read_csv('/home/saad/Downloads/tested.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [19]:
# Names look like "Surname, Title. Given names": take the text after the first
# ", " and before the following ".".
# Element-wise `.str[...]` indexing is used instead of expand=True + column
# lookup: it avoids building intermediate wide DataFrames and yields NaN
# (instead of raising KeyError) when a name lacks the expected separator.
df['Title'] = df['Name'].str.split(', ').str[1].str.split('.').str[0]

In [20]:
df[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Kelly, Mr. James"
1,Mrs,"Wilkes, Mrs. James (Ellen Needs)"
2,Mr,"Myles, Mr. Thomas Francis"
3,Mr,"Wirz, Mr. Albert"
4,Mrs,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)"
...,...,...
413,Mr,"Spector, Mr. Woolf"
414,Dona,"Oliva y Ocana, Dona. Fermina"
415,Mr,"Saether, Mr. Simon Sivertsen"
416,Mr,"Ware, Mr. Frederick"


In [24]:
# Survival rate per title, highest first.
# `Survived` is selected before aggregating: calling .mean() on the whole
# grouped frame would also try to average the string columns (Name, Sex,
# Ticket, ...), which raises TypeError on pandas >= 2.0.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Title
Dona      1.0
Miss      1.0
Mrs       1.0
Ms        1.0
Col       0.0
Dr        0.0
Master    0.0
Mr        0.0
Rev       0.0
Name: Survived, dtype: float64

In [25]:
# Flag passengers whose title is "Mrs" as married (1), everyone else as 0.
# A single vectorised assignment replaces `df['Is_Married'].loc[mask] = 1`,
# which is chained assignment: it may write to a temporary object and has no
# effect on `df` when pandas Copy-on-Write is enabled.
df['Is_Married'] = (df['Title'] == 'Mrs').astype('int64')

In [26]:
df['Is_Married']

0      0
1      1
2      0
3      0
4      1
      ..
413    0
414    0
415    0
416    0
417    0
Name: Is_Married, Length: 418, dtype: int64