In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
# Load only the numeric columns used in this section, plus the Survived column.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Pclass', 'SibSp', 'Parch', 'Survived'],
)

In [10]:
# Percentage of missing values in each column.
# NOTE(review): the execution counts show this cell last ran after the dropna
# cell below, so the all-zero output reflects the already-cleaned frame; on a
# fresh top-to-bottom run the figures may differ — confirm.
df.isna().mean() *100

Survived    0.0
Pclass      0.0
Age         0.0
SibSp       0.0
Parch       0.0
dtype: float64

In [9]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [11]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch
0,0,3,22.0,1,0
1,1,1,38.0,1,0
2,1,3,26.0,0,0
3,1,1,35.0,1,0
4,0,3,35.0,0,0


In [12]:
# Select the features and the target by column name rather than by position,
# so the split does not depend on the column order of the CSV.
# drop() returns a new frame, so the columns added to X in later cells are not
# written into a slice of df (which can raise SettingWithCopyWarning).
X = df.drop(columns=['Survived'])
y = df['Survived']

In [14]:
# Baseline: mean accuracy of a default LogisticRegression over 20 CV folds.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    cv=20,
    scoring='accuracy',
).mean()

0.6933333333333332

## Applying Feature Construction 

In [15]:
# Family size is SibSp + Parch plus one (presumably counting the passenger
# themself — confirm against the dataset description).
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)

In [16]:
# Preview the features with the new Family_size column.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size
0,3,22.0,1,0,2
1,1,38.0,1,0,2
2,3,26.0,0,0,1
3,1,35.0,1,0,2
4,3,35.0,0,0,1


In [18]:
def my_family(num):
    """Bucket a family size into an ordinal family-type code.

    Parameters
    ----------
    num : number
        Family size to classify.

    Returns
    -------
    int
        0 when ``num`` equals 1 (alone), 1 when ``1 < num <= 4`` (small
        family), and 2 for every other value (large family). Note that any
        value that fails both tests, including sizes below 1, also lands in
        the last bucket.
    """
    if num == 1:
        return 0  # alone
    if 1 < num <= 4:
        return 1  # small family
    return 2  # large family

In [23]:
# Sanity check: a family size of 1 should map to 0 (alone).
my_family(1)

0

In [29]:
# Encode each family size as 0 / 1 / 2 using my_family, element by element.
X['Family_type'] = X['Family_size'].map(my_family)

In [30]:
# Preview the features with the new Family_type column.
# NOTE(review): the saved output for this cell shows Family_size values one
# lower than the preview taken right after Family_size was created, so it looks
# like it came from a different kernel state — re-run from a fresh kernel.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Family_size,Family_type
0,3,22.0,1,0,1,0
1,1,38.0,1,0,1,0
2,3,26.0,0,0,0,2
3,1,35.0,1,0,1,0
4,3,35.0,0,0,0,2


In [32]:
# Keep only Family_type; drop the columns it was derived from.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])

In [33]:
# Preview the final feature set used for the second model.
X.head()

Unnamed: 0,Pclass,Age,Family_type
0,3,22.0,0
1,1,38.0,0
2,3,26.0,2
3,1,35.0,0
4,3,35.0,2


In [34]:
# Same 20-fold accuracy estimate as the baseline, now on the engineered features.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    cv=20,
    scoring='accuracy',
).mean()

0.7312698412698414

## Feature Splitting

In [35]:
# Reload the CSV with all columns; this rebinds df, replacing the reduced frame
# loaded earlier in the notebook.
df = pd.read_csv('train.csv')

In [38]:
# Show five random rows; random_state is fixed so a re-run displays the same
# rows instead of a different sample each time.
df.sample(5, random_state=42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
193,194,1,2,"Navratil, Master. Michel M",male,3.0,1,1,230080,26.0,F2,S
224,225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38.0,1,0,19943,90.0,C93,S
257,258,1,1,"Cherry, Miss. Gladys",female,30.0,0,0,110152,86.5,B77,S
128,129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
174,175,0,1,"Smith, Mr. James Clinch",male,56.0,0,0,17764,30.6958,A7,C


In [39]:
# Inspect the raw Name strings before splitting them.
df['Name']

0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
                             ...                        
886                                Montvila, Rev. Juozas
887                         Graham, Miss. Margaret Edith
888             Johnston, Miss. Catherine Helen "Carrie"
889                                Behr, Mr. Karl Howell
890                                  Dooley, Mr. Patrick
Name: Name, Length: 891, dtype: object

In [47]:
# Extract the title from names shaped like "Surname, Title. Given names":
# take the text after the first comma, then the text before the first period.
# The piece after the comma starts with a space, so strip it; otherwise
# comparisons such as df['Title'] == 'Mrs' silently never match.
df['Title'] = (
    df['Name']
    .str.split(',', expand=True)[1]
    .str.split('.', expand=True)[0]
    .str.strip()
)

In [48]:
# Compare each extracted Title with the Name it came from.
df[['Title','Name']]

Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
...,...,...
886,Rev,"Montvila, Rev. Juozas"
887,Miss,"Graham, Miss. Margaret Edith"
888,Miss,"Johnston, Miss. Catherine Helen ""Carrie"""
889,Mr,"Behr, Mr. Karl Howell"


In [52]:
# Survival rate per title, highest first.
# Select Survived before aggregating: calling mean() on the whole grouped frame
# also tries to average the text columns (Name, Sex, Ticket, ...), which raises
# TypeError on pandas >= 2.0 and is wasted work on older versions.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)

Title
 the Countess    1.000000
 Mlle            1.000000
 Lady            1.000000
 Ms              1.000000
 Sir             1.000000
 Mme             1.000000
 Mrs             0.792000
 Miss            0.697802
 Master          0.575000
 Major           0.500000
 Col             0.500000
 Dr              0.428571
 Mr              0.156673
 Rev             0.000000
 Jonkheer        0.000000
 Don             0.000000
 Capt            0.000000
Name: Survived, dtype: float64

In [53]:
# Number of passengers per extracted title.
df['Title'].value_counts()

 Mr              517
 Miss            182
 Mrs             125
 Master           40
 Dr                7
 Rev               6
 Mlle              2
 Col               2
 Major             2
 Capt              1
 Lady              1
 Sir               1
 Ms                1
 Don               1
 Jonkheer          1
 the Countess      1
 Mme               1
Name: Title, dtype: int64

In [55]:
# Flag passengers whose title is 'Mrs' with 1, everyone else with 0.
# Built in a single vectorised assignment: the previous chained form
# df['is_Married'].loc[...] = 1 writes through an intermediate Series, which
# triggers SettingWithCopyWarning and has no effect under Copy-on-Write.
# The title is stripped before comparing because the extracted value can carry
# a leading space (' Mrs'), which would make the comparison never match.
df['is_Married'] = (df['Title'].str.strip() == 'Mrs').astype('int64')