In [1]:
import pandas as pd
import numpy  as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
df.shape

(891, 12)

In [5]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


# Split categorical and non catagorical

In [6]:
def split_dataset_(df):
    non_categorical_ = df.loc[:,df.dtypes != object]
    categorical_ = df.loc[:,df.dtypes == object]
    return non_categorical_,categorical_

In [7]:
split_dataset = split_dataset_(df)
non_categorical_ = split_dataset[0]
categorical_     = split_dataset[1]

In [8]:
categorical_

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,A/5 21171,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,"Allen, Mr. William Henry",male,373450,,S
...,...,...,...,...,...
886,"Montvila, Rev. Juozas",male,211536,,S
887,"Graham, Miss. Margaret Edith",female,112053,B42,S
888,"Johnston, Miss. Catherine Helen ""Carrie""",female,W./C. 6607,,S
889,"Behr, Mr. Karl Howell",male,111369,C148,C


In [9]:
def impute_nan(df, features):
    most_frequent_value_ = df[features].value_counts().sort_values(ascending = False).index[0]
    df[features].fillna(most_frequent_value_,inplace = True)

In [10]:
def categorical_features(df):
    index_ = df.loc[:,df.isnull().sum()>0].columns
    return index_
def non_categorical_features(df):
    index_ = df.loc[:,df.isnull().sum()>0].columns
    return index_

In [11]:
categorical_features     = categorical_features(categorical_)
non_categorical_features = non_categorical_features(non_categorical_)
for features in categorical_features:
    impute_nan(categorical_,features)
for features in non_categorical_features:
    impute_nan(non_categorical_,features)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [12]:
categorical_features

Index(['Cabin', 'Embarked'], dtype='object')

In [13]:
print("Catagorical has missing value: ",categorical_.isnull().sum().sum())
print("Non Catagorical has missing value: ",non_categorical_.isnull().sum().sum())

Catagorical has missing value:  0
Non Catagorical has missing value:  0


In [14]:
categorical_features

Index(['Cabin', 'Embarked'], dtype='object')

In [15]:
df = pd.concat([categorical_,non_categorical_],axis = 1)

In [16]:
df.head(5)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
0,"Braund, Mr. Owen Harris",male,A/5 21171,C23 C25 C27,S,1,0,3,22.0,1,0,7.25
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,2,1,1,38.0,1,0,71.2833
2,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,C23 C25 C27,S,3,1,3,26.0,0,0,7.925
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,4,1,1,35.0,1,0,53.1
4,"Allen, Mr. William Henry",male,373450,C23 C25 C27,S,5,0,3,35.0,0,0,8.05


In [17]:
print("data has missing value: ",df.isnull().sum().sum())

data has missing value:  0


# Using One Hot Encoding

In [18]:
df = pd.get_dummies(df, drop_first= True)

In [19]:
df.shape

(891, 1726)

In [20]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,26.0,0,0,7.925,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
df.drop('PassengerId', axis = 1, inplace = True)

In [22]:
X = df.iloc[:,2:].values
y = df.iloc[:,0].values

In [23]:
y

array([0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,

In [24]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 0)

In [25]:
adb = RandomForestClassifier()
adb.fit(x_train,y_train)
adb.score(x_test,y_test)

0.8379888268156425

In [27]:
df.drop('Survived',axis =1, inplace=True)

In [28]:
df

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)","Name_Abelson, Mr. Samuel","Name_Abelson, Mrs. Samuel (Hannah Wizosky)","Name_Adahl, Mr. Mauritz Nils Martin",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.2500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,26.0,0,0,7.9250,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,35.0,1,0,53.1000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3,35.0,0,0,8.0500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.0,0,0,13.0000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
887,1,19.0,0,0,30.0000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
888,3,24.0,1,2,23.4500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
889,1,26.0,0,0,30.0000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [30]:
X = df.iloc[:].values

In [32]:
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X,y)

ExtraTreesClassifier()

In [68]:
order_rank = pd.DataFrame(model.feature_importances_,columns=['Score'])

In [69]:
order_rank

Unnamed: 0,Score
0,0.042482
1,0.021693
2,0.012034
3,0.011788
4,0.020287
...,...
1719,0.001018
1720,0.000914
1721,0.000249
1722,0.002899


In [77]:
features_ = pd.DataFrame(df.columns)

In [83]:
rank_ = pd.concat([features_,order_rank],axis = 1)
rank_

Unnamed: 0,0,Score
0,Pclass,0.042482
1,Age,0.021693
2,SibSp,0.012034
3,Parch,0.011788
4,Fare,0.020287
...,...,...
1719,Cabin_F4,0.001018
1720,Cabin_G6,0.000914
1721,Cabin_T,0.000249
1722,Embarked_Q,0.002899


In [97]:
index_ = rank_.sort_values('Score',ascending=False).head(30).values

In [100]:
data = pd.DataFrame(index_)

In [103]:
important_features = data[0].values

In [105]:
important_features

array(['Sex_male', 'Pclass', 'Cabin_C23 C25 C27', 'Age', 'Fare', 'SibSp',
       'Parch', 'Embarked_S', 'Ticket_1601', 'Ticket_347082',
       'Ticket_2666', 'Ticket_29106', 'Ticket_2661', 'Ticket_C.A. 37671',
       'Ticket_347742', 'Name_Tornquist, Mr. William Henry', 'Embarked_Q',
       'Ticket_2908', 'Ticket_113760', 'Ticket_347088',
       'Name_Albimona, Mr. Nassef Cassem', 'Cabin_C52', 'Cabin_E25',
       'Ticket_2651', 'Cabin_E24', 'Ticket_2653',
       'Name_Sheerlinck, Mr. Jan Baptist', 'Name_Hedman, Mr. Oskar Arvid',
       'Name_Moss, Mr. Albert Johan', 'Ticket_367226'], dtype=object)

In [116]:
df = df.loc[:,['Sex_male', 'Pclass', 'Cabin_C23 C25 C27', 'Age', 'Fare', 'SibSp',
       'Parch', 'Embarked_S', 'Ticket_1601', 'Ticket_347082',
       'Ticket_2666', 'Ticket_29106', 'Ticket_2661', 'Ticket_C.A. 37671',
       'Ticket_347742', 'Name_Tornquist, Mr. William Henry', 'Embarked_Q',
       'Ticket_2908', 'Ticket_113760', 'Ticket_347088',
       'Name_Albimona, Mr. Nassef Cassem', 'Cabin_C52', 'Cabin_E25',
       'Ticket_2651', 'Cabin_E24', 'Ticket_2653',
       'Name_Sheerlinck, Mr. Jan Baptist', 'Name_Hedman, Mr. Oskar Arvid',
       'Name_Moss, Mr. Albert Johan', 'Ticket_367226']]

In [None]:
X = df.

# Using Label Encoding

In [None]:
data = pd.concat([categorical_,non_categorical_],axis = 1)

In [None]:
labelEncoder_ = LabelEncoder()
data = data.apply(LabelEncoder().fit_transform)

In [None]:
X = data.iloc[:,1:].values
y = data.loc[:,'Survived'].values

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.30, random_state = 0)

In [None]:
dt = DecisionTreeClassifier()
dt.fit(x_train,y_train)
dt.score(x_test,y_test)

# Using Frequency Encoding

In [None]:
data = pd.concat([categorical_,non_categorical_],axis = 1)
data1 = data

In [None]:
data.isnull().sum()

In [None]:
categorical_

In [None]:
categorical_features

In [None]:
#encoder_dict_ = {}
#for var in categorical_features:
#    encoder_dict_[var] = (data[categorical_features].value_counts()).to_dict()

In [None]:
data_= (data['Cabin'].value_counts()).to_dict()
data__= (data['Embarked'].value_counts()).to_dict()
data1__ = (data['Sex'].value_counts()).to_dict()
data2__ = (data['Ticket'].value_counts()).to_dict()

In [None]:
data['Cabin'] = data['Cabin'].map(data_)
data['Embarked'] = data['Embarked'].map(data__)
data['Sex'] = data['Sex'].map(data1__)
data['Ticket'] = data['Ticket'].map(data2__)

In [None]:
data1 = data.head(5)

In [None]:
data1

In [None]:
target = data1['Survived']
data1.drop('Survived',axis = 1, inplace= True)
data1.drop('Name',axis = 1, inplace= True)

In [None]:
target = data1['Survived']
data1.drop('Survived',axis = 1, inplace= True)
data1.drop('Name',axis = 1, inplace= True)

In [None]:
data1

In [None]:
X = data1.iloc[:]

In [None]:
y = target

In [None]:
y

In [None]:
from feature_engine.encoding import WoEEncoder, RareLabelEncoder
rare_encoder = RareLabelEncoder(tol=0.03, n_categories=2, variables=['cabin', 'pclass', 'embarked'])
X = rare_encoder.fit_transform(X)

In [None]:
data

In [None]:
Fare  = data['Fare']
data['Survived'] = data['Fare']
data['Fare'] = df['Survived']

In [None]:
data

In [None]:
X = data.iloc[:,:-1].values
y = data.iloc[:,-1].values

In [None]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 0)

In [None]:
dt = AdaBoostClassifier()
dt.fit(x_train,y_train)
dt.score(x_test,y_test)