Important Libraries

In [1]:
import pandas as pd 
import numpy as np 
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

### Read the data (the file is an Excel workbook)

In [18]:
# Load the Titanic passenger sheet into a DataFrame.
# Current pandas `read_excel` does not accept an `encoding` keyword (passing it
# raises TypeError), so the argument is dropped.
data = pd.read_excel("/content/titanic.xls")
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Determining missing data and dealing with it

In [19]:
def define_Missing(df=None):
  """Summarise missing values per column.

  Parameters
  ----------
  df : pandas.DataFrame, optional
      Frame to inspect. When omitted, the notebook-level `data` frame is used,
      which is what the original zero-argument call did.

  Returns
  -------
  pandas.DataFrame
      One row per column of `df`, with the number of nulls in 'missing' and the
      share of rows that are null (percent, rounded to 2 decimals) in '%'.

  Also prints the shape of the inspected frame.
  """
  if df is None:
    # Resolved at call time so the summary reflects the current state of `data`.
    df = data
  print("The shape of Data before Deleting", df.shape)
  total = df.isnull().sum()
  percent = ((total / len(df.index)) * 100).round(2)
  return pd.concat([total, percent], axis=1, keys=['missing', '%'])
# Display only the first five rows of the summary (one row per column of `data`).
define_Missing().head()

The shape of Data before Deleting (1309, 14)


Unnamed: 0,missing,%
pclass,0,0.0
survived,0,0.0
name,0,0.0
sex,0,0.0
age,263,20.09


#### From the result above, the columns `boat` and `body` are more than 50% NaN, so they must be deleted. The next cell also drops `home.dest`, `cabin` and `ticket`.

In [20]:
# Remove the columns that are not used as features.
# `data` is re-bound instead of using inplace=True, and errors='ignore' skips
# columns that are already gone, so re-running this cell no longer raises KeyError.
data = data.drop(columns=['body', 'boat', 'home.dest', 'cabin', 'ticket'], errors='ignore')

#### `embarked` has two NaN values; to fill them, we first find its most frequent value

In [21]:
# Use the most frequent port as the fill value instead of computing
# value_counts(), discarding it, and hard-coding the answer.
most_common_port = data['embarked'].value_counts().idxmax()

# Assign the result back to the column: replace(..., inplace=True) called on
# data['embarked'] operates on a temporary under pandas copy-on-write and can
# leave `data` unchanged.
data['embarked'] = (
    data['embarked']
    .fillna(most_common_port)
    .replace({'S': 0, 'C': 1, 'Q': 2})
    # explicit cast: do not rely on replace() downcasting object -> int
    .astype(int)
)
data['sex'] = data['sex'].replace({'female': 1, 'male': 0}).astype(int)

In [22]:
# Preview the first rows of `data` at this point in the notebook.
data.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,fare,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",1,29.0,0,0,211.3375,0
1,1,1,"Allison, Master. Hudson Trevor",0,0.9167,1,2,151.55,0
2,1,0,"Allison, Miss. Helen Loraine",1,2.0,1,2,151.55,0
3,1,0,"Allison, Mr. Hudson Joshua Creighton",0,30.0,1,2,151.55,0
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0,1,2,151.55,0


#### Deal with missing Values in the age Column

In [24]:
# Impute missing ages with random integers drawn from [mean - std, mean + std).
age_avg = data['age'].mean()
age_std = data['age'].std()
age_missing = data['age'].isnull()
age_null_count = int(age_missing.sum())

# Seeded generator so Restart & Run All reproduces the same imputed ages
# (33 matches the random_state used for the models below).
age_rng = np.random.default_rng(33)
# Integer bounds are made explicit rather than passing floats to the sampler.
age_null_random_list = age_rng.integers(
    int(age_avg - age_std), int(age_avg + age_std), size=age_null_count
)

if age_null_count:
    # .loc writes into `data` itself; data['age'][mask] = ... is chained
    # assignment, which may write to a temporary copy instead.
    data.loc[age_missing, 'age'] = age_null_random_list
data['age'] = data['age'].astype(int)

In [25]:
# Cut age into five equal-width intervals and show the mean of `survived`
# (i.e. the survival rate) inside each interval.
data['bins_age'] = pd.cut(data['age'], bins=5)
data.groupby('bins_age')['survived'].mean().to_frame()

Unnamed: 0_level_0,survived
bins_age,Unnamed: 1_level_1
"(-0.08, 16.0]",0.533333
"(16.0, 32.0]",0.344928
"(32.0, 48.0]",0.38
"(48.0, 64.0]",0.443396
"(64.0, 80.0]",0.153846


In [26]:
# Missing fares take the median fare of the column.
fare_median = data['fare'].median()
data['fare'] = data['fare'].fillna(fare_median)

### Split `fare` into four bins using `pd.cut` (numerical to categorical)

In [27]:
# Discretise fare into four equal-width intervals, then remove the `name`
# column from the frame in place.
data['fare'] = pd.cut(data['fare'], bins=4)
del data['name']

In [28]:
from sklearn.preprocessing import LabelEncoder

# Turn the interval-valued columns into integer labels. One encoder instance is
# re-fitted per column: `age` receives the labels of `bins_age`, and `fare` is
# encoded in place. The helper column `bins_age` is removed afterwards.
model = LabelEncoder()
for target_col, source_col in (('age', 'bins_age'), ('fare', 'fare')):
    data[target_col] = model.fit_transform(data[source_col])
data.drop(columns=['bins_age'], inplace=True)

## Fit the models and test the accuracy

In [29]:
# Target vector is the `survived` column; the feature matrix is every other column.
y = data['survived']
X = data.drop(columns='survived')

In [30]:

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=33,
)

### Compare the different models

In [31]:
def _fit_and_report(estimator, label):
    """Fit `estimator` on the training split and print its test-split metrics.

    Prints the confusion matrix and the accuracy on (X_test, y_test), followed
    by a separator line.

    Parameters
    ----------
    estimator : object with fit(X, y) and predict(X)
        The classifier to train; it is fitted in place.
    label : str
        Name inserted into the printed accuracy line.

    Returns
    -------
    The predictions of the fitted estimator for X_test.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print("confusion matrix:\n", confusion_matrix(y_test, predictions))
    # Accuracy is symmetric in its arguments, but (y_true, y_pred) is the documented order.
    print("Accurecy Score for %s:%f" % (label, accuracy_score(y_test, predictions)))
    print('*'*50)
    return predictions


# Same hyper-parameters as before; each model keeps its notebook-level name.
RFmodel = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=33)
DTmodel = DecisionTreeClassifier(max_depth=5, random_state=33)
SVCmodel = SVC(kernel='rbf')
LRmodel = LogisticRegression(fit_intercept=True)
BNmodel = BernoulliNB(alpha=.01)
NeighborsClassmodel = KNeighborsClassifier(n_neighbors=4)
# n_components may not exceed min(n_features, n_classes - 1), which is 1 for
# this two-class target, so the former n_components=10 is rejected by current
# scikit-learn. The default picks the valid maximum; the setting only affects
# transform(), not predict().
LinearDiscriminantmodel = LinearDiscriminantAnalysis()

# One pass over the models replaces seven copy-pasted fit/predict/print stanzas.
for estimator, label in [
    (RFmodel, "Random Forest"),
    (DTmodel, "Decision Tree"),
    (SVCmodel, "SVC"),
    (LRmodel, "Logestic Regretion"),
    (BNmodel, "Bernoulli"),
    (NeighborsClassmodel, "KNeighbors"),
    (LinearDiscriminantmodel, "LinearDiscriminant"),
]:
    # y_pred ends up holding the predictions of the last model, as before.
    y_pred = _fit_and_report(estimator, label)

confusion matrix:
 [[150  18]
 [ 24  70]]
Accurecy Score for Random Forest:0.839695
**************************************************
confusion matrix:
 [[144  24]
 [ 25  69]]
Accurecy Score for Decision Tree:0.812977
**************************************************
confusion matrix:
 [[143  25]
 [ 24  70]]
Accurecy Score for SVC:0.812977
**************************************************
confusion matrix:
 [[139  29]
 [ 25  69]]
Accurecy Score for Logestic Regretion:0.793893
**************************************************
confusion matrix:
 [[129  39]
 [ 22  72]]
Accurecy Score for Bernoulli:0.767176
**************************************************
confusion matrix:
 [[147  21]
 [ 29  65]]
Accurecy Score for KNeighbors:0.809160
**************************************************
confusion matrix:
 [[139  29]
 [ 25  69]]
Accurecy Score for LinearDiscriminant:0.793893
**************************************************


## Select the best `n_estimators` for the random forest

In [32]:
# Sweep n_estimators for a depth-5 random forest and record the test accuracy
# of every setting, so the best one is reported from the numbers themselves
# rather than from a hand-written comment that can go stale.
rf_scores = {}
for i in range(2, 200, 5):
    print("n:", i)
    RFmodel = RandomForestClassifier(n_estimators=i, max_depth=5, random_state=33)
    RFmodel.fit(X_train, y_train)
    y_pred = RFmodel.predict(X_test)
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
    rf_scores[i] = accuracy_score(y_test, y_pred)
    print("Accurecy Score for Random Forest:%f" % rf_scores[i])
    print('*'*50)

# max() returns the first maximum, so ties resolve to the smallest n_estimators.
best_rf_n = max(rf_scores, key=rf_scores.get)
print("best n_estimators: %d (accuracy %f)" % (best_rf_n, rf_scores[best_rf_n]))
# NOTE: the scores are measured on the test split, so the selected value is
# optimistically biased; cross-validating on the training split would be fairer.

n: 2
confusion matrix:
 [[151  17]
 [ 33  61]]
Accurecy Score for Random Forest:0.809160
**************************************************
n: 7
confusion matrix:
 [[146  22]
 [ 25  69]]
Accurecy Score for Random Forest:0.820611
**************************************************
n: 12
confusion matrix:
 [[153  15]
 [ 34  60]]
Accurecy Score for Random Forest:0.812977
**************************************************
n: 17
confusion matrix:
 [[150  18]
 [ 24  70]]
Accurecy Score for Random Forest:0.839695
**************************************************
n: 22
confusion matrix:
 [[146  22]
 [ 23  71]]
Accurecy Score for Random Forest:0.828244
**************************************************
n: 27
confusion matrix:
 [[144  24]
 [ 23  71]]
Accurecy Score for Random Forest:0.820611
**************************************************
n: 32
confusion matrix:
 [[144  24]
 [ 23  71]]
Accurecy Score for Random Forest:0.820611
**************************************************
n: 37
confusion

In [33]:
 ####KNeighbors
# Sweep n_neighbors for a distance-weighted kd-tree KNN and record the test
# accuracy of every setting, so the best one is reported from the numbers
# themselves rather than from a hand-written comment that can go stale.
knn_scores = {}
for i in range(2, 10, 1):
    print("Neighbors:", i)
    NeighborsClassmodel = KNeighborsClassifier(
        n_neighbors=i, algorithm='kd_tree', weights='distance'
    )
    NeighborsClassmodel.fit(X_train, y_train)
    y_pred = NeighborsClassmodel.predict(X_test)
    print("confusion matrix:\n", confusion_matrix(y_test, y_pred))
    knn_scores[i] = accuracy_score(y_test, y_pred)
    print("Accurecy Score for KNeighbors:%f" % knn_scores[i])
    print('*'*50)

# max() returns the first maximum, so ties resolve to the smallest n_neighbors.
best_knn_k = max(knn_scores, key=knn_scores.get)
print("best n_neighbors: %d (accuracy %f)" % (best_knn_k, knn_scores[best_knn_k]))
# NOTE: the scores are measured on the test split, so the selected value is
# optimistically biased; cross-validating on the training split would be fairer.
    

Neighbors: 2
confusion matrix:
 [[155  13]
 [ 32  62]]
Accurecy Score for KNeighbors:0.828244
**************************************************
Neighbors: 3
confusion matrix:
 [[137  31]
 [ 21  73]]
Accurecy Score for KNeighbors:0.801527
**************************************************
Neighbors: 4
confusion matrix:
 [[147  21]
 [ 26  68]]
Accurecy Score for KNeighbors:0.820611
**************************************************
Neighbors: 5
confusion matrix:
 [[143  25]
 [ 24  70]]
Accurecy Score for KNeighbors:0.812977
**************************************************
Neighbors: 6
confusion matrix:
 [[149  19]
 [ 25  69]]
Accurecy Score for KNeighbors:0.832061
**************************************************
Neighbors: 7
confusion matrix:
 [[147  21]
 [ 25  69]]
Accurecy Score for KNeighbors:0.824427
**************************************************
Neighbors: 8
confusion matrix:
 [[148  20]
 [ 25  69]]
Accurecy Score for KNeighbors:0.828244
************************************