In [1]:
#importing necesssary libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import accuracy_score,confusion_matrix, roc_curve,roc_auc_score


import matplotlib.pyplot as plt
import seaborn as sns



import warnings
warnings.filterwarnings('ignore')


In [2]:
#reading csv file
data=pd.read_csv('https://raw.githubusercontent.com/dsrscientist/dataset1/master/titanic_train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
#to know the number of rows and columns

data.shape

(891, 12)

In [4]:
#checking the null values in the data set

data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
#dropping the columns as they do not add anything to the target value(ie, survived or not)

data=data.drop(columns=['PassengerId','Name','Ticket','Fare','Cabin'],axis=1)

In [6]:
#to re-check if columns are removed
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S
4,0,3,male,35.0,0,0,S


In [7]:
#checking the dteails of the data set

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    889 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 48.8+ KB


In [8]:
#pre-processing 
#importing simple imputer

from sklearn.impute import SimpleImputer

In [9]:
#intializing the simple imputer

imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')

In [10]:
#fill nan values of age with simple imputer 

data.Age=imputer.fit_transform(data['Age'].values.reshape(-1,1))

In [11]:
#checking the null values are filled for age column
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Sex         891 non-null object
Age         891 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Embarked    889 non-null object
dtypes: float64(1), int64(4), object(2)
memory usage: 48.8+ KB


In [12]:
#embarked is a category data so using get dummies to encode


ports=pd.get_dummies(data.Embarked,prefix='Embarked')
ports.head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [13]:
#joing the data set created using get dummies with original
data1=data.join(ports)


In [14]:

data1.shape


(891, 10)

In [15]:
#reassigning to the original variable

data=data1

In [16]:

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
Survived      891 non-null int64
Pclass        891 non-null int64
Sex           891 non-null object
Age           891 non-null float64
SibSp         891 non-null int64
Parch         891 non-null int64
Embarked      889 non-null object
Embarked_C    891 non-null uint8
Embarked_Q    891 non-null uint8
Embarked_S    891 non-null uint8
dtypes: float64(1), int64(4), object(2), uint8(3)
memory usage: 51.4+ KB


In [17]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked,Embarked_C,Embarked_Q,Embarked_S
0,0,3,male,22.0,1,0,S,0,0,1
1,1,1,female,38.0,1,0,C,1,0,0
2,1,3,female,26.0,0,0,S,0,0,1
3,1,1,female,35.0,1,0,S,0,0,1
4,0,3,male,35.0,0,0,S,0,0,1


In [18]:
#dropping the original embarked column
data.drop(['Embarked'],axis=1,inplace=True)

In [19]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S
0,0,3,male,22.0,1,0,0,0,1
1,1,1,female,38.0,1,0,1,0,0
2,1,3,female,26.0,0,0,0,0,1
3,1,1,female,35.0,1,0,0,0,1
4,0,3,male,35.0,0,0,0,0,1


In [20]:
#cecking if any null values are left

data.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age           0
SibSp         0
Parch         0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [21]:
data.shape

(891, 9)

In [22]:
#replacing categorical data with numeriacal data

data['Sex']=data['Sex'].replace({'male':1,'female':0})

In [23]:
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,22.0,1,0,0,0,1
1,1,1,0,38.0,1,0,1,0,0
2,1,3,0,26.0,0,0,0,0,1
3,1,1,0,35.0,1,0,0,0,1
4,0,3,1,35.0,0,0,0,0,1


In [24]:
#assigning features to X
#ASSIGNING LABEL TO y

X=data.drop(columns=['Survived'])
y=data['Survived']

In [25]:
#training data for model

x_train,x_test,y_train,y_test=train_test_split(X,y, test_size=0.25, random_state= 355)

In [26]:
 from sklearn.metrics import classification_report

In [27]:
#function to train and test data and check accuracy score

def metric_score (clf,x_train,x_test,y_train,y_test,train=True):
    if train:
        y_pred=clf.predict(x_train)
        print('\n ++++++++++++++ train result is++++++++++++ ')
        print(f"Accuracy Score: {accuracy_score(y_train,y_pred) *100:.2f}%")
        
        
    elif train ==False:
        pred=clf.predict(x_test)
        print('\n ++++++++++++++ test result is++++++++++++ ')
        print(f"Accuracy Score: {accuracy_score(y_test, pred) *100:.2f}%")
        
        print ('\n \n Test classification report \n',classification_report(y_test,pred, digits=2))

In [28]:
#intializing logistic regression

log_reg=LogisticRegression()
log_reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [29]:
#traing the logistic regression model
log_reg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [30]:
#checking accuracy score with train data
metric_score(log_reg,x_train,x_test,y_train,y_test,train=True)


 ++++++++++++++ train result is++++++++++++ 
Accuracy Score: 79.94%


In [31]:
#cecking accuracy scre with test data
#accuracy=78%
#precision=83%
#reacll=83%
#f1 score=83%


metric_score(log_reg,x_train,x_test,y_train,y_test,train=False)


 ++++++++++++++ test result is++++++++++++ 
Accuracy Score: 78.03%

 
 Test classification report 
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       145
           1       0.68      0.69      0.69        78

   micro avg       0.78      0.78      0.78       223
   macro avg       0.76      0.76      0.76       223
weighted avg       0.78      0.78      0.78       223



In [32]:
#importing a new model

from sklearn.neighbors import KNeighborsClassifier

In [33]:
#intiaizing the model

knn=KNeighborsClassifier()
knn

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [34]:
#training the data
knn.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [35]:
#checking train and test score with knn model

metric_score(log_reg,x_train,x_test,y_train,y_test,train=True)

metric_score(log_reg,x_train,x_test,y_train,y_test,train=False)


 ++++++++++++++ train result is++++++++++++ 
Accuracy Score: 79.94%

 ++++++++++++++ test result is++++++++++++ 
Accuracy Score: 78.03%

 
 Test classification report 
               precision    recall  f1-score   support

           0       0.83      0.83      0.83       145
           1       0.68      0.69      0.69        78

   micro avg       0.78      0.78      0.78       223
   macro avg       0.76      0.76      0.76       223
weighted avg       0.78      0.78      0.78       223



In [None]:
#any of the model (logistic regression or KNN can be taken)