In [26]:
import pandas as pd
from category_encoders import BinaryEncoder, OrdinalEncoder
import sklearn

In [2]:
df = pd.read_csv(r'data\titanic.csv')

In [3]:
df = df.set_index('PassengerId')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [5]:
df = df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

In [6]:
# use ordinal encoding to encode two or three categorical features
enc = OrdinalEncoder(cols=['Sex', 'Embarked']).fit(df)

# transform the dataset
df = enc.transform(df)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int32  
dtypes: float64(2), int32(2), int64(4)
memory usage: 55.7 KB


In [7]:
df=df.fillna(df.mean())

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 1 to 891
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Parch     891 non-null    int64  
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    int32  
dtypes: float64(2), int32(2), int64(4)
memory usage: 55.7 KB


In [29]:
df.groupby('Survived').size()

Survived
0    549
1    342
dtype: int64

In [30]:
pivot = pd.pivot_table(df, 
                      index = 'Survived', columns = 'Sex', values = 'Age', aggfunc = 'count', margins = True)
pivot

Sex,1,2,All
Survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,468,81,549
1,109,233,342
All,577,314,891


In [9]:
y = df['Survived'].values

In [10]:
features = df.loc[:, df.columns != 'Survived']

In [11]:
X = features.values

In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
model = LogisticRegression()

In [20]:
model.fit(X_train, y_train)

In [21]:
model.score(X_test, y_test)

0.7937219730941704

In [22]:
model.score(X_train, y_train)

0.8098802395209581

In [23]:
coeff = pd.DataFrame(model.coef_, columns = features.columns)
coeff = coeff.T
coeff.columns =['coefficient']
coeff.sort_values(by = 'coefficient')

Unnamed: 0,coefficient
Pclass,-1.023526
SibSp,-0.339539
Parch,-0.122155
Age,-0.032441
Fare,0.002551
Embarked,0.285887
Sex,2.679961


In [24]:
y_pred = model.predict(X)
probs = model.predict_proba(X)
probs 

array([[0.91285718, 0.08714282],
       [0.09045035, 0.90954965],
       [0.36762283, 0.63237717],
       ...,
       [0.53041826, 0.46958174],
       [0.43740185, 0.56259815],
       [0.85331604, 0.14668396]])

In [27]:
sklearn.metrics.accuracy_score(y, y_pred)

0.8058361391694725

In [28]:
conf_matrix = pd.DataFrame(
    sklearn.metrics.confusion_matrix(y, y_pred), 
    columns=['Forecast', 'Forecast'],
    index=['Actual', 'Actual'])
conf_matrix

Unnamed: 0,Forecast,Forecast.1
Actual,481,68
Actual,105,237


In [31]:
# correct % of all = accuracy_score
(conf_matrix.iloc[0, 0] + conf_matrix.iloc[1, 1] ) / conf_matrix.values.sum()

0.8058361391694725

In [32]:
# % of all
conf_matrix.values / conf_matrix.values.sum()

array([[0.53984287, 0.07631874],
       [0.11784512, 0.26599327]])

In [33]:
# actual surv
conf_matrix.sum(axis=1)

Actual    549
Actual    342
dtype: int64

In [34]:
# predicted surv
conf_matrix.sum(axis=0)

Forecast    586
Forecast    305
dtype: int64

In [35]:
model.score(X_test, y_test)

0.7937219730941704

In [36]:
model.score(X_train, y_train)

0.8098802395209581