In [99]:
import  pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import roc_auc_score

from sklearn.model_selection import train_test_split

## Loading the dataset

In [100]:
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [101]:
data.shape

(891, 12)

In [102]:
data['Name'].nunique()

891

In [103]:
data['Sex'].nunique()

2

In [104]:
data['Cabin'].nunique()

147

In [105]:
df = data[['Sex','Cabin','Survived']]

In [106]:
df

Unnamed: 0,Sex,Cabin,Survived
0,male,,0
1,female,C85,1
2,female,,1
3,female,C123,1
4,male,,0
...,...,...,...
886,male,,0
887,female,B42,1
888,female,,0
889,male,C148,1


### Recognising the different unique values in the columns

In [107]:
df['Cabin'].str[0].nunique()

8

In [108]:
#recognising the different unique values in the columns "Cabin"  with str[0] : means with the help of first index of the string it contains

df['Cabin'].str[0].unique()

array([nan, 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

### Filling the nan values

In [109]:
#filling the nan values with 'n' using the fillna()

df['Cabin'].str[0].fillna('n').unique()

array(['n', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [110]:
df['Cabin_reduced'] = df['Cabin'].str[0].fillna('n')
df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Sex,Cabin,Survived,Cabin_reduced
0,male,,0,n
1,female,C85,1,C
2,female,,1,n
3,female,C123,1,C
4,male,,0,n
5,male,,0,n
6,male,E46,0,E
7,male,,0,n
8,female,,1,n
9,female,,1,n


In [111]:
#Checking the cardinality:

df['Sex'].nunique(),df['Cabin'].nunique(),df['Cabin_reduced'].nunique()

(2, 147, 9)

## Train Test Split 

In [112]:
use_cols = ['Cabin','Cabin_reduced','Sex']
x_train,x_test,y_train,y_test = train_test_split(df[use_cols],df['Survived'],test_size = 0.4, random_state = 0)

In [113]:
x_train.shape,x_test.shape

((534, 3), (357, 3))

In [114]:
train_cabin_unique = x_train['Cabin'].unique()
test_cabin_unique = x_test['Cabin'].unique()
train_cabin_unique,test_cabin_unique

(array([nan, 'E67', 'C126', 'B73', 'E36', 'C78', 'E46', 'C111', 'E101',
        'D15', 'E12', 'G6', 'A32', 'B4', 'A10', 'A5', 'C95', 'E25', 'C90',
        'D6', 'A36', 'D', 'D26', 'D50', 'B96 B98', 'C93', 'E77', 'C101',
        'D11', 'C123', 'C32', 'B35', 'C91', 'T', 'B101', 'E58', 'A23',
        'B77', 'D28', 'B82 B84', 'B79', 'E44', 'C45', 'C2', 'B5', 'C104',
        'B20', 'A19', 'B51 B53 B55', 'B80', 'B38', 'B22', 'B18', 'C22 C26',
        'E8', 'A16', 'F2', 'D47', 'E121', 'D33', 'C23 C25 C27', 'B28',
        'E10', 'D36', 'C46', 'B39', 'D30', 'E33', 'C50', 'D20', 'C124',
        'A34', 'C110', 'D19', 'B86', 'D35', 'C99', 'D46', 'F38', 'A24',
        'D7', 'C65', 'C103', 'A31', 'C82', 'F G63', 'A6', 'E50', 'C68',
        'C52', 'B3', 'B49', 'C118', 'C70', 'F E69', 'F4', 'C86', 'D48',
        'C49', 'C92', 'A7', 'A20', 'E24', 'E49'], dtype=object),
 array([nan, 'B78', 'C106', 'C125', 'C7', 'B49', 'C54', 'E34', 'C52',
        'B50', 'B41', 'C83', 'B57 B59 B63 B66', 'E67', 'D45', 'D1

### Cabin

In [115]:
len(train_cabin_unique) - len(test_cabin_unique)

35

In [116]:
 A = [a for a in train_cabin_unique if a not in test_cabin_unique]

In [117]:
len(A), type(A)

(80, list)

In [118]:
B = [a for a in test_cabin_unique if a not in train_cabin_unique]

In [119]:
len(B),type(B)

(45, list)

### Cabin Reduced

In [120]:
A_r = [a for a in x_train['Cabin_reduced'].unique() if a not in x_test['Cabin_reduced'].unique()]
len(A_r)

1

In [121]:
B_r = [a for a in x_test['Cabin_reduced'].unique() if a not in x_train['Cabin_reduced'].unique()]
len(B_r)

0

## Categorical encoding

In [122]:
df

Unnamed: 0,Sex,Cabin,Survived,Cabin_reduced
0,male,,0,n
1,female,C85,1,C
2,female,,1,n
3,female,C123,1,C
4,male,,0,n
...,...,...,...,...
886,male,,0,n
887,female,B42,1,B
888,female,,0,n
889,male,C148,1,C


In [123]:
x_train.isnull().sum(),x_test.isnull().sum()

(Cabin            407
 Cabin_reduced      0
 Sex                0
 dtype: int64,
 Cabin            280
 Cabin_reduced      0
 Sex                0
 dtype: int64)

### Filling the null values

In [124]:
x_train.fillna('0',inplace = True)
x_test.fillna('0',inplace = True)

In [125]:
x_train.isnull().sum(),x_test.isnull().sum()

(Cabin            0
 Cabin_reduced    0
 Sex              0
 dtype: int64,
 Cabin            0
 Cabin_reduced    0
 Sex              0
 dtype: int64)

In [126]:
x_train,x_test

(    Cabin Cabin_reduced     Sex
 100     0             n  female
 722     0             n    male
 678     0             n  female
 229     0             n  female
 334     0             n  female
 ..    ...           ...     ...
 835   E49             E  female
 192     0             n  female
 629     0             n    male
 559     0             n  female
 684     0             n    male
 
 [534 rows x 3 columns],
     Cabin Cabin_reduced     Sex
 495     0             n    male
 648     0             n    male
 278     0             n    male
 31    B78             B  female
 255     0             n  female
 ..    ...           ...     ...
 363     0             n    male
 406     0             n    male
 866     0             n  female
 881     0             n    male
 618    F4             F  female
 
 [357 rows x 3 columns])

['Cabin','Sex']

['Cabin_reduced','Sex']

In [127]:
#Now we encode categorically

train = pd.get_dummies(x_train[['Cabin','Sex']],columns = ['Cabin','Sex'])

In [128]:
test = pd.get_dummies(x_test[['Cabin','Sex']],columns = ['Cabin','Sex'])

In [129]:
train.shape,test.shape

((534, 106), (357, 71))

### Balance the cols of the train and test

In [130]:
missing_col = set(train.columns) - set(test.columns)

In [131]:
missing_col

{'Cabin_A10',
 'Cabin_A16',
 'Cabin_A19',
 'Cabin_A20',
 'Cabin_A23',
 'Cabin_A24',
 'Cabin_A31',
 'Cabin_A32',
 'Cabin_A34',
 'Cabin_A36',
 'Cabin_A5',
 'Cabin_A6',
 'Cabin_A7',
 'Cabin_B101',
 'Cabin_B20',
 'Cabin_B22',
 'Cabin_B28',
 'Cabin_B3',
 'Cabin_B35',
 'Cabin_B38',
 'Cabin_B39',
 'Cabin_B4',
 'Cabin_B5',
 'Cabin_B51 B53 B55',
 'Cabin_B73',
 'Cabin_B77',
 'Cabin_B79',
 'Cabin_B80',
 'Cabin_B82 B84',
 'Cabin_B86',
 'Cabin_C101',
 'Cabin_C103',
 'Cabin_C104',
 'Cabin_C110',
 'Cabin_C111',
 'Cabin_C118',
 'Cabin_C123',
 'Cabin_C2',
 'Cabin_C22 C26',
 'Cabin_C32',
 'Cabin_C45',
 'Cabin_C46',
 'Cabin_C49',
 'Cabin_C50',
 'Cabin_C70',
 'Cabin_C82',
 'Cabin_C86',
 'Cabin_C90',
 'Cabin_C91',
 'Cabin_C93',
 'Cabin_C95',
 'Cabin_C99',
 'Cabin_D',
 'Cabin_D11',
 'Cabin_D15',
 'Cabin_D19',
 'Cabin_D28',
 'Cabin_D30',
 'Cabin_D35',
 'Cabin_D46',
 'Cabin_D47',
 'Cabin_D48',
 'Cabin_D50',
 'Cabin_D6',
 'Cabin_D7',
 'Cabin_E10',
 'Cabin_E101',
 'Cabin_E12',
 'Cabin_E25',
 'Cabin_E36',
 'Cabi

In [132]:
for c in missing_col:
    test[c] = 0

In [133]:
train.columns

Index(['Cabin_0', 'Cabin_A10', 'Cabin_A16', 'Cabin_A19', 'Cabin_A20',
       'Cabin_A23', 'Cabin_A24', 'Cabin_A31', 'Cabin_A32', 'Cabin_A34',
       ...
       'Cabin_E8', 'Cabin_F E69', 'Cabin_F G63', 'Cabin_F2', 'Cabin_F38',
       'Cabin_F4', 'Cabin_G6', 'Cabin_T', 'Sex_female', 'Sex_male'],
      dtype='object', length=106)

In [134]:
test = test[train.columns]

In [135]:
train.shape,test.shape

((534, 106), (357, 106))

## Using The RandomForestClassifier

In [136]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [137]:
y_train_pred = model.predict_proba(train)
y_test_pred = model.predict_proba(test)

In [138]:
y_train_pred,y_test_pred

(array([[0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773],
        [0.3296787 , 0.6703213 ],
        ...,
        [0.86457227, 0.13542773],
        [0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773]]),
 array([[0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.01      , 0.99      ],
        [0.3296787 , 0.6703213 ],
        [0.84625439, 0.15374561],
        [0.01      , 0.99      ],
        [0.01      , 0.99      ],
        [0.82125439, 0.17874561],
        [0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773],
        [0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773],
        [0.3296787 , 0.6703213 ],
        [0.01      , 0.99      ],
        [0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.86457227, 0.13542773],
        [0.3296787 , 0.6703213 ],
        [0.86457227, 0.13542773],