In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd /content/drive/My Drive/BigData/

import numpy as np
import pandas as pd
import sklearn.ensemble
import sklearn.preprocessing
import sklearn.decomposition
from sklearn.model_selection import train_test_split

Mounted at /content/drive
/content/drive/My Drive/BigData


In [2]:
df = pd.read_csv('data/hepatitis.csv')

In [3]:
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85.0,18.0,4.0,,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135.0,42.0,3.5,,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96.0,32.0,4.0,,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46.0,52.0,4.0,80.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200.0,4.0,,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242.0,3.3,50.0,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126.0,142.0,4.3,,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75.0,20.0,4.1,,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81.0,19.0,4.1,48.0,True,live


In [4]:
print(df.dtypes)
df = df.convert_dtypes()
print(df.dtypes)

age                  int64
sex                 object
steroid             object
antivirals            bool
fatigue             object
malaise             object
anorexia            object
liver_big           object
liver_firm          object
spleen_palpable     object
spiders             object
ascites             object
varices             object
bilirubin          float64
alk_phosphate      float64
sgot               float64
albumin            float64
protime            float64
histology             bool
class               object
dtype: object
age                  Int64
sex                 string
steroid            boolean
antivirals         boolean
fatigue            boolean
malaise            boolean
anorexia           boolean
liver_big          boolean
liver_firm         boolean
spleen_palpable    boolean
spiders            boolean
ascites            boolean
varices            boolean
bilirubin          Float64
alk_phosphate        Int64
sgot                 Int64
albumin       

In [5]:
df.isna().sum()

age                 0
sex                 0
steroid             1
antivirals          0
fatigue             1
malaise             1
anorexia            1
liver_big          10
liver_firm         11
spleen_palpable     5
spiders             5
ascites             5
varices             5
bilirubin           6
alk_phosphate      29
sgot                4
albumin            16
protime            67
histology           0
class               0
dtype: int64

In [6]:
df.isna().sum() / len(df) * 100

age                 0.000000
sex                 0.000000
steroid             0.645161
antivirals          0.000000
fatigue             0.645161
malaise             0.645161
anorexia            0.645161
liver_big           6.451613
liver_firm          7.096774
spleen_palpable     3.225806
spiders             3.225806
ascites             3.225806
varices             3.225806
bilirubin           3.870968
alk_phosphate      18.709677
sgot                2.580645
albumin            10.322581
protime            43.225806
histology           0.000000
class               0.000000
dtype: float64

# Clean the Dataset

## Case 1: Drop variables with missing values

In [7]:
# df.dropna(axis=1) # Drop all columns (axis=1) that have at least 1 NA
df.dropna(thresh=.80*len(df), axis=1) # Use a threshold

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,False,live
3,31,female,,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,,200,4.0,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,,242,3.3,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,True,live


## Case 2: Drop samples with NA

In [8]:
df.dropna(axis=0)

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
5,34,female,True,False,False,False,False,True,False,False,False,False,False,0.9,95,28,4.0,75,False,live
10,39,female,False,True,False,False,False,False,True,False,False,False,False,1.3,78,30,4.4,85,False,live
11,32,female,True,True,True,False,False,True,True,False,True,False,False,1.0,59,249,3.7,54,False,live
12,41,female,True,True,True,False,False,True,True,False,False,False,False,0.9,81,60,3.9,52,False,live
13,30,female,True,False,True,False,False,True,True,False,False,False,False,2.2,57,144,4.9,78,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,45,female,True,True,False,False,False,True,False,False,False,False,False,1.3,85,44,4.2,85,True,live
143,49,female,False,False,True,True,False,True,False,True,True,False,False,1.4,85,70,3.5,35,True,die
145,31,female,False,False,True,False,False,True,False,False,False,False,False,1.2,75,173,4.2,54,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


## Case 3 (Preferred)

In [9]:
# fillna(value)

## Case 4: Insert generated data

In [10]:
## Boolean

In [11]:
boolean_cols = df.select_dtypes(include=bool).columns
boolean_cols

Index(['steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia', 'liver_big',
       'liver_firm', 'spleen_palpable', 'spiders', 'ascites', 'varices',
       'histology'],
      dtype='object')

In [12]:
most_frequent_value = df[boolean_cols].mode().iloc[0]
most_frequent_value
df[boolean_cols] = df[boolean_cols].fillna(most_frequent_value)

In [13]:
## Integer

In [14]:
int_columns = df.select_dtypes(include=int).columns
print(int_columns)
median_of_int = df[int_columns].median()
df[int_columns] = df[int_columns].fillna(median_of_int)
df

Index(['age', 'alk_phosphate', 'sgot', 'protime'], dtype='object')


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [15]:
## Float
float_columns = df.select_dtypes(include=float).columns
print(float_columns)
median_of_float = df[float_columns].median()

df[float_columns] = df[float_columns].astype(float)
df[float_columns] = df[float_columns].interpolate(method='linear')
df

Index(['bilirubin', 'albumin'], dtype='object')


Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,live
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,live
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,live
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,live
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,live
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,die
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,live
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,live
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,live


In [16]:
## Categorical

In [17]:
categorical_columns = df.select_dtypes(include='string').columns
print(categorical_columns)
most_frequent_value = df[categorical_columns].mode()

df[categorical_columns] = df[categorical_columns].fillna(most_frequent_value)

Index(['sex', 'class'], dtype='object')


In [18]:
# False -> no Null values in df. True -> there are Null values
df.isnull().values.any()

False

# Encode the features

In [19]:
le = sklearn.preprocessing.LabelEncoder()

df['class'] = le.fit_transform(df['class'])
df

Unnamed: 0,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class
0,30,male,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,1
1,50,female,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,1
2,78,female,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,1
3,31,female,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,1
4,34,female,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,female,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,0
151,44,female,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,1
152,61,female,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,1
153,53,male,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,1


In [20]:
## Categorical

categorical_features = df.select_dtypes(include='string').columns.tolist()
print(categorical_features)

# One-hot encoding
df = pd.get_dummies(df, prefix=categorical_features)
df

['sex']


Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
0,30,False,False,False,False,False,False,False,False,False,...,False,1.0,85,18,4.0,61,False,1,0,1
1,50,False,False,True,False,False,False,False,False,False,...,False,0.9,135,42,3.5,61,False,1,1,0
2,78,True,False,True,False,False,True,False,False,False,...,False,0.7,96,32,4.0,61,False,1,1,0
3,31,True,True,False,False,False,True,False,False,False,...,False,0.7,46,52,4.0,80,False,1,1,0
4,34,True,False,False,False,False,True,False,False,False,...,False,1.0,85,200,4.0,61,False,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,True,False,True,True,True,True,False,False,True,...,True,7.6,85,242,3.3,50,True,0,1,0
151,44,True,False,True,False,False,True,True,False,False,...,False,0.9,126,142,4.3,61,True,1,1,0
152,61,False,False,True,True,False,False,True,False,True,...,False,0.8,75,20,4.1,61,True,1,1,0
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,0,1


In [21]:
## Find outliers

isoforest = sklearn.ensemble.IsolationForest(n_estimators=1000, contamination=0.01, random_state=0)
res = isoforest.fit_predict(df.to_numpy())
res
# Elements with -1 are outliers

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1])

In [22]:
## Find outliers
df[res==-1]

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
37,20,False,False,True,True,True,False,True,True,True,...,False,2.3,150,68,3.9,61,False,1,0,1
125,34,True,False,True,True,True,False,True,False,True,...,False,0.7,70,24,4.1,100,True,1,0,1


In [23]:
## PCA
pca = sklearn.decomposition.PCA(n_components=0.9999)

x_pca = pca.fit_transform(df)

x_ori = pca.inverse_transform(x_pca)

anomaly_score = np.abs(df.to_numpy() - x_ori).sum(1)

anomaly_score.shape

# Get last quantile
threshold = np.quantile(anomaly_score, 0.99)

# Find outliers according to PCA
anomalous_ids = np.argwhere(anomaly_score > threshold).squeeze()
df.iloc[anomalous_ids]

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,...,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,class,sex_female,sex_male
6,51,False,False,True,False,True,True,False,True,True,...,False,0.95,85,58,4.133333,61,False,0,1,0
153,53,False,False,True,False,False,True,False,True,True,...,True,1.5,81,19,4.1,48,True,1,0,1


# Data normalization -> it's missing the first few lines!

In [24]:
# x = df[list(set(df.columns)-set(['class']))]
x = df.drop('class', axis = 1)
y = df['class']
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit_transform(x)
x

Unnamed: 0,age,steroid,antivirals,fatigue,malaise,anorexia,liver_big,liver_firm,spleen_palpable,spiders,ascites,varices,bilirubin,alk_phosphate,sgot,albumin,protime,histology,sex_female,sex_male
0,30,False,False,False,False,False,False,False,False,False,False,False,1.0,85,18,4.0,61,False,0,1
1,50,False,False,True,False,False,False,False,False,False,False,False,0.9,135,42,3.5,61,False,1,0
2,78,True,False,True,False,False,True,False,False,False,False,False,0.7,96,32,4.0,61,False,1,0
3,31,True,True,False,False,False,True,False,False,False,False,False,0.7,46,52,4.0,80,False,1,0
4,34,True,False,False,False,False,True,False,False,False,False,False,1.0,85,200,4.0,61,False,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,46,True,False,True,True,True,True,False,False,True,True,True,7.6,85,242,3.3,50,True,1,0
151,44,True,False,True,False,False,True,True,False,False,False,False,0.9,126,142,4.3,61,True,1,0
152,61,False,False,True,True,False,False,True,False,True,False,False,0.8,75,20,4.1,61,True,1,0
153,53,False,False,True,False,False,True,False,True,True,False,True,1.5,81,19,4.1,48,True,0,1


In [25]:
# Split data in training and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y.shape)

(124, 20)
(31, 20)
(155,)


In [26]:
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

In [27]:
y_estim = rf.predict(x_test)
print(y_estim)

[1 1 1 0 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1]


In [28]:
# Evaluate
confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_estim)
confusion_matrix

In [30]:
acc = sklearn.metrics.accuracy_score(y_test, y_estim)
acc

# Titanic dataset

Drop 'Name', 'Ticket' and 'Cabin'

embarked:	Port of Embarkation
* C = Cherbourg
* Q = Queenstown
* S = Southampton

Encoding, normalizzazione e classificazione

In [100]:
df = pd.read_csv('data/titanic.csv').drop(['Name', 'Ticket', 'Cabin'], axis = 1)
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [101]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Embarked         2
dtype: int64

In [102]:
df.isna().sum() / len(df) * 100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Fare            0.000000
Embarked        0.224467
dtype: float64

In [103]:
df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Sex             object
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Embarked        object
dtype: object

In [104]:
df = df.convert_dtypes()
df.dtypes

PassengerId      Int64
Survived         Int64
Pclass           Int64
Sex             string
Age            Float64
SibSp            Int64
Parch            Int64
Fare           Float64
Embarked        string
dtype: object

In [105]:
df['Embarked'].unique()

<StringArray>
['S', 'C', 'Q', <NA>]
Length: 4, dtype: string

In [106]:
median_age = df['Age'].median(skipna=True)

df['Age'] = df['Age'].astype(float)
# df['Age'] = df['Age'].interpolate(method='linear')
df['Age'] = df['Age'].fillna(median_age)

In [107]:
most_freq_port = df['Embarked'].mode().iloc[0]
most_freq_port
df['Embarked'] = df['Embarked'].fillna(most_freq_port)

In [108]:
df['Embarked'].unique()

<StringArray>
['S', 'C', 'Q']
Length: 3, dtype: string

In [109]:
## Categorical

categorical_features = df.select_dtypes(include='string').columns.tolist()
print(categorical_features)

# One-hot encoding
df = pd.get_dummies(df, prefix=categorical_features)
df

['Sex', 'Embarked']


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.0,0,0,13.0,0,1,0,0,1
887,888,1,1,19.0,0,0,30.0,1,0,0,0,1
888,889,0,3,28.0,1,2,23.45,1,0,0,0,1
889,890,1,1,26.0,0,0,30.0,0,1,1,0,0


In [110]:
# Check that all NaN are converted
df.isnull().values.any()

False

## Dealing with outliers

In [111]:
isoforest = sklearn.ensemble.IsolationForest(n_estimators=1000, contamination=0.01, random_state=0)
res = isoforest.fit_predict(df.to_numpy())
df[res==-1]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
16,17,0,3,2.0,4,1,29.125,0,1,0,1,0
27,28,0,1,19.0,3,2,263.0,0,1,0,0,1
88,89,1,1,23.0,3,2,263.0,1,0,0,0,1
311,312,1,1,18.0,2,2,262.375,1,0,1,0,0
438,439,0,1,64.0,1,4,263.0,0,1,0,0,1
679,680,1,1,36.0,0,1,512.3292,0,1,1,0,0
742,743,1,1,21.0,2,2,262.375,1,0,1,0,0
787,788,0,3,8.0,4,1,29.125,0,1,0,1,0
885,886,0,3,39.0,0,5,29.125,1,0,0,1,0


In [112]:
## PCA
pca = sklearn.decomposition.PCA(n_components=0.9999)

x_pca = pca.fit_transform(df)

x_ori = pca.inverse_transform(x_pca)

anomaly_score = np.abs(df.to_numpy() - x_ori).sum(1)

anomaly_score.shape

# Get last quantile
threshold = np.quantile(anomaly_score, 0.99)

# Find outliers according to PCA
anomalous_ids = np.argwhere(anomaly_score > threshold).squeeze()
df.iloc[anomalous_ids]

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
159,160,0,3,28.0,8,2,69.55,0,1,0,0,1
180,181,0,3,28.0,8,2,69.55,1,0,0,0,1
201,202,0,3,28.0,8,2,69.55,0,1,0,0,1
324,325,0,3,28.0,8,2,69.55,0,1,0,0,1
679,680,1,1,36.0,0,1,512.3292,0,1,1,0,0
737,738,1,1,35.0,0,0,512.3292,0,1,1,0,0
792,793,0,3,28.0,8,2,69.55,1,0,0,0,1
846,847,0,3,28.0,8,2,69.55,0,1,0,0,1
863,864,0,3,28.0,8,2,69.55,1,0,0,0,1


In [113]:
# Remove outliers during training
# df = df.drop(anomalous_ids, axis=0)
x = df.drop('Survived', axis = 1)
y = df['Survived']
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit_transform(x)
x

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,35.0,1,0,53.1,1,0,0,0,1
4,5,3,35.0,0,0,8.05,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,27.0,0,0,13.0,0,1,0,0,1
887,888,1,19.0,0,0,30.0,1,0,0,0,1
888,889,3,28.0,1,2,23.45,1,0,0,0,1
889,890,1,26.0,0,0,30.0,0,1,1,0,0


In [114]:
# Split data in training and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
print(x_train.shape)
print(x_test.shape)
print(y.shape)

(712, 11)
(179, 11)
(891,)


In [115]:
rf = sklearn.ensemble.RandomForestClassifier(n_estimators=100)
rf.fit(x_train, y_train)

In [116]:
y_estim = rf.predict(x_test)
print(y_estim)

[0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0.
 1. 1. 0. 1. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1.
 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 1. 0. 1.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1.
 1. 1. 0. 1. 1. 0. 0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0.]


In [117]:
# Evaluate
confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_estim)
confusion_matrix

array([[102,   8],
       [ 19,  50]])

In [118]:
acc = sklearn.metrics.accuracy_score(y_test, y_estim)
acc

0.8491620111731844