In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import matplotlib.pyplot as pyplot
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:

train_data = pd.read_csv(r'..\csv_data\train.csv')
test_data = pd.read_csv(r'..\csv_data\test.csv')

# train_data.fillna(0)

# train_data[['Cabin']].replace('NAN', '0', inplace=True)

train_data['Embarked'] = train_data['Embarked'].replace(np.nan, "0")

print("Dropping unique traits: ==>\n")
train_data = train_data.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)


encoder = preprocessing.LabelEncoder()

sex = (encoder.fit_transform(np.ravel(train_data[['Sex']])))
embarked = preprocessing.scale(encoder.fit_transform(np.ravel(train_data[['Embarked']])))
fare = preprocessing.scale(np.ravel(train_data[['Fare']]))
train_data['Sex'] = sex
train_data['Embarked'] = embarked
train_data['Fare'] = fare
train_data['Pclass'] = preprocessing.scale(np.ravel(train_data[['Pclass']]))

print(train_data.head())

Dropping unique traits: ==>

   Survived    Pclass  Sex   Age  SibSp  Parch      Fare  Embarked
0         0  0.827377    1  22.0      1      0 -0.502445  0.587966
1         1 -1.566107    0  38.0      1      0  0.786845 -1.912644
2         1  0.827377    0  26.0      0      0 -0.488854  0.587966
3         1 -1.566107    0  35.0      1      0  0.420730  0.587966
4         0  0.827377    1  35.0      0      0 -0.486337  0.587966


In [3]:
print("Correlation")
for i in train_data.columns:
    print(f"Survived vs {i}: ", pd.Series.corr(train_data['Survived'], train_data[i]))


train_data = train_data[['Survived', 'Pclass', 'Sex', 'Fare', 'Embarked']]

print("New ==> \n")

print(train_data.head())

predict = "Survived"
X = train_data.drop([predict], axis=1)
y = train_data[[predict]]

print(X.head())
print(y.head())

Correlation
Survived vs Survived:  1.0
Survived vs Pclass:  -0.3384810359610146
Survived vs Sex:  -0.543351380657755
Survived vs Age:  -0.07722109457217764
Survived vs SibSp:  -0.03532249888573559
Survived vs Parch:  0.08162940708348365
Survived vs Fare:  0.2573065223849623
Survived vs Embarked:  -0.1765092251688823
New ==> 

   Survived    Pclass  Sex      Fare  Embarked
0         0  0.827377    1 -0.502445  0.587966
1         1 -1.566107    0  0.786845 -1.912644
2         1  0.827377    0 -0.488854  0.587966
3         1 -1.566107    0  0.420730  0.587966
4         0  0.827377    1 -0.486337  0.587966
     Pclass  Sex      Fare  Embarked
0  0.827377    1 -0.502445  0.587966
1 -1.566107    0  0.786845 -1.912644
2  0.827377    0 -0.488854  0.587966
3 -1.566107    0  0.420730  0.587966
4  0.827377    1 -0.486337  0.587966
   Survived
0         0
1         1
2         1
3         1
4         0


In [4]:

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.15)

model = RandomForestClassifier(bootstrap=True)

model.fit(train_X, np.ravel(train_y))

predictions = model.predict(test_X)

# for index, prediction in enumerate(predictions):
#     print(prediction, np.ravel(train_y)[index])

print("Accuracy score:", model.score(test_X, test_y))


Accuracy score: 0.7985074626865671


In [5]:
test_data['Embarked'] = test_data['Embarked'].replace(np.nan, "0")
test_data['Pclass'] = test_data['Pclass'].replace(np.nan, "0")
test_data['Sex'] = test_data['Sex'].replace(np.nan, "0")
test_data['Fare'] = test_data['Fare'].replace(np.nan, "0")

print("Dropping unique traits: ==>\n")

test_data = test_data.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis=1)


encoder = preprocessing.LabelEncoder()

sex = (encoder.fit_transform(np.ravel(test_data[['Sex']])))
embarked = preprocessing.scale(encoder.fit_transform(np.ravel(test_data[['Embarked']])))
fare = preprocessing.scale(np.ravel(test_data[['Fare']]))
test_data['Sex'] = sex
test_data['Embarked'] = embarked
test_data['Fare'] = fare
test_data['Pclass'] = preprocessing.scale(np.ravel(test_data[['Pclass']]))
test_data = test_data[['Pclass', 'Sex', 'Fare', 'Embarked']]

print(test_data)

Dropping unique traits: ==>

       Pclass  Sex      Fare  Embarked
0    0.873482    1 -0.496637 -0.470915
1    0.873482    0 -0.511497  0.700767
2   -0.315819    1 -0.463335 -0.470915
3    0.873482    1 -0.481704  0.700767
4    0.873482    0 -0.416740  0.700767
..        ...  ...       ...       ...
413  0.873482    1 -0.492680  0.700767
414 -1.505120    0  1.314641 -1.642598
415  0.873482    1 -0.507017  0.700767
416  0.873482    1 -0.492680  0.700767
417  0.873482    1 -0.236263 -1.642598

[418 rows x 4 columns]


In [7]:
submission_prediction = model.predict(test_data)

submission = pd.DataFrame({
    "PassengerId": list(range(892, 1310)),
    "Survived": submission_prediction
})
print(len(submission))

submission.to_csv(r'..\submission1.csv', index=False)


418
