<a href="https://colab.research.google.com/github/benvictoria21/pandas/blob/master/break_the_ice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install pywaffle



In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
import plotly.express as px
import plotly.graph_objects as go
import sklearn.metrics as metrics

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_curve,auc, confusion_matrix
from xgboost import XGBClassifier
from pywaffle import Waffle
from yellowbrick.classifier import classification_report

In [9]:
custom_colors = ["#c8e7ff","#deaaff", "#f72585","#d100d1"]
customPalette = sns.set_palette(sns.color_palette(custom_colors))

In [10]:
sns.palplot(sns.color_palette(custom_colors),size=1)
plt.tick_params(axis='both', labelsize=0, length = 0)

In [11]:
sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 2.5})

In [12]:
train_data = pd.read_csv("https://raw.githubusercontent.com/benvictoria21/pandas/master/dataset/Titanic%20-%20Machine%20Learning%20from%20Disaster/train.csv")
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
train_data.shape

(891, 12)

In [14]:
test_data = pd.read_csv("https://raw.githubusercontent.com/benvictoria21/pandas/master/dataset/Titanic%20-%20Machine%20Learning%20from%20Disaster/test.csv")
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [15]:
test_data.shape

(418, 11)

In [16]:
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [17]:
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [18]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [19]:
train_data.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [21]:
train_data['Cabin'].unique()

array([nan, 'C85', 'C123', 'E46', 'G6', 'C103', 'D56', 'A6',
       'C23 C25 C27', 'B78', 'D33', 'B30', 'C52', 'B28', 'C83', 'F33',
       'F G73', 'E31', 'A5', 'D10 D12', 'D26', 'C110', 'B58 B60', 'E101',
       'F E69', 'D47', 'B86', 'F2', 'C2', 'E33', 'B19', 'A7', 'C49', 'F4',
       'A32', 'B4', 'B80', 'A31', 'D36', 'D15', 'C93', 'C78', 'D35',
       'C87', 'B77', 'E67', 'B94', 'C125', 'C99', 'C118', 'D7', 'A19',
       'B49', 'D', 'C22 C26', 'C106', 'C65', 'E36', 'C54',
       'B57 B59 B63 B66', 'C7', 'E34', 'C32', 'B18', 'C124', 'C91', 'E40',
       'T', 'C128', 'D37', 'B35', 'E50', 'C82', 'B96 B98', 'E10', 'E44',
       'A34', 'C104', 'C111', 'C92', 'E38', 'D21', 'E12', 'E63', 'A14',
       'B37', 'C30', 'D20', 'B79', 'E25', 'D46', 'B73', 'C95', 'B38',
       'B39', 'B22', 'C86', 'C70', 'A16', 'C101', 'C68', 'A10', 'E68',
       'B41', 'A20', 'D19', 'D50', 'D9', 'A23', 'B50', 'A26', 'D48',
       'E58', 'C126', 'B71', 'B51 B53 B55', 'D49', 'B5', 'B20', 'F G63',
       'C62 C64',

In [22]:
train_data['Cabin'] = train_data['Cabin'].apply(lambda i: i[0] if pd.notnull(i) else 'Z')
test_data['Cabin'] = test_data['Cabin'].apply(lambda i: i[0] if pd.notnull(i) else 'Z')

In [23]:
train_data['Cabin'].unique()

array(['Z', 'C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)

In [24]:
train_data[train_data['Cabin']=='T'].index.values

array([339])

In [25]:
test_data[test_data['Cabin']=='T'].index.values

array([], dtype=int64)

In [26]:
train_data.iloc[339]

PassengerId                             340
Survived                                  0
Pclass                                    1
Name           Blackwell, Mr. Stephen Weart
Sex                                    male
Age                                      45
SibSp                                     0
Parch                                     0
Ticket                               113784
Fare                                   35.5
Cabin                                     T
Embarked                                  S
Name: 339, dtype: object

In [27]:
index = train_data[train_data['Cabin'] == 'T'].index
train_data.loc[index, 'Cabin'] = 'A'

In [28]:
def plot_bar(df, feat_x, feat_y,s, normalize=True):
    ct = pd.crosstab(df[feat_x], df[feat_y])
    return ct.plot(kind='bar', stacked=s)

In [29]:
dpi=80
plot_bar(train_data, 'Cabin', 'Pclass',False)
plt.legend(title='Pclass',loc='upper right',bbox_to_anchor=(1.25, 1))
plt.gcf().set_size_inches(10,8)
plt.ylim(0,100)
plt.xticks(rotation=45)
plt.show()

In [30]:
train_data['Cabin'] = train_data['Cabin'].replace(['A', 'B', 'C'], 'ABC')
train_data['Cabin'] = train_data['Cabin'].replace(['D', 'E'], 'DE')
train_data['Cabin'] = train_data['Cabin'].replace(['F', 'G'], 'FG')

test_data['Cabin'] = test_data['Cabin'].replace(['A', 'B', 'C'], 'ABC')
test_data['Cabin'] = test_data['Cabin'].replace(['D', 'E'], 'DE')
test_data['Cabin'] = test_data['Cabin'].replace(['F', 'G'], 'FG')

In [31]:
train_data.drop(["Ticket", "Name", "PassengerId"], axis=1, inplace=True)
test_data.drop(["Ticket", "Name", "PassengerId"], axis=1, inplace=True)

train_data["Age"].fillna(train_data["Age"].median(skipna=True), inplace=True)
test_data["Age"].fillna(test_data["Age"].median(skipna=True), inplace=True)


test_data["Fare"].fillna(test_data["Fare"].median(skipna=True), inplace=True)

train_data["Embarked"].fillna('S', inplace=True)
test_data["Embarked"].fillna('S', inplace=True)

In [32]:
train_data["Cabin"].unique()

array(['Z', 'ABC', 'DE', 'FG'], dtype=object)

In [33]:
gender = {'male': 0, 'female': 1}
train_data.Sex = [gender[item] for item in train_data.Sex] 
test_data.Sex = [gender[item] for item in test_data.Sex] 

embarked = {'S': 0, 'C': 1, 'Q':2}
train_data.Embarked = [embarked[item] for item in train_data.Embarked] 
test_data.Embarked = [embarked[item] for item in test_data.Embarked] 


train_data['Cabin'] = LabelEncoder().fit_transform(train_data['Cabin'])
test_data['Cabin'] = LabelEncoder().fit_transform(test_data['Cabin'])

In [34]:
train_data.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin         int64
Embarked      int64
dtype: object

In [35]:
gender = train_data['Sex'].value_counts()

fig = plt.figure(
    FigureClass=Waffle, 
    rows=5,
    columns=10,
    values=gender,
    colors = (custom_colors[0], custom_colors[1]),
    title={'label': 'Gender Distribution', 'loc': 'center'},
    labels=["{}({})".format(a, b) for a, b in zip(gender.index, gender) ],
    legend={'loc': 'upper left', 'bbox_to_anchor': (1,1)},
    font_size=35, 
    icons = ['male','female'],
    icon_legend=True,
    figsize=(10, 8)
)

In [36]:
def triple_plot(x, title,c):
    fig, ax = plt.subplots(3,1,figsize=(15,8),sharex=True)
    sns.distplot(x, ax=ax[0],color=c)
    ax[0].set(xlabel=None)
    ax[0].set_title('Histogram + KDE')
    sns.boxplot(x, ax=ax[1],color=c)
    ax[1].set(xlabel=None)
    ax[1].set_title('Boxplot')
    sns.violinplot(x, ax=ax[2],color=c)
    ax[2].set(xlabel=None)
    ax[2].set_title('Violin plot')
    fig.suptitle(title, fontsize=16)
    plt.tight_layout(pad=3.0)
    plt.show()

In [37]:
def hist(x,title):
    plt.figure(figsize = (10,8))
    ax = sns.distplot(x, 
                 kde=False);
    values = np.array([rec.get_height() for rec in ax.patches])
    norm = plt.Normalize(values.min(), values.max())
    colors = plt.cm.jet(norm(values))
    for rec, col in zip(ax.patches, colors):
        rec.set_color(col)
    plt.title(title)

In [38]:
hist(train_data['Age'],'Distribution of Age')


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).



In [39]:
triple_plot(train_data['Age'],'Distribution of Age',custom_colors[2])


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.



In [40]:
hist(train_data['Fare'],'Distribution of Fare')


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).



In [41]:
triple_plot(train_data['Fare'],'Distribution of Fare',custom_colors[1])


`distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.



In [42]:
td = pd.read_csv("https://raw.githubusercontent.com/benvictoria21/pandas/master/dataset/Titanic%20-%20Machine%20Learning%20from%20Disaster/train.csv")
td["Cabin"]=td.Cabin.str[0]

In [43]:
sns.catplot("Survived", col="Cabin", col_wrap=8,data=td[td.Cabin.notnull()],kind="count",height=4,aspect=.6)
plt.show()


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.



In [44]:
sns.catplot("Survived", col="Cabin", col_wrap=4,data=train_data,kind="count", height=4,aspect=.6)
plt.show()


Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.



In [45]:
plot_bar(train_data, 'SibSp', 'Survived',False)
plt.legend(title='Survived',loc='upper right',bbox_to_anchor=(1.25, 1))
plt.gcf().set_size_inches(10,8)
plt.xticks(rotation=45)
plt.show()

In [46]:
plot_bar(train_data, 'Parch', 'Survived',True)
plt.legend(title='Survived',loc='upper right',bbox_to_anchor=(1.25, 1))
plt.gcf().set_size_inches(10,8)
plt.xticks(rotation=45)
plt.show()

In [47]:
data = train_data[['Sex','Survived']]
data1 = data.loc[data.Sex==0]
data2 = data.loc[data.Sex!=0]

plt.figure(figsize=(16,8),dpi=60)

ax1 = plt.subplot(121, aspect='equal')
data1['Survived'].value_counts().plot.pie(startangle=90,autopct='%1.1f%%', ax=ax1)
ax1.title.set_text('Male')

ax2 = plt.subplot(122, aspect='equal')
data2['Survived'].value_counts().plot.pie(startangle=90,autopct='%1.1f%%', ax=ax2)
ax2.title.set_text('Female')

plt.show()

In [48]:
sns.barplot(x = "Embarked", y = "Fare", hue = "Survived", data = train_data)
plt.show()

In [49]:
sns.kdeplot(train_data['Fare'][train_data.Survived == 1], color=custom_colors[2], shade=True)
sns.kdeplot(train_data['Fare'][train_data.Survived == 0], color=custom_colors[1], shade=True)
plt.legend(['Survived', 'Not Survived'])
plt.show()

In [50]:
sns.kdeplot(train_data['Age'][train_data.Survived == 1], color=custom_colors[2], shade=True)
sns.kdeplot(train_data['Age'][train_data.Survived == 0], color=custom_colors[1], shade=True)
plt.legend(['Survived', 'Not Survived'])
plt.show()

In [51]:
mask = np.triu(np.ones_like(train_data.corr(), dtype=bool))
fig, ax = plt.subplots(figsize=(16,10),dpi=80, facecolor='w', edgecolor='k')
sns.heatmap(train_data.corr(), mask=mask, cmap="YlGnBu", vmax=.3, center=0,annot = True,
            square=True)
plt.show()

In [52]:
expected_values = train_data["Survived"]
train_data.drop("Survived", axis=1, inplace=True)

In [53]:
train_data.drop("Cabin", axis=1, inplace=True)
test_data.drop("Cabin", axis=1, inplace=True)

In [54]:
X = train_data.values
y = expected_values.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)

In [55]:
model = RandomForestClassifier(criterion='gini',
                                           n_estimators=1750,
                                           max_depth=7,
                                           min_samples_split=6,
                                           min_samples_leaf=6,
                                           max_features='auto',
                                           oob_score=True,
                                           random_state=42,
                                           n_jobs=-1,
                                           verbose=1) 

In [56]:
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    2.9s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 1750 out of 1750 | elapsed:    0.6s finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent worker

In [57]:
print("Training accuracy: ", accuracy_score(y_train, y_pred_train))
print("Testing accuracy: ", accuracy_score(y_test, y_pred_test))

Training accuracy:  0.8778089887640449
Testing accuracy:  0.8435754189944135


In [58]:
column_values = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] 
X_train
X_train_df = pd.DataFrame(data = X_train,   
                  columns = column_values) 
X_test_df = pd.DataFrame(data = X_test,   
                  columns = column_values) 

In [59]:
importances = model.feature_importances_
indices = np.argsort(importances)
features = X_train_df.columns
plt.title('Feature Importance')
plt.barh(range(len(indices)), importances[indices], color=custom_colors[2], align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

In [60]:
print("\nConfusion Matrix\n")
cm = confusion_matrix(y_test, y_pred_test)
print(cm)


Confusion Matrix

[[101   9]
 [ 19  50]]


In [61]:
ax= plt.subplot()
sns.heatmap(cm, annot=True, ax = ax, fmt='g',cmap='Blues') #annot=True to annotate cells

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')

plt.show()

In [62]:
fpr, tpr, _ = roc_curve(y_test, y_pred_test)
roc_auc = auc(fpr, tpr)
print("\nROC AUC on evaluation set",roc_auc )


ROC AUC on evaluation set 0.8214097496706194


In [63]:
plt.figure()
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc,color=custom_colors[0])
plt.plot([0, 1], [0, 1], 'k--',color=custom_colors[2])
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [64]:
model.fit(train_data, expected_values)
print("%.4f" % model.oob_score_)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:    1.4s
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1750 out of 1750 | elapsed:    3.0s finished


0.8294


In [66]:
passenger_IDs = pd.read_csv("https://raw.githubusercontent.com/benvictoria21/pandas/master/dataset/Titanic%20-%20Machine%20Learning%20from%20Disaster/test.csv")[["PassengerId"]].values
preds = model.predict(test_data.values)
preds

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.0s
[Parallel(n_jobs=2)]: Done 196 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 446 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 796 tasks      | elapsed:    0.3s
[Parallel(n_jobs=2)]: Done 1246 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 1750 out of 1750 | elapsed:    0.6s finished


array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [67]:
df = {'PassengerId': passenger_IDs.ravel(), 'Survived': preds}
df_predictions = pd.DataFrame(df).set_index(['PassengerId'])
df_predictions.head(10)

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,0
897,0
898,1
899,0
900,1
901,0


In [71]:
df = pd.read_csv('https://raw.githubusercontent.com/benvictoria21/pandas/master/dataset/Titanic%20-%20Machine%20Learning%20from%20Disaster/gender_submission.csv')

df

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
