In [1]:
import warnings

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

%matplotlib inline
plt.rcParams['figure.figsize'] = 12,6

warnings.filterwarnings("ignore")


In [2]:
# Read the train and test splits into DataFrames.
train_csv = "../input/spaceship-titanic/train.csv"
test_csv = "../input/spaceship-titanic/test.csv"
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [4]:
# Count the missing values in each training column.
train.isnull().sum()

In [5]:
# Number of distinct values in each training column.
train.nunique(axis=0)

In [6]:
# Columns removed from both splits before modelling.
DROPPED_COLUMNS = ['Cabin', 'Name']

# errors='ignore' keeps this cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
train = train.drop(columns=DROPPED_COLUMNS, errors='ignore')
test = test.drop(columns=DROPPED_COLUMNS, errors='ignore')

In [7]:
# Print the training frame's column dtypes and non-null counts.
train.info()

# Exploratory data analysis

In [8]:
# Show the first ten training rows.
train.head(n=10)

In [9]:
# Histogram of passenger ages.
# The style is set before drawing because seaborn styles only apply to axes
# created after the call.
sns.set_style('white')
# histplot is the supported replacement for the deprecated distplot(kde=False).
sns.histplot(data=train, x='Age', color='green')
plt.xlabel('Age', fontsize=25, color='black', fontname='monospace')
plt.ylabel('Count', fontsize=25, color='black', fontname='monospace')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [10]:
# Bar chart of how many passengers fall into each Transported class.
# The style is set before drawing because seaborn styles only apply to axes
# created after the call.
sns.set_style('white')
sns.countplot(data=train, x="Transported", palette="Set3")
plt.xlabel('Transported', fontsize=25, color='black', fontname='monospace')
plt.ylabel('Count', fontsize=25, color='black', fontname='monospace')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [11]:
# Bar chart of passenger counts per home planet.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
ax = sns.countplot(data=train, x='HomePlanet', palette="Set3")
plt.xlabel('Home Planet', fontsize=25, color='black', fontname='monospace')
plt.ylabel('Count', fontsize=25, color='black', fontname='monospace')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

In [12]:
# Passenger counts per home planet, split by Transported, with each bar
# labelled by its share of all training rows.
sns.set(style="whitegrid")
total = float(len(train))
ax = sns.countplot(x="HomePlanet", hue="Transported", data=train, palette="Set3")
plt.legend(fontsize=20, frameon=True, fancybox=True, shadow=True, framealpha=1)
plt.title('Transportation of people from different planets', fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
for p in ax.patches:
    height = p.get_height()
    # Skip empty or placeholder patches (zero or NaN height) so they do not
    # produce stray "0.0%" / "nan%" labels.
    if not height > 0:
        continue
    percentage = '{:.1f}%'.format(100 * height/total)
    # Anchor at the bar's horizontal midpoint; with ha='center' the label is
    # then centred over the bar rather than over its right edge.
    x = p.get_x() + p.get_width() / 2
    ax.annotate(percentage, (x, height), ha='center')
plt.show()

In [38]:
# Columns included in the correlation heatmap.
CORR_COLUMNS = ['CryoSleep', 'Age', 'VIP', 'RoomService', 'FoodCourt',
                'ShoppingMall', 'Spa', 'VRDeck', 'Transported']

# Cast to float before correlating. Boolean-like columns that contain missing
# values load as object dtype, and how DataFrame.corr() treats non-numeric
# columns depends on the pandas version (older releases silently leave them
# out). The explicit cast keeps every listed column in the matrix.
# NOTE(review): assumes CryoSleep and VIP hold True/False (plus NaN) — confirm.
corr_matrix = train[CORR_COLUMNS].astype(float).corr()
sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu', linewidths=0.2)
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.title("Correlation between data", fontsize=20)
plt.xlabel('Features', fontsize=25, color='black', fontname='monospace')
plt.ylabel('Features', fontsize=25, color='black', fontname='monospace')
plt.xticks(fontsize=20, rotation=70)
plt.yticks(fontsize=20)
plt.show()

# Data Preprocessing

In [15]:
# Relocate the VIP column to position 1 of the test frame: pop() removes it
# in place and insert() puts it back at the requested location.
VIP = test.pop('VIP')
test.insert(loc=1, column='VIP', value=VIP)

# Display the reordered frame.
test

In [16]:
# Relocate the VIP column to position 1 of the training frame: pop() removes
# it in place and insert() puts it back at the requested location.
VIP = train.pop('VIP')
train.insert(loc=1, column='VIP', value=VIP)

# Display the reordered frame.
train

In [17]:
# Number of distinct values in each test column.
test.nunique(axis=0)

In [18]:
# Count the missing values in each test column.
test.isnull().sum()

In [19]:
# Split the frames into NumPy arrays by position: every training column except
# the last becomes a feature, and the last column is the label.
# Later cells index these arrays positionally (columns 0-4 are handled as
# categorical, 5 onward as numeric), so the column order must not change here.
X = train.iloc[:, :-1].values
# Select the label with a scalar column index so y is a 1-D vector of shape
# (n,), not an (n, 1) column, which is the shape estimators expect for targets.
y = train.iloc[:, -1].values
X_t = test.values

# Dealing with numerical missing values

In [20]:
# Fill gaps in the training numeric columns (positions 5 onward) with each
# column's mean. The imputer gets its own name so that re-running cells out of
# order cannot pick up an imputer fitted for a different purpose.
train_mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 5:] = train_mean_imputer.fit_transform(X[:, 5:])

In [21]:
# Fill gaps in the test numeric columns (positions 5 onward) using means
# learned from the TRAINING features, so both splits receive the same
# transformation and no statistics are estimated from the test set.
# Mean imputation does not move a column's mean, so fitting on X after its own
# gaps were filled yields the same statistics as fitting on the raw data.
test_mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
test_mean_imputer.fit(X[:, 5:])
X_t[:, 5:] = test_mean_imputer.transform(X_t[:, 5:])

# Dealing with categorical missing values

In [22]:
# Fill gaps in the first five training columns with each column's most
# frequent value. The imputer gets its own name so that re-running cells out
# of order cannot pick up an imputer fitted for a different purpose.
train_mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X[:, :5] = train_mode_imputer.fit_transform(X[:, :5])

In [23]:
# Fill gaps in the first five test columns using the most frequent values of
# the TRAINING features, so both splits receive the same transformation.
# Filling a column with its own mode does not change the mode, so fitting on X
# after its gaps were filled gives the same fill values as the raw data.
test_mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
test_mode_imputer.fit(X[:, :5])
X_t[:, :5] = test_mode_imputer.transform(X_t[:, :5])

# Encoding categorical variables

In [24]:
# Replace the values in training columns 1-4 with integer codes. Each column
# gets a fresh encoder, so no fitted state is shared between columns.
for col in range(1, 5):
    X[:, col] = LabelEncoder().fit_transform(X[:, col])

In [25]:
# Encode test columns 1-4 with the label -> integer mapping of the TRAINING
# data. Refitting on the test set would assign codes from the test set's own
# categories, which can silently disagree with the codes the model was
# trained on.
# X was built from train.iloc[:, :-1], so column `col` of X matches column
# `col` of `train`. The mapping is fitted on the frame's original non-null
# labels because X itself may already hold integer codes at this point.
for col in range(1, 5):
    encoder = LabelEncoder().fit(train.iloc[:, col].dropna().to_numpy())
    # transform() raises ValueError on a category never seen in training,
    # which is preferable to silently mis-encoding it.
    X_t[:, col] = encoder.transform(X_t[:, col])

# Model training and prediction

In [26]:
# Fit a gradient-boosted tree classifier on the prepared training matrix and
# predict labels for the prepared test matrix.
# NOTE(review): X still contains its first column (apparently the passenger
# identifier — confirm) as a model input. Decide whether an id should be a
# feature; dropping it means slicing BOTH X and X_t the same way.
classifier = XGBClassifier(random_state=0)  # fixed seed for reproducible reruns
# np.ravel guarantees a 1-D target even if y arrives as an (n, 1) column.
classifier.fit(X, np.ravel(y))
y_pred = classifier.predict(X_t)

# Submission

In [27]:
# Start the submission from the test ids. The explicit copy makes `submission`
# an independent frame, so adding a column to it later is not an assignment
# into a slice of `test`.
submission = test[["PassengerId"]].copy()
submission

In [28]:
# Attach the predictions as a new column. assign() returns a new frame, so
# this never writes into a view of `test`.
# The predictions are cast to the dtype of the training label because,
# depending on the XGBoost version, predict() may return integer class indices
# rather than the original label values; the cast is a no-op when the dtypes
# already match.
submission = submission.assign(
    Transported=np.asarray(y_pred).ravel().astype(train["Transported"].dtype)
)

In [29]:
# Write the submission file without the DataFrame index column.
# index=False is the documented boolean flag; index=None only worked because
# None is falsy.
submission.to_csv("submission.csv", index=False)