In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file available under the Kaggle input directory.
# os.walk() must be given a directory: when handed a file path (the previous
# '.../train.csv') it silently yields nothing, so the loop printed no files.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split

In [None]:
# Read the training CSV into a DataFrame and preview the first rows.
train_csv_path = "/kaggle/input/d/gopikrishnayogan/spaceship-titanic/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

In [None]:
train_df.info()

1. **PassengerId** - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
2. **HomePlanet** - The planet the passenger departed from, typically their planet of permanent residence.
3. **CryoSleep** - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
4. **Cabin** - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
5. **Destination** - The planet the passenger will be debarking to.
6. **Age** - The age of the passenger.
7. **VIP** - Whether the passenger has paid for special VIP service during the voyage.
8. **RoomService, FoodCourt, ShoppingMall, Spa, VRDeck** - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
9. **Name** - The first and last names of the passenger.
10. **Transported**- Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

- There are 8693 rows and 13 columns
- Our target variable is of boolean type
- Some of the columns appear to contain null values

In [None]:
train_df.duplicated().sum()

- There are no duplicate rows present

In [None]:
train_df.isna().sum()

* There are many null values in the features
* Null values can be imputed or dropped

In [None]:
train_df.describe().T

- Most of the numerical columns seem to be centred near zero, except the Age column

In [None]:
train_df.describe(include='object').T

## Observations :-
- PassengerId is a unique feature
- HomePlanet has 3 unique values, and the majority of passengers are from Earth
- CryoSleep - most of the passengers are not in hibernation (sleep state)
- Cabin - looks like a composite value; we may be able to split it, so let's check
- Most of the passengers are not VIP
- There are only three destinations
- Name also seems to be a unique column; this needs to be checked

In [None]:
# Break the 'deck/num/side' Cabin string into its three parts and store each
# part as its own column.
cabin_parts = train_df['Cabin'].str.split('/', expand=True)
train_df[['cabin_deck', 'cabin_num', 'cabin_side']] = cabin_parts

In [None]:
train_df.sample(2)

In [None]:
train_df.describe(include='object').T

- Now it is clearer: the Cabin value has been divided into three columns (cabin_deck, cabin_num, cabin_side)

In [None]:
train_df.columns

In [None]:
# Remove the PassengerId and Cabin columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution used to raise KeyError because the
# columns had already been removed from train_df.
train_df = train_df.drop(columns=['PassengerId', 'Cabin'], errors='ignore')

In [None]:
# Split the columns into "categorical" and "numerical" by cardinality.
# The two conditions must partition the columns: with `<= 25` and `>= 25` a
# column holding exactly 25 distinct values was placed in BOTH lists.
# NOTE: this is a cardinality heuristic, not a dtype check, so any
# high-cardinality text column also ends up in num_feat.
CARDINALITY_THRESHOLD = 25
n_unique = train_df.nunique()  # computed once instead of once per comprehension step
cat_feat = [feat for feat in train_df.columns if n_unique[feat] <= CARDINALITY_THRESHOLD]
num_feat = [feat for feat in train_df.columns if n_unique[feat] > CARDINALITY_THRESHOLD]
print("Categorical features : \n", cat_feat)
print("Numerical features : \n", num_feat)

In [None]:
# Bar chart of how many rows fall into each Transported class.
target_counts = train_df['Transported'].value_counts()
target_counts.plot.bar();

- Classes are balanced

In [None]:
# Keep a string-typed copy of the target column; train_df itself is not modified.
transported = train_df['Transported'].astype('str')
# print() emits the Series as plain text (head/tail plus length and dtype).
print(transported)

In [None]:
train_cat = train_df[cat_feat]
train_cat.head()

In [None]:
[train_cat[x].value_counts() for x in train_cat.columns]

In [None]:
train_cat.isna().mean()

In [None]:
def null_target_relation(catfeat, c='darkblue', data=None):
    """Show how the target is distributed among rows where `catfeat` is missing.

    Parameters
    ----------
    catfeat : str
        Column whose missing (null) rows are inspected.
    c : str, default 'darkblue'
        Bar colour handed to the pandas plot call.
    data : pandas.DataFrame, optional
        Frame containing `catfeat` and a 'Transported' column. When omitted,
        the notebook-level `train_cat` frame is used, which is what this
        function always read from before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        Count of each 'Transported' value among the rows whose `catfeat`
        is null. Empty when the column has no missing values.
    """
    if data is None:
        data = train_cat
    relation = data.loc[data[catfeat].isnull(), 'Transported'].value_counts()
    # pandas raises when asked to bar-plot an empty Series, so only draw the
    # chart when at least one row actually has a missing value.
    if not relation.empty:
        relation.plot(kind='bar', color=c)
        plt.title(str(catfeat) + ' and Transported')
    return pd.DataFrame(relation)

In [None]:
null_target_relation('HomePlanet')

In [None]:
null_target_relation('CryoSleep',c='green')

In [None]:
null_target_relation('Destination',c='blue')

In [None]:
null_target_relation('VIP',c='brown')

In [None]:
null_target_relation('cabin_deck',c='navy')

In [None]:
null_target_relation('cabin_side',c='orange')

- Rows with null values are spread roughly equally across both classes of our target feature

In [None]:
train_cat.groupby('HomePlanet')['CryoSleep'].value_counts()

In [None]:
train_cat.groupby('HomePlanet')['Destination'].value_counts()

In [None]:
train_cat.groupby('HomePlanet')['VIP'].value_counts()

* More number of VIP passengers are from Europa and Mars
* No VIP Passenger from Earth 

In [None]:
train_cat.groupby('HomePlanet')['cabin_deck'].value_counts()

In [None]:
train_cat.groupby('HomePlanet')['cabin_side'].value_counts()

In [None]:
train_cat.groupby('CryoSleep')['VIP'].value_counts()

In [None]:
train_num = train_df[num_feat]
train_num.head()

In [None]:
train_df[train_df['Age'].isna()]['Transported'].value_counts()

In [None]:
train_df[train_df['RoomService'].isna()]['Transported'].value_counts()

- Missing values in the numerical columns are likewise spread roughly equally across both classes of the target feature

In [None]:
train_df[train_df['Age']==0.0]['Age'].value_counts()

- Age should never be zero (infants are unlikely to make this type of journey), so these zero values look suspicious


In [None]:
train_df['Age'].isna().sum()

In [None]:
train_df['Age'].plot(kind='hist');

In [None]:
# Pass the Series by keyword: a bare positional argument is interpreted
# differently across seaborn releases (as `x` in older versions, as `data` in
# newer ones), which changes the orientation of the plot or emits a warning.
sns.boxplot(x=train_df['Age']);

In [None]:
train_df.head()

In [None]:
# Histogram grid for the columns in train_num that pandas can plot.
# `ax=` is documented to take matplotlib Axes, but the previous code passed a
# Figure object under the name `ax`; let pandas size its own figure via
# `figsize` instead.
train_num.hist(figsize=(20,15),color='darkblue',grid=False);

In [None]:
# NOTE: despite the `vip_` prefix, this holds RoomService for rows where VIP == False (non-VIP passengers).
# Rows whose VIP value is missing do not compare equal to False and are therefore left out.
vip_roomservice = train_df[train_df['VIP']==False]['RoomService'] # other spend columns to inspect the same way: ['FoodCourt','ShoppingMall','Spa','VRDeck']

In [None]:
vip_roomservice.min(),vip_roomservice.max()

In [None]:
# Rebinds `vip_roomservice` to RoomService for rows where VIP == True; any earlier value of the name is overwritten.
# Rows whose VIP value is missing do not compare equal to True and are therefore left out.
vip_roomservice = train_df[train_df['VIP']==True]['RoomService'] # other spend columns to inspect the same way: ['FoodCourt','ShoppingMall','Spa','VRDeck']

In [None]:
vip_roomservice.hist()

In [None]:
vip_roomservice.min(),vip_roomservice.max()

- There seems to be an issue with the RoomService feature: passengers who do not have VIP status have higher charges

In [None]:
vip_fc = train_df[train_df['VIP']==True]['FoodCourt'] #['FoodCourt','ShoppingMall','Spa','VRDeck']]

In [None]:
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement documented by seaborn (density-scaled histogram plus KDE curve).
sns.histplot(vip_fc, kde=True, stat='density')
# Ten most frequent FoodCourt amounts in this subset.
vip_fc.value_counts().head(10)

In [None]:
# NOTE: despite the `vip_` prefix, this rebinding holds FoodCourt for rows where VIP == False.
vip_fc = train_df[train_df['VIP']==False]['FoodCourt']
# sns.distplot is deprecated; histplot with kde=True and stat='density' is the
# replacement documented by seaborn (density-scaled histogram plus KDE curve).
sns.histplot(vip_fc, kde=True, stat='density');

In [None]:
vip_fc.value_counts().head()

* It is clear that food court charges are higher for VIP passengers

In [None]:
train_df.loc[train_df['VIP']==False,['ShoppingMall','Spa']].hist(grid=False,color='magenta') #['FoodCourt','ShoppingMall','Spa','VRDeck']]


In [None]:
train_df.loc[train_df['VIP']==True,['ShoppingMall','Spa']].hist(grid=False,color='green') #['FoodCourt','ShoppingMall','Spa','VRDeck']]


In [None]:
# Correlation heatmap of the numeric columns in train_num.
plt.figure(figsize=(10,10))
# train_num is selected by cardinality, not dtype, so it can still contain
# text columns after 'Name' is dropped. pandas >= 2.0 no longer discards
# non-numeric columns silently in corr(); numeric_only=True restricts the
# computation to numeric columns on every pandas version.
sns.heatmap(train_num.drop('Name',axis=1).corr(numeric_only=True),annot=True,cbar=False)
plt.xticks(rotation=90)
plt.yticks(rotation=0);

In [None]:
# Number of distinct names that occur exactly twice.
name_counts = train_num['Name'].value_counts()
(name_counts == 2).sum()

In [None]:
# Copy of the data without the Name column, with the target recoded to 1/0.
train_df1 = train_df.drop(columns='Name')
target_codes = {True: 1, False: 0}
train_df1['Transported'] = (
    train_df1['Transported']
    .astype('object')
    .map(target_codes)
)

In [None]:
train_df1.head()

In [None]:
train_df1.info()

In [None]:
train_df2 = train_df1.select_dtypes([int,float])
train_df2.head()

In [None]:
sns.pairplot(train_df2,hue='Transported')

### Preprocessing

In [None]:
train_df1.shape

In [None]:
train_df_001 = train_df1.copy()

* Dropping Null values

In [None]:
# Discard every row that has at least one missing value.
train_df1 = train_df1.dropna()

In [None]:
# Renumber the rows 0..n-1. drop=True is required: without it reset_index()
# inserts the old row labels as a new 'index' column, which then ends up in the
# feature matrix as a spurious predictor (the original row position).
train_df_fin = train_df1.reset_index(drop=True)

In [None]:
# Target vector and feature matrix (all columns other than the target).
target_col = 'Transported'
y = train_df_fin[target_col]
X = train_df_fin.drop(columns=target_col)

In [None]:
X.shape

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and every score computed from it) reproducible across kernel restarts;
# without it each run shuffled the rows differently.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=42)

In [None]:
X_train.shape,y_train.shape

In [None]:
X_train.info()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on copies so the column assignments below do not write into slices of X.
X_train = X_train.copy()
X_test = X_test.copy()

# Encode only the non-numeric columns. Numeric features keep their real values
# instead of being replaced by label codes computed separately per split.
cat_cols = X_train.select_dtypes(exclude='number').columns

le = LabelEncoder()
for clmn in cat_cols:
    # Learn the category -> code mapping from the training split only ...
    X_train[clmn] = le.fit_transform(X_train[clmn])
    # ... and apply that same mapping to the test split. Re-fitting the encoder
    # on X_test (as was done before) can give the same category a different
    # code in train and test, which silently corrupts the evaluation.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Categories that never occurred in the training split get the code -1.
    X_test[clmn] = X_test[clmn].map(code_of).fillna(-1).astype(int)

In [None]:
X_test

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic (bootstrap samples and feature sub-sampling);
# fixing random_state makes the fitted model and its score reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train,y_train)

In [None]:
rf.score(X_test,y_test)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments swapped the
# predictions were treated as ground truth, so precision and recall were
# exchanged and the per-class support counted predictions, not true labels.
print(classification_report(y_test,y_pred))

### Impute null values