In [262]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [263]:
df = pd.read_csv("https://raw.githubusercontent.com/ajiteshshree/Spaceship-Titanic/main/spaceship-titanic/train.csv")
df_test = pd.read_csv("https://raw.githubusercontent.com/ajiteshshree/Spaceship-Titanic/main/spaceship-titanic/test.csv")

In [264]:
def slice_ids(df):
    original_pid = df['PassengerId'].tolist()
    df['PassengerId'] = [int(x[0:4]) for x in original_pid]
    df['PersonNo'] = [int(x[5:]) for x in original_pid]
    return df

In [265]:
df = slice_ids(df)
df_test = slice_ids(df_test)

In [266]:
df.drop('Name',axis=1,inplace=True)
df_test.drop('Name',axis=1,inplace=True)

In [267]:
print("df.shape : ", df.shape)
print("df_test.shape : ", df_test.shape)

df.shape :  (8693, 14)
df_test.shape :  (4277, 13)


In [268]:
#filling NaN values
def preprocess(df):
    li = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck', 'Age']
    for col in li:
        df[col].fillna(df[col].mean(),inplace=True)
    df["HomePlanet"].fillna(df["HomePlanet"].mode()[0],inplace=True)
    df["CryoSleep"].fillna(df["CryoSleep"].mode()[0],inplace=True)
    df["Destination"].fillna(df["Destination"].mode()[0],inplace=True)
    df["VIP"].fillna(df["VIP"].mode()[0],inplace=True)
    df["Cabin"].fillna("",inplace=True)

In [269]:
preprocess(df)
preprocess(df_test)

In [270]:
print("Number of nulls in cabin : ",df['Cabin'].isnull().sum())
df['Cabin'].mode()

Number of nulls in cabin :  0


0    
Name: Cabin, dtype: object

In [271]:
y = np.array(df["Transported"])
df.drop(['Transported'], axis=1, inplace=True)

In [272]:
print([x for x in list(df.isnull().sum()) if x != 0], end=",")
print([x for x in list(df_test.isnull().sum()) if x != 0])

[],[]


In [273]:
def slice_cabin(df):
    b=0
    cabin_deck =[]
    cabin_side = []
    while b < df.shape[0]:
        if df.iloc[b,3] != "":
            cabin_deck.append(df.iloc[b,3][0])
            cabin_side.append(df.iloc[b,3][-1])
        else:
            cabin_deck.append("")
            cabin_side.append("")
        b += 1
    df['cabin_deck'] = cabin_deck
    df['cabin_side'] = cabin_side
    df.drop('Cabin', axis=1, inplace=True)

In [274]:
slice_cabin(df)
slice_cabin(df_test)

In [275]:
df_test.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,PersonNo,cabin_deck,cabin_side
0,13,Earth,True,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,1,G,S
1,18,Earth,False,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,1,F,S
2,19,Europa,True,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,1,C,S
3,21,Europa,False,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,1,C,S
4,23,Earth,False,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,1,F,S


In [276]:
# cols = df.columns
# cols2 = df_test.columns
# for x in cols:
#         print(x,end = ' : ')
#         print(df[x].unique())
# print()
# for t in cols2:
#     print(t,end = ' : ')
#     print(df_test[t].unique())       

In [277]:
non_int_cols = [k for k, v in dict(df.dtypes).items() if v == 'O' or v == 'bool']

In [278]:
non_int_cols

['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'cabin_deck', 'cabin_side']

In [283]:
def encode(df):
    le = LabelEncoder()    
    for x in non_int_cols:
        df[x] = le.fit_transform(df[x])
    df = pd.get_dummies(df, columns=df.columns)
    return df

In [284]:
df = encode(df)
df_test = encode(df_test)

In [285]:
df.shape

(8693, 12861)

In [287]:
df_test.shape

(4277, 7266)

In [314]:
df, df_test = df.align(df_test, join='left', axis=1, fill_value=0)

In [318]:
df_test

Unnamed: 0,PassengerId_1,PassengerId_2,PassengerId_3,PassengerId_4,PassengerId_5,PassengerId_6,PassengerId_7,PassengerId_8,PassengerId_9,PassengerId_10,...,cabin_deck_2,cabin_deck_3,cabin_deck_4,cabin_deck_5,cabin_deck_6,cabin_deck_7,cabin_deck_8,cabin_side_0,cabin_side_1,cabin_side_2
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,0,0,1
4273,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
4274,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0
4275,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,1,0


In [323]:
logisticRegr = LogisticRegression(max_iter=10000)
logisticRegr.fit(df,y)
print(logisticRegr.score(df, y))

0.8956631772690671


In [324]:
res = logisticRegr.predict(df_test).tolist()

In [325]:
predictions = pd.DataFrame({'PassengerId' : original_pid2,'Transported':res})

In [326]:
predictions.set_index('PassengerId')

Unnamed: 0_level_0,Transported
PassengerId,Unnamed: 1_level_1
0013_01,True
0018_01,False
0019_01,True
0021_01,False
0023_01,True
...,...
9266_02,True
9269_01,False
9271_01,True
9273_01,True


In [327]:
predictions.to_csv('predictions.csv',index = False)