In [135]:
# Import Libraries 
import os
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [136]:
# Load the labelled training rows and the unlabelled competition rows.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [137]:
# PassengerId is an identifier, not a feature: remove it from both frames,
# keeping the test ids aside for the submission file.
train = train.drop(columns=['PassengerId'])
testPrpId = test.pop('PassengerId')

In [138]:
# Separate the target from the features: pop() removes the 'Transported'
# column from `train` and hands it back in one step.
y = train.pop('Transported')

# Encode the target labels as integer class codes.
labelencoder = LabelEncoder()
y = labelencoder.fit_transform(y)


In [139]:
# The passenger name is not used as a feature; remove it from both frames.
train = train.drop(columns=['Name'])
test = test.drop(columns=['Name'])

In [140]:
# Separate the cabin string into 3 columns: Cabin_Type, Cabin_Number, Cabin_Side.
def _split_cabin(frame):
    """Return a copy of ``frame`` with ``Cabin`` replaced by three string columns.

    The cabin string is split on '/' instead of being indexed at fixed
    character positions. Fixed positions only line up when the middle part is
    a single character; for longer values position 4 holds a digit or the
    separator rather than the last part (which is why a '/' -> '0' patch used
    to be needed for Cabin_Side).

    Assumes cabins look like '<type>/<number>/<side>' -- TODO confirm against
    the data dictionary.

    Columns produced (missing or malformed cabins give '0' in every column):
      * Cabin_Type   -- first '/'-separated part.
      * Cabin_Number -- first character of the second part only. This is the
        same low-cardinality feature the notebook used before, so the label
        encoding fitted on the training rows keeps covering the test rows.
      * Cabin_Side   -- third '/'-separated part.
    """
    parts = frame['Cabin'].str.split('/')
    return (
        frame
        .assign(
            Cabin_Type=parts.str[0].fillna('0'),
            Cabin_Number=parts.str[1].str[0].fillna('0'),
            Cabin_Side=parts.str[2].fillna('0'),
        )
        .drop(columns=['Cabin'])
    )


# Same transformation for both frames, so the columns cannot drift apart.
train = _split_cabin(train)
test = _split_cabin(test)

In [141]:
# Converting string columns to numeric and imputing missing values.
# All imputation statistics come from the training rows only, so train and
# test are treated consistently and nothing is learned from the test set.
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Type','Cabin_Number', 'Cabin_Side']
numerical_features = ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for feature in categorical_features:
    # Impute BEFORE encoding. Once a column is label-encoded it holds integer
    # codes with no NaN left, so a fillna placed after the encoding never
    # fires (and the mean of arbitrary label codes would be meaningless anyway).
    most_frequent = train[feature].mode(dropna=True)
    if not most_frequent.empty:
        train[feature] = train[feature].fillna(most_frequent.iloc[0])
        test[feature] = test[feature].fillna(most_frequent.iloc[0])

    # One encoder per column, under its own name so the `labelencoder` fitted
    # on the target earlier in the notebook is not overwritten.
    # NOTE: transform() raises ValueError if a test row holds a category that
    # never occurs in the training rows.
    feature_encoder = LabelEncoder().fit(train[feature])
    train[feature] = feature_encoder.transform(train[feature])
    test[feature] = feature_encoder.transform(test[feature])

# impute missing numeric values with the training mean
for feature in numerical_features:
    train_mean = train[feature].mean()
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # that chained in-place form is not guaranteed to update the frame.
    train[feature] = train[feature].fillna(train_mean)
    test[feature] = test[feature].fillna(train_mean)

In [142]:
# Column groups handled by the preprocessing pipeline.
numerical_features = ['Age','RoomService','FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'Cabin_Type','Cabin_Number', 'Cabin_Side']

# Numeric columns: fill gaps with the column mean.
numerical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])


In [143]:
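# Use the full pipeline to transform both training and test sets.
# NOTE: currently disabled. The features are already encoded and imputed by
# hand in the cells above, so `preprocessor` is defined but never applied.
# preprocessor.fit(train)
# train = preprocessor.transform(train)
# test = preprocessor.transform(test)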

In [130]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling from the training rows only, then apply the same fitted
# scaler to both frames. with_mean=False: scale without centring.
sc = StandardScaler(with_mean=False).fit(train)
train = sc.transform(train)
test = sc.transform(test)


In [145]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.2,
    random_state=42,
)

In [146]:
# Define models
# Estimators whose fit involves randomness get a fixed seed (the same value
# as the train/validation split) so the comparison is reproducible from one
# run to the next.
models = [
    LogisticRegression(),
    SVC(),
    KNeighborsClassifier(n_neighbors = 3),
    GaussianNB(),
    Perceptron(random_state=42),
    LinearSVC(random_state=42),
    SGDClassifier(random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(n_estimators=100, random_state=42)
]

In [None]:
# Evaluate each model in turn
# The target is a class label, so report accuracy (share of held-out rows
# predicted correctly). R-squared is a regression metric and is not
# meaningful for class predictions.
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = (y_pred == y_test).mean()
    print("Model: %s" % model.__class__.__name__)
    print("Accuracy: %f" % accuracy)
    print("\n")

In [142]:
# Final model
# The target is categorical, so use the classifier that is already imported
# and evaluated above. RandomForestRegressor is never imported in this
# notebook (NameError on a fresh kernel) and would predict continuous values.
final_model = RandomForestClassifier(n_estimators=100, random_state=42)

In [143]:
# Train on every labelled row, then predict the competition test set.
# fit() returns the fitted estimator, so the two calls can be chained.
final_predictions = final_model.fit(train, y).predict(test)

In [144]:
# save results to file
# Name the columns after this dataset's own id and target columns
# ('PassengerId' / 'Transported'), which are the columns read from the CSVs
# at the top of the notebook.
# Predictions are encoded class codes; `>= 0.5` turns code 1 into True and
# code 0 into False.
# NOTE(review): assumes 'Transported' is boolean in train.csv, so the label
# encoding maps False -> 0 and True -> 1 -- confirm against the data.
results = pd.DataFrame({
    'PassengerId': testPrpId,
    'Transported': final_predictions >= 0.5,
})

filename = "submission.csv"

# mode='w' already truncates an existing file, so no explicit delete is needed.
results.to_csv(filename, index=False, header=True, mode='w')