In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
train_dataset = pd.read_csv('./train.csv')
test_dataset = pd.read_csv('./test.csv')

In [3]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [4]:
train_dataset.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
test_dataset.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


In [79]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming the dataset is loaded as train_dataset
# train_dataset = pd.read_csv('train_dataset.csv')

# Separate features and target
X = train_dataset.drop(['Transported', 'Name'], axis=1)
y = train_dataset['Transported']

# Define feature groups
numerical_features = ['PassengerId', 'Age', 'RoomService',
                      'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Handle 'Cabin' feature
X['Deck'] = X['Cabin'].str[0]
X['Side'] = X['Cabin'].str[-1]
categorical_features.extend(['Deck', 'Side'])
X.drop('Cabin', axis=1, inplace=True)

# Convert all categorical features to string type
for feature in categorical_features:
    X[feature] = X[feature].astype(str)

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Fit and transform the data
X_encoded = preprocessor.fit_transform(X)

# Get feature names after encoding
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = numerical_features + list(cat_feature_names)

# Convert to DataFrame
X_encoded = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)

# Convert target to numerical
y = y.map({True: 1, False: 0})

In [92]:
X_encoded.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_nan,Side_P,Side_S,Side_nan
0,1301.0,27.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1801.0,19.0,0.0,9.0,0.0,2823.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1901.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2101.0,38.0,0.0,6652.0,0.0,181.0,585.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2301.0,20.0,10.0,0.0,635.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [23]:
X_encoded.columns

Index(['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars',
       'HomePlanet_nan', 'CryoSleep_False', 'CryoSleep_True', 'CryoSleep_nan',
       'Destination_55 Cancri e', 'Destination_PSO J318.5-22',
       'Destination_TRAPPIST-1e', 'Destination_nan', 'VIP_False', 'VIP_True',
       'VIP_nan', 'Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F',
       'Deck_G', 'Deck_T', 'Deck_nan', 'Side_P', 'Side_S', 'Side_nan'],
      dtype='object')

In [21]:
y.head()

0    0
1    1
2    0
3    0
4    1
Name: Transported, dtype: int64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

### Selecting the model

In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt_clf = DecisionTreeClassifier(random_state=42)

grid_search = GridSearchCV(estimator=dt_clf, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

In [35]:
grid_search.best_estimator_

In [37]:
accuracy_dict = {}

scores = cross_val_score(grid_search, X_train, y_train, cv=5)
print("Accuracy:", scores.mean())
accuracy_dict['DecisionTreeClassifier'] = scores.mean()

Accuracy: 0.7782579687508081


In [38]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_train, y_train)

In [39]:
grid_search.best_estimator_

In [50]:
grid_search.best_score_

0.8074504652209218

In [42]:
scores = cross_val_score(grid_search, X_train, y_train, cv=2)
print("Accuracy:", scores.mean())
accuracy_dict['RandomForestClassifier'] = scores.mean()

KeyboardInterrupt: 

In [45]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

logisticRegression = make_pipeline(
    StandardScaler(),
    LogisticRegression(multi_class='multinomial', solver='lbfgs')
)

logisticRegression.fit(X_train, y_train)

In [46]:
scores = cross_val_score(logisticRegression, X_train, y_train, cv=5)
print("Accuracy:", scores.mean())
accuracy_dict['LogisticRegression'] = scores.mean()

Accuracy: 0.7923521714619677


In [51]:
from sklearn.svm import SVC
from scipy.stats import loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV

param_distrib = {
    "svc__gamma": loguniform(0.001, 0.1),
    "svc__C": uniform(1, 10)
}

svm_clf = make_pipeline(
    StandardScaler(),
    SVC(random_state=42)
)

random_search = RandomizedSearchCV(
    svm_clf,
    param_distrib,
    n_iter=50,  # Number of parameter settings to sample
    cv=5,       # Number of cross-validation folds
    random_state=42
)

random_search.fit(X_train, y_train)
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'svc__C': 7.842330265121569, 'svc__gamma': 0.007591104805282696}
Best Score: 0.8045751464967493


In [80]:
from sklearn.ensemble import RandomForestClassifier

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_clf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(
    estimator=rf_clf, param_grid=param_grid, cv=5, scoring='accuracy')

grid_search.fit(X_test, y_test)

In [81]:
grid_search.best_estimator_

In [55]:
from sklearn.svm import SVC
from scipy.stats import loguniform, uniform
from sklearn.model_selection import RandomizedSearchCV

param_distrib = {
    "svc__gamma": loguniform(0.001, 0.1),
    "svc__C": uniform(1, 10)
}

svm_clf = make_pipeline(
    StandardScaler(),
    SVC(random_state=42)
)

random_search = RandomizedSearchCV(
    svm_clf,
    param_distrib,
    n_iter=50,  # Number of parameter settings to sample
    cv=5,       # Number of cross-validation folds
    random_state=42
)

random_search.fit(X_test, y_test)
print("Best Parameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Best Parameters: {'svc__C': 8.31993941811405, 'svc__gamma': 0.015751320499779727}
Best Score: 0.7728642883169367


In [97]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Assuming the dataset is loaded as train_dataset
# train_dataset = pd.read_csv('train_dataset.csv')

# Separate features and target
X = test_dataset.drop(['Name'], axis=1)

# Define feature groups
numerical_features = ['PassengerId', 'Age', 'RoomService',
                      'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

# Handle 'Cabin' feature
X['Deck'] = X['Cabin'].str[0]
X['Side'] = X['Cabin'].str[-1]
categorical_features.extend(['Deck', 'Side'])
X.drop('Cabin', axis=1, inplace=True)

# Convert all categorical features to string type
for feature in categorical_features:
    X[feature] = X[feature].astype(str)

# Create preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])


# Extract PassengerId before dropping it
passenger_ids = X['PassengerId']

# Fit and transform the data
X_encoded = preprocessor.fit_transform(X)

# Get feature names after encoding
onehot_encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_feature_names = onehot_encoder.get_feature_names_out(categorical_features)
feature_names = numerical_features + list(cat_feature_names)

# Convert to DataFrame
X_encoded = pd.DataFrame(X_encoded, columns=feature_names, index=X.index)

In [88]:
X_encoded.head()

Unnamed: 0,PassengerId,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,...,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,Deck_nan,Side_P,Side_S,Side_nan
0,1301.0,27.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1801.0,19.0,0.0,9.0,0.0,2823.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1901.0,31.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,2101.0,38.0,0.0,6652.0,0.0,181.0,585.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,2301.0,20.0,10.0,0.0,635.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [89]:
rf_clf_best = RandomForestClassifier(min_samples_leaf=4, min_samples_split=10,
                                     n_estimators=50, random_state=42)

rf_clf_best.fit(X_train, y_train)

In [90]:
predict = rf_clf_best.predict(X_encoded)
boolean_predictions = predict == 1

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- PassengerId


In [91]:
boolean_predictions

array([ True, False,  True, ...,  True,  True,  True])

In [98]:
output_df = pd.DataFrame({
    'PassengerId': passenger_ids,
    'Transported': boolean_predictions
})

output_file_path = 'prediction.csv'
output_df.to_csv(output_file_path, index=False)