In [8]:
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# y = dataset_df['Transported']
# X = dataset_df.drop(['Transported', 'HomePlanet', 'Destination', 'Deck'], axis=1)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train.head(5)

In [9]:
# Read the training CSV into a DataFrame and report its (rows, columns) shape
df = pd.read_csv('datasets/train.csv')
print(f"Full train dataset shape is {df.shape}")

Full train dataset shape is (8693, 14)


As the output shows, the dataset has 14 columns. However, not all of them are useful for predicting the target. For example, the passenger's `Name` and `PassengerId` are not important for this classification problem, so we drop them.

In [10]:
# Preview the first rows of the raw frame before any columns are removed
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [11]:
# Remove the two columns judged irrelevant to the prediction, then confirm the new shape
df = df.drop(columns=['Name', 'PassengerId'])
print(df.shape)
df.head()

(8693, 12)


Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True


In pandas, `isna()` and `isnull()` both detect missing (NaN) values in a DataFrame or Series. `isnull()` is simply an alias of `isna()`, so the two are interchangeable and always produce the same result; which one to use is purely a matter of preference.

In [17]:
# Build the missing-value mask once and derive both summaries from it
null_mask = df.isna()
nulls_per_column = null_mask.sum().sort_values(ascending=False)  # most incomplete columns first
rows_with_nulls = null_mask.any(axis=1).sum()  # rows missing at least one value

print("The number of Null values for each column is as follow:\n", nulls_per_column)
print("\nThe total number of rows that contains at least one null value is: ", rows_with_nulls)

The number of Null values for each column is as follow:
 CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
Transported       0
dtype: int64

The total number of rows that contains at least one null value is:  1929


As the number of null values is considerable, we need to handle them in our preprocessing step. To do so, let's first look at the data type of each column, because the training step requires all features to be numerical.

In [18]:
# Show each column's dtype and non-null count to see which features need encoding or imputation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Cabin         8494 non-null   object 
 3   Destination   8511 non-null   object 
 4   Age           8514 non-null   float64
 5   VIP           8490 non-null   object 
 6   RoomService   8512 non-null   float64
 7   FoodCourt     8510 non-null   float64
 8   ShoppingMall  8485 non-null   float64
 9   Spa           8510 non-null   float64
 10  VRDeck        8505 non-null   float64
 11  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(5)
memory usage: 755.7+ KB


In [None]:
# Train and evaluate a soft-voting ensemble on the passenger data.
#
# This cell is self-contained with respect to `df`: it builds the feature
# matrix, splits it, preprocesses it and fits the ensemble. Previously it used
# X_train / X_test / y_train / y_test, which no cell defined, and `df` still
# held NaNs and string columns that the estimators cannot consume.

RANDOM_STATE = 42  # one seed for the split and every stochastic estimator

# --- Features and target ---------------------------------------------------
# Cabin values look like "B/0/P" (presumably deck/number/side - confirm against
# the data description). The raw string is nearly unique per row, so only the
# first and last parts are kept as categorical features.
cabin_parts = df['Cabin'].str.split('/', expand=True)

numeric_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
binary_cols = ['CryoSleep', 'VIP']
categorical_cols = ['HomePlanet', 'Destination', 'Deck', 'Side']

y = df['Transported']
X = (
    df.drop(columns=['Transported', 'Cabin'])
    .assign(Deck=cabin_parts[0], Side=cabin_parts[2])
    # CryoSleep / VIP are object columns holding True/False/NaN; casting to
    # float gives 1.0/0.0/NaN so they can be imputed like any numeric column.
    .astype({col: float for col in binary_cols})
)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# --- Preprocessing ---------------------------------------------------------
# Imputation statistics, scaling parameters and category lists are learned
# from the training rows only, because the transformer is fitted inside the
# pipeline below.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', Pipeline([
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler()),  # SVM, KNN and logistic regression are scale-sensitive
        ]), numeric_cols),
        ('binary', SimpleImputer(strategy='most_frequent'), binary_cols),
        ('categorical', Pipeline([
            ('impute', SimpleImputer(strategy='most_frequent')),
            ('encode', OneHotEncoder(handle_unknown='ignore')),
        ]), categorical_cols),
    ],
    sparse_threshold=0,  # always return a dense array; GaussianNB rejects sparse input
)

# --- Individual classifiers ------------------------------------------------
log_reg = LogisticRegression(max_iter=1000)
# NOTE: despite the variable name and label, this is a random forest; the
# names are kept so any later cell referring to them still works.
decision_tree = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
svm = SVC(probability=True, random_state=RANDOM_STATE)  # probability=True is required for soft voting
gbm = GradientBoostingClassifier(random_state=RANDOM_STATE)
naive_bayes = GaussianNB()
knn = KNeighborsClassifier()

# Create a voting classifier with the individual classifiers
ensemble_clf = VotingClassifier(estimators=[
    ('logistic_regression', log_reg),
    ('decision_tree', decision_tree),
    ('svm', svm),
    ('gradient_boosting', gbm),
    ('naive_bayes', naive_bayes),
    ('knn', knn)
], voting='soft')  # 'soft' voting calculates the average of probabilities

# --- Train, predict, evaluate ----------------------------------------------
model = Pipeline([
    ('preprocess', preprocessor),
    ('ensemble', ensemble_clf),
])
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Ensemble Classifier Accuracy:", accuracy)