In [1]:
!pip install scikit-learn
!pip install tpot



In [7]:
from sklearn.preprocessing import LabelEncoder  
from tpot import TPOTClassifier
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


In [3]:
# Load the raw training and test tables from the working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print(train_data.columns)
print(train_data.head())

# print which columns have missing data (as a percentage of all rows).
# isnull().mean() divides by the total number of rows; dividing by count()
# would divide by the number of NON-null rows and overstate the share
# (a mostly-empty column would report more than 100%).
print(100 * train_data.isnull().mean())

# remove cabin, name: useless
train_data = train_data.drop(['Cabin', 'Name'], axis=1)

# fill missing age data with mean value
# Assign the result back rather than calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated in pandas and does
# not update the frame when Copy-on-Write is enabled.
# NOTE(review): the mean is computed over all rows before the train/validation
# split is made in a later cell -- confirm this is acceptable.
mean_age = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(mean_age)

# remove ticket information: cant use this text information
train_data = train_data.drop(['Ticket'], axis=1)

# Separate the target column from the feature columns.
# NOTE(review): 'PassengerId' remains in X as a feature; it looks like a row
# identifier -- confirm whether it should be dropped.
y = train_data['Survived']
X = train_data.drop('Survived', axis=1)

# Encode the categorical text columns as integers. Each column gets its own
# fitted encoder (kept in `label_encoders`) so the same mapping can later be
# applied to other data; re-fitting one shared encoder would keep only the
# mapping of the last column.
# NOTE(review): any missing values in 'Embarked' are passed to the encoder
# as-is -- check the missing-data report above and impute if needed.
label_encoders = {}
for column in ['Sex', 'Embarked']:
    le = LabelEncoder()
    X[column] = le.fit_transform(train_data[column])
    label_encoders[column] = le


Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0

In [4]:
# Quick look at the encoded feature matrix.
print(X.head())

# Hold out 20% of the rows as a validation set; the fixed seed keeps the
# split the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


   PassengerId  Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0            1       3    1  22.0      1      0   7.2500         2
1            2       1    0  38.0      1      0  71.2833         0
2            3       3    0  26.0      0      0   7.9250         2
3            4       1    0  35.0      1      0  53.1000         2
4            5       3    1  35.0      0      0   8.0500         2


In [19]:
import time

# Time the whole cell (pipeline search, prediction and scoring).
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()

# Search for a pipeline with TPOT, scoring candidates by 5-fold CV accuracy
# on the training split.
tpot_classifier = TPOTClassifier(generations=10, population_size=100, random_state=42, verbosity=2, scoring='accuracy', cv=5)
tpot_classifier.fit(X_train, y_train)

# predict on the held-out validation split (X_val), not on test.csv
y_pred = tpot_classifier.predict(X_val)

# Score the validation predictions against the true labels.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

                                                                               
Generation 1 - Current best internal CV score: 0.8286023835319611
                                                                               
Generation 2 - Current best internal CV score: 0.8286023835319611
                                                                               
Generation 3 - Current best internal CV score: 0.8286023835319611
                                                                               
Generation 4 - Current best internal CV score: 0.8286023835319611
                                                                               
Generation 5 - Current best internal CV score: 0.8300206835418104
                                                                               
Generation 6 - Current best internal CV score: 0.8300206835418104
                                                                               
Generation 7 - Current best internal CV scor

In [18]:
# Baseline model: a random forest with fixed hyperparameters, trained on the
# training split.
random_forest_classifier = RandomForestClassifier(
    n_estimators=37,
    max_depth=10,
    random_state=42,
)
random_forest_classifier.fit(X_train, y_train)

# Predict labels for the held-out validation split.
y_pred = random_forest_classifier.predict(X_val)

# Validation metrics for the baseline.
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

# Emit the report in one call; each item lands on its own line.
print(
    f"Accuracy: {accuracy}",
    f"F1 Score: {f1}",
    "Confusion Matrix:",
    conf_matrix,
    sep="\n",
)

Accuracy: 0.8212290502793296
F1 Score: 0.7714285714285715
Confusion Matrix:
[[93 12]
 [20 54]]
