In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter

TITANIC_PATH = os.path.join('dataset')

def load_data(filename):
    csv_path = os.path.join(TITANIC_PATH, filename)
    return pd.read_csv(csv_path)

train_data = load_data('train.csv')
test_data = load_data('test.csv')


In [2]:
final_index = test_data['PassengerId']

In [3]:
train_labels = train_data['Transported']
train_features = train_data.copy()


In [4]:
from sklearn.impute import SimpleImputer

def numericalImputer(features):
    num_columns = ['Age', 'RoomService', 'ShoppingMall', 'FoodCourt', 'Spa', 'VRDeck']
    num_features = features[num_columns]

    imputer = SimpleImputer(strategy='median')
    X = imputer.fit_transform(num_features)
    num_features = pd.DataFrame(X, columns=num_columns, index=num_features.index)
    total_expense = num_features['RoomService'] + num_features['ShoppingMall'] + num_features['FoodCourt'] + num_features['Spa'] + num_features['VRDeck']
    features.insert(0, 'expense', total_expense)

    for i in num_columns:
        del features[i]
        features.insert(0, i, num_features[i])



In [5]:
# from the passengerID we can see what group the passenger belongs to, then can fill the null values for Home Planet and Destination
    #DONE, sorted the Home Planet by the group and Destination by Home Planet and whether the person was in cyro.
# can use the amount the peron used to fill VIP or in which group
    #DONE
# cryosleep can be determined how much they use
    #DONE, if expense is 0 then assume they are True, else False
# cabin can be determined from the group they reside in
# name prob just drop for now

In [6]:
def isCryo(features):
    indices = np.where(features['expense'] == 0)[0]
    X = features.copy()
    for i in indices:
        if pd.isnull(X.loc[i, 'CryoSleep']):
            X.loc[i,'CryoSleep'] = True

    X['CryoSleep'].fillna(value = False, inplace=True)

    return X

In [7]:
def fillVip(features):
    X = features.copy()

    money = ['RoomService', 'ShoppingMall', 'FoodCourt', 'Spa', 'VRDeck']
    storage = pd.DataFrame(columns=money)
    e = []
    change = []
    for i in range(len(features)):
        if pd.isnull(X.loc[i, 'VIP']):
            change.append(i)
        elif X.loc[i,'VIP']:
            e.append(X.loc[i,'expense'])
            add = [X.loc[i,x] for x in money]
            storage.loc[-1] = add
            storage.index = storage.index + 1
            storage = storage.sort_index()


    decision = np.mean(e)
    expensive_food = storage['FoodCourt'].mean()
    expensive_VR = storage['VRDeck'].mean()
    expensive_Spa = storage['Spa'].mean()
    for i in change:
        if X.loc[i, 'expense'] >= decision:
            X.loc[i, 'VIP'] = True
        elif X.loc[i, 'CryoSleep']:
            X.loc[i, 'VIP'] = False
        elif X.loc[i, 'FoodCourt'] > expensive_food:
            X.loc[i, 'VIP'] = True
        elif X.loc[i, 'VRDeck'] > expensive_VR:
            X.loc[i,'VIP'] = True
        elif X.loc[i, 'Spa'] > expensive_Spa:
            X.loc[i,'VIP'] = True
        else:
            X.loc[i, 'VIP'] = False
        
        
    
    return X

In [8]:
import re

def separateId(features):
    alone = np.zeros(len(features), dtype='int16')
    group = np.zeros(len(features), dtype='int16')
    idx = 0
    for info in features['PassengerId']:
        identification = re.split('_', info)
        group[idx] = int(identification[0])
        if int(identification[1]) == 1:
            alone[idx] = 1
        idx += 1
    
    features.insert(0,'group',group.tolist())
    features.insert(1, 'alone', alone.tolist())
    del features['PassengerId']

In [9]:
def separateCabin(features):
    deck = []
    num = []
    side = []
    for info in features['Cabin']:
        if pd.notnull(info):
            cabin = re.split('/', info)
            deck.append(cabin[0])
            num.append(int(cabin[1]))
            if(cabin[2] == 'S'):
                side.append(1)
            else:
                side.append(0)
        else:
            deck.append(None)
            num.append(None)
            side.append(None)
    features.insert(0, 'deck', deck)
    features.insert(1, 'num', num)
    features.insert(2, 'side', side)
    del features['Cabin']

In [10]:
def homeGuess(features, i):
    destination = features.loc[i, 'Destination']
    cyro = features.loc[i, 'CryoSleep']

    if pd.notnull(features.loc[i, 'deck']) and pd.isnull(features.loc[i, 'HomePlanet']):
        if features.loc[i, 'deck'] == 'G':
            return 'Earth'
        elif features.loc[i, 'deck'] == 'B' or features.loc[i, 'deck'] == 'C' or features.loc[i, 'deck'] == 'A':
            return 'Europa'
    elif pd.notnull(destination) and pd.notnull(cyro):
        if destination == 'TRAPPIST-1e' and not cyro:
            return 'Earth'
        elif destination == '55 Cancri e' and not cyro:
            return 'Earth'
        elif destination == 'PSO J318.5-22' and not cyro:
            return 'Mars'
        elif destination == 'TRAPPIST-1e' and cyro:
            return 'Mars'
        elif destination == '55 Cancri e' and cyro:
            return 'Europa'
        elif destination == 'PSO J318.5-22' and cyro:
            return 'Earth'
    
    return 'Earth'

def destinationGuess(features, i):
    home = features.loc[i, 'HomePlanet']
    cyro = features.loc[i, 'CryoSleep']
    if pd.notnull(features.loc[i,'deck']):
        if features.loc[i, 'deck'] == 'F'or features.loc[i, 'deck'] == 'E':
            return 'TRAPPIST-1e'
    elif pd.notnull(home) and pd.notnull(cyro):
        if home == 'Earth' and not cyro:
            return 'TRAPPIST-1e'
        elif home == 'Mars' and not cyro:
            return 'PSO J318.5-22'
        elif home == 'Europa' and not cyro:
            return 'TRAPPIST-1e'
        elif home == 'Earth' and cyro:
            return 'PSO J318.5-22'
        elif home == 'Mars' and cyro:
            return '55 Cancri e'
        elif home == 'Europa' and cyro:
            return '55 Cancri e'
    
    return 'TRAPPIST-1e'


In [11]:

def findHomeDestination(features):
    X = features.copy()
    info1 = {}
    info2 = {}
    change1 = []
    change2 = []
    home = ['Earth', 'Europa', 'Mars']
    dest = ['TRAPPIST-1e', '55 Cancri e', 'PSO J318.5-22']
    for i in range(len(features)):
        if(pd.notnull(X.loc[i, 'HomePlanet'])):
            if(X['group'][i] in info1):
                if X.loc[i, 'HomePlanet'] == 'Earth':
                    info1[X['group'][i]][0] += 1
                elif X.loc[i, 'HomePlanet'] == 'Europa':
                    info1[X['group'][i]][1] += 1
                else:
                    info1[X['group'][i]][2] += 1
            else:
                info1[X['group'][i]] = [0,0,0]
                if X.loc[i, 'HomePlanet'] == 'Earth':
                    info1[X['group'][i]][0] += 1
                elif X.loc[i, 'HomePlanet'] == 'Europa':
                    info1[X['group'][i]][1] += 1
                else:
                    info1[X['group'][i]][2] += 1
        else:
            change1.append(i)

        if(pd.notnull(X.loc[i, 'Destination'])):
            if(X.loc[i, 'Destination'] in info2):
                if X.loc[i, 'Destination'] == 'TRAPPIST-1e':
                    info2[X['group'][i]][0] += 1
                elif X.loc[i, 'Destination'] == '55 Cancri e':
                    info2[X['group'][i]][1] += 1
                else:
                    info2[X['group'][i]][2] += 1
            else:
                info2[X['group'][i]] = [0,0,0]
                if X.loc[i, 'Destination'] == 'TRAPPIST-1e':
                    info2[X['group'][i]][0] += 1
                elif X.loc[i, 'Destination'] == '55 Cancri e':
                    info2[X['group'][i]][1] += 1
                else:
                    info2[X['group'][i]][2] += 1
        else:
            change2.append(i)
 

    for i in change1:
        if X['group'][i] in info1:
            y1 = info1[X['group'][i]]
            X.loc[i, 'HomePlanet'] = home[y1.index(max(y1))]
        else:
            retval = homeGuess(X, i)
            if retval in home:
                X.loc[i, 'HomePlanet'] = retval

    for i in change2:
        if X['group'][i] in info2:
            y2 = info2[X['group'][i]]
            X.loc[i, 'Destination'] = dest[y2.index(max(y2))]
        else:
            X.loc[i, 'Destination'] = destinationGuess(X, i)

    return X


In [12]:
# m = ['true' if x else 'false' for x in cat_features['CryoSleep']]

# m = np.array(m)

# newCol = Counter(g1_features['HomePlanet'] + g1_features['Destination'] + m)
# print(newCol)

# print(newCol['EarthTRAPPIST-1efalse'] / (newCol['EarthTRAPPIST-1efalse'] + newCol['EarthTRAPPIST-1etrue']))
# print(newCol['MarsTRAPPIST-1efalse'] / (newCol['MarsTRAPPIST-1efalse'] + newCol['MarsTRAPPIST-1etrue']))
# print(newCol['EuropaTRAPPIST-1efalse']/ (newCol['EuropaTRAPPIST-1efalse'] + newCol['EuropaTRAPPIST-1etrue']))
# print(newCol['Earth55 Cancri efalse']/ (newCol['Earth55 Cancri efalse'] + newCol['Earth55 Cancri etrue']))
# print(newCol['Mars55 Cancri efalse'] / (newCol['Mars55 Cancri efalse'] + newCol['Mars55 Cancri etrue']))
# print(newCol['Europa55 Cancri efalse']/ (newCol['Europa55 Cancri etrue'] + newCol['Europa55 Cancri efalse']))
# print(newCol['EarthPSO J318.5-22false']/ (newCol['EarthPSO J318.5-22true'] + newCol['EarthPSO J318.5-22false']))
# print(newCol['MarsPSO J318.5-22false']/ (newCol['MarsPSO J318.5-22false'] + newCol['MarsPSO J318.5-22true']))
# print(newCol['EuropaPSO J318.5-22false']/ (newCol['EuropaPSO J318.5-22false'] + newCol['EuropaPSO J318.5-22true']))

In [13]:

def convertBool(features):
    result = features['CryoSleep'].astype(int)
    features['CryoSleep'] = result
    result = features['VIP'].astype(int)
    features['VIP'] = result

In [14]:
import random

def fillDeckSide(features):

    decks = ['A', 'B', 'C', 'D', 'E', 'F', 'G']

    for i in range(len(features)):
        if pd.isnull(features.loc[i,'side']):
            features.loc[i, 'side'] = random.randint(0,1)
        if pd.isnull(features.loc[i, 'deck']):
            selection = random.randint(0,6)
            features.loc[i, 'deck'] = decks[selection]

In [15]:
def makeRich(features):
    features['Rich'] = features['Spa'] + features['VRDeck'] + features['RoomService']

In [16]:
def isExpense(features):

    l = []
    l2 = []
    decision = features['expense'].mean()
    for i in range(len(features)):
        if features.loc[i, 'Rich'] > decision:
            l.append(1)
        else:
            l.append(0)
        if features.loc[i, 'Rich'] == 0 and not features.loc[i,'CryoSleep']:
            l2.append(1)
        else:
            l2.append(0)
    
    features['isExpense'] = l
    features['noSpending'] = l2

In [17]:
def getRoute(features):
    features['Route'] = features['HomePlanet'] + features['Destination'] + features['CryoSleep'].astype(str)

In [18]:
from sklearn.base import BaseEstimator, TransformerMixin

class dataFiller(BaseEstimator, TransformerMixin):
    def __init__ (self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        numericalImputer(X)
        X = isCryo(X)
        X = fillVip(X)
        separateId(X)
        separateCabin(X)
        X = findHomeDestination(X)
        getRoute(X)
        makeRich(X)
        isExpense(X)
        convertBool(X)
        fillDeckSide(X)
        del X['num']
        del X['Name']
        return X

In [19]:
n = dataFiller()
train_features = n.transform(train_features)
print(train_features.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   deck          8693 non-null   object 
 1   side          8693 non-null   float64
 2   group         8693 non-null   int64  
 3   alone         8693 non-null   int64  
 4   VRDeck        8693 non-null   float64
 5   Spa           8693 non-null   float64
 6   FoodCourt     8693 non-null   float64
 7   ShoppingMall  8693 non-null   float64
 8   RoomService   8693 non-null   float64
 9   Age           8693 non-null   float64
 10  expense       8693 non-null   float64
 11  HomePlanet    8693 non-null   object 
 12  CryoSleep     8693 non-null   int64  
 13  Destination   8693 non-null   object 
 14  VIP           8693 non-null   int64  
 15  Transported   8693 non-null   bool   
 16  Route         8693 non-null   object 
 17  Rich          8693 non-null   float64
 18  isExpense     8693 non-null 

In [20]:
model1 = train_features.copy()

In [21]:
# del model1['expense']
# del model1['ShoppingMall']
# # del model1['group']
# del model1['FoodCourt']
# del model1['deck']

In [22]:
c = model1.corr()
print(c['CryoSleep'].sort_values(ascending=False))

CryoSleep       1.000000
Transported     0.467230
side            0.017489
group          -0.004304
Age            -0.077806
alone          -0.081778
VIP            -0.093149
VRDeck         -0.197524
Spa            -0.203249
FoodCourt      -0.211061
ShoppingMall   -0.212977
noSpending     -0.226033
RoomService    -0.250173
isExpense      -0.319055
Rich           -0.337554
expense        -0.386081
Name: CryoSleep, dtype: float64


In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures

num_attribute = ['Rich', 'expense', 'group']
cat_attribute = ['HomePlanet', 'Route', 'isExpense', 'CryoSleep', 'noSpending', 'side']

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribute),
    ('cat', OneHotEncoder(), cat_attribute)
])


In [24]:
print(Counter(model1['Transported']))

Counter({True: 4378, False: 4315})


In [25]:
model_prepared = full_pipeline.fit_transform(model1)

In [26]:
# model_dataFrame = pd.DataFrame(model_prepared, columns=[i for i in range(16)])

In [27]:
# model_dataFrame.describe()

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import GradientBoostingClassifier

test_poly = [i for i in range(3,6)]
test_c = [0.5, 0.7, 1]

params = [
    {"kernel": ["rbf"], "C": test_c}
]
# params2 = [
#     {"weights" : ['uniform', 'distance'], "n_neighbors" : [3,4,5,6]}
# ]
# params3 = [
#     # try 12 (3×4) combinations of hyperparameters
#     {'n_estimators': [50, 75, 100, 125, 150], 'max_features': [4,5,6,7]},
#     # then try 6 (2×3) combinations with bootstrap set as False
#     {'bootstrap': [False], 'n_estimators': [50, 75, 100, 125, 150], 'max_features': [4,5,6,7]},
# ]
params4 =[
    {'penalty' : ['l2'], 'max_iter' : [300, 400, 625, 1000, 1500, 2000, 2500], 'C' : [0.05, 0.07, 0.09, 0.1, 0.2, 0.3, 0.4], 'dual': [False]}
]

# gnb = GaussianNB()
svc_clf = SVC()
linear = LinearSVC()
# knn = KNeighborsClassifier()
rf = RandomForestClassifier()
# dt = DecisionTreeClassifier()


In [29]:
# grid_search = GridSearchCV(boost, param_grid=params, cv=10)
grid_search2 = GridSearchCV(svc_clf, param_grid=params, cv=10)
# grid_search3 = GridSearchCV(rf, param_grid=params3, cv=10)
grid_search4 = GridSearchCV(linear, param_grid=params4, cv=10)

In [30]:
# grid_search.fit(model_prepared, train_labels)
grid_search2.fit(model_prepared, train_labels)
# grid_search3.fit(model_prepared, train_labels)
grid_search4.fit(model_prepared, train_labels)

In [31]:
grid_search2.best_params_

{'C': 1, 'kernel': 'rbf'}

In [32]:
grid_search4.best_params_

{'C': 0.3, 'dual': False, 'max_iter': 300, 'penalty': 'l2'}

In [33]:
svm_clf = SVC(**grid_search2.best_params_)
# # knn = KNeighborsClassifier(**grid_search2.best_params_)
# rf = RandomForestClassifier(**grid_search3.best_params_)
linear = LinearSVC(**grid_search4.best_params_)
# boost = GradientBoostingClassifier(**grid_search.best_params_)

# forest_score = cross_val_score(rf, model_prepared, train_labels, cv=10)
linear_score = cross_val_score(linear, model_prepared, train_labels, cv=10)
svm_scores = cross_val_score(svm_clf, model_prepared, train_labels, cv=10)
# knn_scores = cross_val_score(knn, model_prepared, train_labels, cv=10)
# gnb_scores = cross_val_score(gnb, model_prepared, train_labels, cv=10)
# boost_score = cross_val_score(boost, model_prepared, train_labels, cv=10)

print(svm_scores.mean())
# print(knn_scores.mean())
# print(gnb_scores.mean())
print(linear_score.mean())
# print(forest_score.mean())
# print(boost_score.mean())

0.7912178088171105
0.7919054799412721


In [34]:
from sklearn.model_selection import cross_val_predict

svm_pred = cross_val_predict(svm_clf, model_prepared, train_labels, cv=5)
# rf_pred = cross_val_predict(rf, model_prepared, train_labels, cv=5)
linear_pred = cross_val_predict(linear, model_prepared, train_labels, cv=5)

In [35]:
from sklearn.metrics import confusion_matrix
confusion_matrix(train_labels, svm_pred)

array([[3483,  832],
       [ 986, 3392]])

In [36]:
# confusion_matrix(train_labels, rf_pred)

In [37]:
confusion_matrix(train_labels, linear_pred)

array([[3228, 1087],
       [ 751, 3627]])

In [38]:
from sklearn.metrics import precision_score, recall_score, f1_score
print(precision_score(train_labels, svm_pred))
print(recall_score(train_labels, svm_pred))
print(f1_score(train_labels, svm_pred))


0.803030303030303
0.7747830059387848
0.7886538014415252


In [39]:
print(precision_score(train_labels, rf_pred))
print(recall_score(train_labels, rf_pred))
print(f1_score(train_labels, rf_pred))

NameError: name 'rf_pred' is not defined

In [None]:
from sklearn.metrics import roc_auc_score
print(roc_auc_score(train_labels, svm_pred))
print(roc_auc_score(train_labels, rf_pred))


0.7973508647207385
0.7772303792215052


In [None]:
test_data = n.transform(test_data)

In [None]:
test_prepared = full_pipeline.fit_transform(test_data)

In [None]:
svm_clf.fit(model_prepared, train_labels)

In [None]:
result = linear.predict(test_prepared)
print(Counter(result))

Counter({True: 2214, False: 2063})


In [None]:
submission = pd.DataFrame()
submission.insert(0, 'PassengerId', final_index)
submission.insert(1, 'Transported', result)

In [None]:
submission.to_csv('answers3.csv', index=False)