In [23]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import SGDClassifier

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributes=[]):
        self.attributes = attributes
        return None
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attributes]

def getAttributes(data, includeTypes=None, excludeTypes=None):
    return list(data.select_dtypes(include=includeTypes, exclude=excludeTypes).columns)

def readData(filename):
    return pd.read_csv(filename)

def dropData(data, toDrop=[]):
    data = data.drop(columns=toDrop)
    return data

def cleanData(data, toFill=[]):
    toFill = {name: data[name].mode for name in toFill}
    data = data.fillna(toFill)
    return data

dataset = readData("train.csv")

dropList = ["Cabin", "Ticket", "Name", "PassengerId"]
fillList = ["Embarked", "Sex"]
dataset = dropData(data=dataset, toDrop=dropList)
dataset = cleanData(data=dataset, toFill=fillList)
numCols = dataset.shape[1]

np.random.seed(42)

#Split the datset into train and test, then into features and labels
dataTrain, dataTest = train_test_split(dataset, test_size=.2)
labelsTrain = dataTrain.iloc[:, 0:1]
dataTrain = dataTrain.iloc[:, 1:]
labelsTest = dataTest.iloc[:, 0:0]
dataTest = dataTest.iloc[:, 1:]

numAttr = getAttributes(dataTrain, excludeTypes="object")
catAttr = getAttributes(dataTrain, includeTypes="object")

numPipeline = Pipeline([
    ('selector', DataFrameSelector(numAttr)),
    ('imputer', SimpleImputer(strategy="median")),
    ('standarization', StandardScaler())
])

catPipeline = Pipeline([
    ('selector', DataFrameSelector(catAttr)),
    ('Encoder', OneHotEncoder(sparse=False, categories="auto"))
])

fullPipeline = FeatureUnion(transformer_list=[
    ('numPipeline', numPipeline),
    ('catPipeline', catPipeline)
])

num = numPipeline.fit_transform(dataTrain)
#cat = catPipeline.fit_transform(dataTrain)
dataTrain.info()

"""
dataTrainPrepared = fullPipeline.fit_transform(dataTrain)
dataTestPrepared = fullPipeline.fit_transform(dataTest)

sgd = SGDClassifier(random_state=42)
sgd.fit(dataTrainPrepared, labelsTrain)
print(sgd.score(dataTestPrepared, labelsTest))
"""

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 331 to 102
Data columns (total 7 columns):
Pclass      712 non-null int64
Sex         712 non-null object
Age         572 non-null float64
SibSp       712 non-null int64
Parch       712 non-null int64
Fare        712 non-null float64
Embarked    712 non-null object
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB


'\ndataTrainPrepared = fullPipeline.fit_transform(dataTrain)\ndataTestPrepared = fullPipeline.fit_transform(dataTest)\n\nsgd = SGDClassifier(random_state=42)\nsgd.fit(dataTrainPrepared, labelsTrain)\nprint(sgd.score(dataTestPrepared, labelsTest))\n'

In [17]:
dataTrain["Embarked"] = dataTrain["Embarked"].apply(str)

In [19]:
dataTrain["Embarked"].value_counts()

S                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             525
C                                                                                                                                                                                                                                                                                                                     

In [24]:
labelsTrain

Unnamed: 0,Survived
331,0
733,0
382,0
704,0
813,0
118,0
536,0
361,0
29,0
55,1
