In [40]:
import numpy
from sklearn import datasets, utils
from sklearn.model_selection import cross_val_score
import pandas.io
from sklearn import tree, pipeline, preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.externals import joblib


#
# DECLARATION PART
#
# File the fitted pipeline is serialized to and later read back from.
PIPELINEPATH = "ser_pipeline.pickle"
# CSV file the data set is loaded from.
DATASETPATH = "titanic.csv"



# If the file has no header row with the column/attribute names, pass the
# constant None for row_with_column_names.
def readCsvToDataFrame(path, row_with_column_names="infer"):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path
        Anything ``pandas.read_csv`` accepts as its first argument (a file
        path or a file-like object).
    row_with_column_names
        Forwarded to ``pandas.read_csv`` as ``header``. The default,
        ``"infer"``, is the value ``read_csv`` uses when ``header`` is not
        given, so existing one-argument calls behave as before. Pass ``None``
        when the file has no header row; pass a row number to pick a specific
        row as the header.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    return pandas.read_csv(path, header=row_with_column_names)



def show_df_info(dataframe):
    """Print a structural summary of ``dataframe`` to standard output.

    The lines are emitted in this order: the type of the object, its
    ``size``, ``ndim``, ``shape``, ``axes``, ``dtypes`` and ``columns``.
    Nothing is returned.
    """
    # Type of the object that was handed in.
    print(type(dataframe))

    # One line per scalar property.
    print("Amount of entries is %s" % dataframe.size)
    print("Dimensions= %i" % dataframe.ndim)

    # Label and value share a line; sep="" keeps them directly adjacent.
    print("Shape is ", dataframe.shape, sep="")
    print("Axes: ", dataframe.axes, sep="")

    # The dtype listing starts on the line after its heading.
    print("Data types of columns:", dataframe.dtypes, sep="\n")
    print("Features: %s" % dataframe.columns)



def sliceDataFrame(df):
    """Split ``df`` into all-but-the-last column and the last column.

    Returns a 2-tuple: first the selection of every row with every column
    except the final one, then the final column on its own (selected with a
    scalar column position, so it comes back as a pandas Series).
    """
    # iloc takes (row selector, column selector); ":" keeps every row.
    leading_columns = df.iloc[:, :-1]
    last_column = df.iloc[:, -1]
    return leading_columns, last_column

def tonumeric(df):
    """Convert the cells of every column of ``df`` to numbers where possible.

    Each cell is passed through ``pandas.to_numeric`` individually. A cell
    that cannot be parsed is left as it was, so a column may end up holding
    a mix of numbers and untouched original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after all of its columns were processed
        (also when it has no columns at all).
    """
    def _to_numeric_or_keep(value):
        # Keep the original value when it cannot be parsed. This mirrors the
        # errors='ignore' mode of to_numeric without relying on that option,
        # which recent pandas releases deprecate.
        try:
            return pandas.to_numeric(value)
        except (ValueError, TypeError):
            return value

    for column in df.columns:
        df[column] = df[column].apply(_to_numeric_or_keep)
    # Return only once the loop is finished; returning from inside the loop
    # would stop after the first column.
    return df

def onehotencoding(df):
    """Return ``df`` with its object-dtype columns one-hot encoded.

    Columns whose dtype compares equal to ``object`` are handed to
    ``pandas.get_dummies`` with ``drop_first=True``. The result of that call
    is returned; the frame passed in is not reassigned or modified here.
    """
    # Boolean mask over the columns: True where the dtype is object.
    is_object_column = df.dtypes == object
    columns_to_encode = df.columns.values[is_object_column]
    return pandas.get_dummies(df, columns=columns_to_encode, drop_first=True)

#
# PROGRAM BODY
#

## PHASE 1: LOAD DATASET
# Read the CSV, convert cells to numbers where possible (CLEAN), then one-hot
# encode the columns that are still object-typed (ONE HOT ENCODING).
dataset = onehotencoding(tonumeric(readCsvToDataFrame(DATASETPATH)))
show_df_info(dataset)
print(dataset.head(5))

## PHASE 2: SLICE DATASET
# Everything but the last column goes to training_instances; the last column
# becomes class_labels.
training_instances, class_labels = sliceDataFrame(dataset)
show_df_info(training_instances)
# Preview the data: first rows of the instances, a blank line, then the labels.
print(training_instances.head(5), end="\n\n")
print(class_labels.head(5))


## PHASE 3: CREATE PIPELINE
# Two stages: standard scaling followed by a decision tree classifier.
cart_model = tree.DecisionTreeClassifier()
pipe = pipeline.Pipeline(
    steps=[
        ("scale", preprocessing.StandardScaler()),
        ("CART", cart_model),
    ]
)

## PHASE 4: TRAIN
# Fit all stages of the pipeline.
pipe.fit(training_instances, class_labels)

## PHASE 5: EVALUATE
# cross_val_score returns an array of scores, one per fold (cv=5).
scores = cross_val_score(pipe, training_instances, class_labels, cv=5)
# The average CV score is used as the quality metric.
meanCvAccuracy = scores.mean()
print("Mean CV accuracy= %f" % meanCvAccuracy)

## PHASE 6: SAVE PIPELINE
# The whole pipeline goes into one single file.
# NOTE(review): `joblib` is whatever the file-level import binds;
# `sklearn.externals.joblib` is absent from newer scikit-learn releases.
# Confirm against the installed version.
joblib.dump(pipe, PIPELINEPATH, compress=1)

## PHASE 7: LOAD THE PIPELINE
# Read the file back and deserialize the pipeline.
pipeline_loaded = joblib.load(PIPELINEPATH)

print("--- end of execution ---")

<class 'pandas.core.frame.DataFrame'>
Amount of entries is 8019
Dimensions= 2
Shape is (891, 9)
Axes: [RangeIndex(start=0, stop=891, step=1), Index(['PassengerId', 'Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Title',
       'FSize', 'Survived'],
      dtype='object')]
Data types of columns:
PassengerId      int64
Age            float64
Embarked         int64
Fare           float64
Pclass           int64
Sex              int64
Title            int64
FSize            int64
Survived       float64
dtype: object
Features: Index(['PassengerId', 'Age', 'Embarked', 'Fare', 'Pclass', 'Sex', 'Title',
       'FSize', 'Survived'],
      dtype='object')
   PassengerId   Age  Embarked     Fare  Pclass  Sex  Title  FSize  Survived
0            1  22.0         2   7.2500       3    1     12      1       0.0
1            2  38.0         0  71.2833       1    0     13      1       1.0
2            3  26.0         2   7.9250       3    0      9      0       1.0
3            4  35.0         2  53.1000    