In [45]:
#%%writefile titanic-pre.py

# %load titanic-pre.py
# Data Preprocessing Template


# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Codes identifying how missing values in a column are to be handled
(
    missing_values_not_applicable,
    missing_values_drop_rows,
    missing_values_fill_mean,
    missing_values_drop_column,
    missing_values_not_decided,
) = range(5)

# Load the per-column override table, the training data and the verification data
preprocessing_override = pd.read_csv('preprocessing_override.csv')
dataset_X = pd.read_csv('train.csv')
dataset_X_verify = pd.read_csv('test.csv')

# Split the target column off the training features; the override table
# carries the same column, so discard it there as well
dataset_y = dataset_X.pop('Survived')
preprocessing_override.pop('Survived')

def preprocess_ind(dataset, override):
    """Decide, column by column, how *dataset* should be preprocessed.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The feature table to inspect.
    override : pandas.DataFrame
        Table with the same column order as *dataset*; only its first row is
        read. A value of 1 in a column prevents category encoding of that
        column, a value of 2 prevents the column from being dropped.

    Returns
    -------
    list
        Four lists, each with one entry per column of *dataset*:
        ``[category_encoding, missing_value_strategy, drop_column_strategy,
        normalize_strategy]``. The missing-value entries are integer codes
        (0 = no missing values, 1 = drop rows, 2 = fill with mean,
        3 = drop column, 4 = undecided); the other lists hold booleans.
    """
    # Ratio of distinct values to non-null rows at or below which a column is
    # treated as categorical, and at or above which it is considered too
    # unique to be useful.
    unique_identification_cutoff = 0.01
    drop_column_cutoff = 0.75

    # Values recognised in the override table.
    encoding_override_ind = 1
    dropcolumn_override_ind = 2

    # Missing-value strategy codes emitted by this function.
    strategy_not_applicable = 0
    strategy_drop_rows = 1
    strategy_fill_mean = 2
    strategy_drop_column = 3
    strategy_not_decided = 4

    num_rows, num_columns = dataset.shape
    null_counts = dataset.isnull().sum()
    encoding_override = list((override == encoding_override_ind).values)[0]
    drop_column_override = list((override == dropcolumn_override_ind).values)[0]

    category_encoding = []
    missing_value_strategy = []
    drop_column_strategy = []
    normalize_strategy = []

    for i in range(num_columns):
        column = dataset.iloc[:, i]
        # Positional access: the null counts are indexed by column label.
        null_count = null_counts.iloc[i]
        percentage_missing_values = 100 * null_count / num_rows

        # Null values must not count towards the number of distinct values.
        non_null_values = column[column.notnull()]
        non_null_count = num_rows - null_count

        # disparate_data_index is the ratio of distinct values to non-null
        # rows: 1 means every value differs, values near 0 mean few distinct
        # values. An all-null column has no defined ratio.
        if non_null_count:
            disparate_data_index = len(non_null_values.unique()) / non_null_count
        else:
            disparate_data_index = float("nan")

        # A column is a candidate for category encoding only when it has few
        # distinct values and the override table does not forbid it.
        is_category = not (
            disparate_data_index > unique_identification_cutoff
            or encoding_override[i]
        )
        category_encoding.append(is_category)

        number_datatype = column.dtype in ('int64', 'float64')

        if null_count == 0:
            strategy = strategy_not_applicable
        elif percentage_missing_values > 50:
            # Too sparse to repair: remove the whole column.
            strategy = strategy_drop_column
        elif percentage_missing_values < 5:
            # Only a few gaps: remove the affected rows.
            strategy = strategy_drop_rows
        elif number_datatype:
            strategy = strategy_fill_mean
        else:
            # Non-numeric column with a moderate share of gaps.
            strategy = strategy_not_decided
        missing_value_strategy.append(strategy)

        # Columns whose values are (almost) all distinct are dropped unless
        # the override table protects them.
        should_drop = not (
            disparate_data_index < drop_column_cutoff
            or drop_column_override[i]
        )
        drop_column_strategy.append(should_drop)

        # Normalise only numeric columns that are kept and not encoded. The
        # strategy of *this* column is tested here; comparing the whole list
        # to the code would never match.
        normalize_strategy.append(
            number_datatype
            and not is_category
            and strategy != strategy_drop_column
            and not should_drop
        )

    return [
        category_encoding,
        missing_value_strategy,
        drop_column_strategy,
        normalize_strategy,
    ]
    
# Derive the per-column preprocessing decisions from the training features
preprocess_list = preprocess_ind(dataset_X, preprocessing_override)

category_encoding_ind = preprocess_list[0]
missing_values_strategy = preprocess_list[1]
drop_column_strategy = preprocess_list[2]
normalize_strategy = preprocess_list[3]


# Work on plain arrays from here on; X_verify goes through the same column
# operations as X so both keep the same column layout
y = dataset_y.iloc[:].values
X = dataset_X.iloc[:, :].values
X_verify = dataset_X_verify.iloc[:, :].values


# Taking care of missing data by dropping the rows

missing_columns_drop_rows = np.flatnonzero(np.array(missing_values_strategy) == missing_values_drop_rows)

for i in missing_columns_drop_rows:
    indices_of_empty_rows = np.where(pd.isnull(X[:, i]))[0]

    X = np.delete(X, indices_of_empty_rows, axis=0)
    y = np.delete(y, indices_of_empty_rows, axis=0)

    # The verification array is X_verify; X_test is not defined until the
    # train/test split that runs after this script
    indices_of_empty_rows_test = np.where(pd.isnull(X_verify[:, i]))[0]
    X_verify = np.delete(X_verify, indices_of_empty_rows_test, axis=0)


# Taking care of missing data by filling with mean values

missing_columns_tobe_filled_with_mean = np.flatnonzero(np.array(missing_values_strategy) == missing_values_fill_mean)

# `Imputer` was removed from scikit-learn in 0.22; use its replacement when
# available and fall back to the old class on older installs
try:
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
except ImportError:
    from sklearn.preprocessing import Imputer
    imputer = Imputer(missing_values = 'NaN', strategy = 'mean', axis = 0)

for i in missing_columns_tobe_filled_with_mean:
    # Learn the mean from the training column only and apply that same mean
    # to both arrays
    imputer = imputer.fit(X[:, i:i+1])
    X[:, i:i+1] = imputer.transform(X[:, i:i+1])
    X_verify[:, i:i+1] = imputer.transform(X_verify[:, i:i+1])


# Taking care of missing data by dropping columns

missing_columns_drop_column = np.flatnonzero(np.array(missing_values_strategy) == missing_values_drop_column)

X = np.delete(X, missing_columns_drop_column, axis=1)
X_verify = np.delete(X_verify, missing_columns_drop_column, axis=1)

# Keep every per-column decision list aligned with the remaining columns of X

missing_values_strategy = list(np.delete(np.array(missing_values_strategy), missing_columns_drop_column, axis=0))
category_encoding_ind = list(np.delete(np.array(category_encoding_ind), missing_columns_drop_column, axis=0))
drop_column_strategy = list(np.delete(np.array(drop_column_strategy), missing_columns_drop_column, axis=0))
normalize_strategy = list(np.delete(np.array(normalize_strategy), missing_columns_drop_column, axis=0))


# Drop the columns that preprocessing marked as unlikely to be relevant

drop_column = np.flatnonzero(np.array(drop_column_strategy, dtype=bool))

X = np.delete(X, drop_column, axis=1)
X_verify = np.delete(X_verify, drop_column, axis=1)

# Again keep the decision lists aligned with X; every list, including
# drop_column_strategy itself, is pruned with the indices just removed

missing_values_strategy = list(np.delete(np.array(missing_values_strategy), drop_column, axis=0))
category_encoding_ind = list(np.delete(np.array(category_encoding_ind), drop_column, axis=0))
normalize_strategy = list(np.delete(np.array(normalize_strategy), drop_column, axis=0))
drop_column_strategy = list(np.delete(np.array(drop_column_strategy), drop_column, axis=0))

# An empty list here means no column survived the two drop steps
if not drop_column_strategy:
    print ("something wrong !")

# Columns with few distinct values are treated as categories: label-encode
# them, then one-hot encode the result

category_encoding_columns = np.flatnonzero(np.array(category_encoding_ind, dtype=bool))

# Test the number of indices, not their truthiness: index 0 is a valid column
if category_encoding_columns.size > 0:
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    labelencoder_X = LabelEncoder()

    for i in category_encoding_columns:
        X[:, i] = labelencoder_X.fit_transform(X[:, i])

        # Rows with a missing category cannot be encoded, so remove them from
        # the verification array before applying the labels learned on X
        X_verify_drop_rows = np.where(pd.isnull(X_verify[:, i]))[0]
        if X_verify_drop_rows.size > 0:
            X_verify = np.delete(X_verify, X_verify_drop_rows, axis=0)
        X_verify[:, i] = labelencoder_X.transform(X_verify[:, i])

    X_extract = X[:, category_encoding_columns]
    X_verify_extract = X_verify[:, category_encoding_columns]
    # Encoding every column is the default, so no `categorical_features`
    # argument is passed (that argument no longer exists in scikit-learn >= 0.22)
    onehotencoder = OneHotEncoder()
    X_extract_encoded = onehotencoder.fit_transform(X_extract).toarray()
    # Only transform here: refitting on the verification data could yield a
    # different set of output columns than the training data
    X_verify_extract_encoded = onehotencoder.transform(X_verify_extract).toarray()

    # Replace the original category columns by their encoded form, appended
    # at the end of the arrays
    X = np.delete(X, category_encoding_columns, axis=1)
    X_verify = np.delete(X_verify, category_encoding_columns, axis=1)
    category_encoding_ind = list(np.delete(np.array(category_encoding_ind), category_encoding_columns, axis=0))
    normalize_strategy = list(np.delete(np.array(normalize_strategy), category_encoding_columns, axis=0))

    X = np.c_[X, X_extract_encoded]
    X_verify = np.c_[X_verify, X_verify_extract_encoded]

# For numeric columns, scale the values appropriately

from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
normalize_strategy_columns = np.flatnonzero(np.array(normalize_strategy, dtype=bool))
for i in normalize_strategy_columns:

    X[:, [i]] = sc_X.fit_transform(X[:, i].reshape(-1, 1))

    # Verification rows that still hold a missing value in this column are
    # removed before the scaling learned on X is applied
    X_verify_drop_rows = np.where(pd.isnull(X_verify[:, i]))[0]
    if X_verify_drop_rows.size > 0:
        X_verify = np.delete(X_verify, X_verify_drop_rows, axis=0)

    X_verify[:, [i]] = sc_X.transform(X_verify[:, i].reshape(-1, 1))

print ("All OK")

All OK




In [46]:
dataset_X_verify.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [64]:
# Splitting the dataset into the Training set and Test set
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the same
# function lives in `sklearn.model_selection`, so try that first
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)


# Fitting a logistic regression model to the training split
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix and the per-class precision/recall report
from sklearn.metrics import confusion_matrix, classification_report
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print (cm)
print (cr)
#for i in range(len(y_pred)):
#    print ("True y value : ", y_test[i],"Predicted y values : ", y_pred[i])

[[110  22]
 [ 31  60]]
             precision    recall  f1-score   support

          0       0.78      0.83      0.81       132
          1       0.73      0.66      0.69        91

avg / total       0.76      0.76      0.76       223



In [68]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-neighbour model (Minkowski metric, p = 2) on the training split
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Predict the held-out split, then summarise the result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)


[[111  21]
 [ 33  58]]
             precision    recall  f1-score   support

          0       0.77      0.84      0.80       132
          1       0.73      0.64      0.68        91

avg / total       0.76      0.76      0.75       223



In [62]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Fit a support vector classifier with a linear kernel on the training split
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out split, then summarise the result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

print(cm)
print(cr)


[[110  22]
 [ 31  60]]
             precision    recall  f1-score   support

          0       0.78      0.83      0.81       132
          1       0.73      0.66      0.69        91

avg / total       0.76      0.76      0.76       223



In [72]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

# Fit a support vector classifier with an RBF kernel on the training split
classifier = SVC(kernel='rbf', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out split
y_pred = classifier.predict(X_test)

# Accuracy on the training split itself, as a percentage with two decimals
training_score = classifier.score(X_train, y_train)
acc_svc_kernel = round(training_score * 100, 2)
print(acc_svc_kernel)

# Confusion matrix and per-class report for the held-out split
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)

84.83
[[112  20]
 [ 27  64]]
             precision    recall  f1-score   support

          0       0.81      0.85      0.83       132
          1       0.76      0.70      0.73        91

avg / total       0.79      0.79      0.79       223



In [60]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian naive Bayes model on the training split
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predict the held-out split, then summarise the result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)


[[103  29]
 [ 25  66]]
             precision    recall  f1-score   support

          0       0.80      0.78      0.79       132
          1       0.69      0.73      0.71        91

avg / total       0.76      0.76      0.76       223



In [58]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree that splits on the entropy criterion
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out split, then summarise the result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)

[[109  23]
 [ 29  62]]
             precision    recall  f1-score   support

          0       0.79      0.83      0.81       132
          1       0.73      0.68      0.70        91

avg / total       0.77      0.77      0.77       223



In [70]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fit a forest of 10 entropy-criterion trees on the training split
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out split, then summarise the result
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print(cm)
print(cr)

[[111  21]
 [ 32  59]]
             precision    recall  f1-score   support

          0       0.78      0.84      0.81       132
          1       0.74      0.65      0.69        91

avg / total       0.76      0.76      0.76       223



In [None]:
# Visualising the Training set and Test set results
from matplotlib.colors import ListedColormap


def _plot_decision_regions(model, X_set, y_set, title):
    """Plot the regions predicted by *model* over the first two columns of X_set.

    A grid with step 0.01 is laid over the range of columns 0 and 1 (padded
    by 1 on each side), every grid point is classified with
    ``model.predict`` and the points of *X_set* are drawn on top, coloured by
    their value in *y_set*. The figure is titled *title* and shown.
    """
    # NOTE(review): predict() receives exactly two features per grid point, so
    # this presumably only works when the model was fitted on two-column
    # data - confirm against the shape of X_train.
    colours = ListedColormap(('red', 'green'))
    X1, X2 = np.meshgrid(
        np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
        np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
    grid_points = np.array([X1.ravel(), X2.ravel()]).T
    plt.contourf(X1, X2, model.predict(grid_points).reshape(X1.shape),
                 alpha = 0.75, cmap = colours)
    plt.xlim(X1.min(), X1.max())
    plt.ylim(X2.min(), X2.max())
    for i, j in enumerate(np.unique(y_set)):
        # `color` takes one RGBA colour for the whole group; `c` would treat
        # the four numbers as candidate per-point values
        plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                    color = colours(i), label = j)
    plt.title(title)
    # NOTE(review): title and axis labels look like template leftovers -
    # verify they match the fitted model and the first two columns of X.
    plt.xlabel('Age')
    plt.ylabel('Estimated Salary')
    plt.legend()
    # Show each figure explicitly so both plots appear outside a notebook too
    plt.show()


X_set, y_set = X_train, y_train
_plot_decision_regions(classifier, X_set, y_set, 'Logistic Regression (Training set)')

X_set, y_set = X_test, y_test
_plot_decision_regions(classifier, X_set, y_set, 'Logistic Regression (Test set)')
