Names: Bornales, Amiel James; Santiago, JM; Yuvallos Jonn

In [70]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np

plt.style.use('ggplot')

# Fix the seed of the random number 
# generator so that your results will match ours
np.random.seed(5)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Get the datastuff

In [71]:
import pandas as pd

test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

#test.head(10)
#train.head(10)

### Visualize the datastuff

In [5]:
test.head(5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
print(train.shape, test.shape)

(891, 12) (418, 11)


# Cleaning the data

### Drop unneeded columns

In [72]:
del test['PassengerId']
del test['Ticket']
del test['Fare']
del test['Name']
del test['Cabin']

del train['PassengerId']
del train['Ticket']
del train['Fare']
del train['Name']
del train['Cabin']

### Check for tables with null values

In [73]:
for col in train.columns.values:
  if train[col].isnull().values.any():
    print("train: "+ col + " has null:", train[col].isnull().sum())
print()
for col in test.columns.values:
  if test[col].isnull().values.any():
    print("test: "+ col + " has null:", test[col].isnull().sum())
    


train: Age has null: 177
train: Embarked has null: 2

test: Age has null: 86


# Handling null values in the data

## Drop rows with null values
Though not recommended since theres a possibility of losing a lot of data, this is usually the easiest way of handling missing data.

In [74]:
train_null = train[pd.notnull(train['Age'])].copy()
train_null = train_null[pd.notnull(train_null['Embarked'])]

In [75]:
print(train_null.shape, train.shape)

(712, 7) (891, 7)


## Normalize the data
The models can't read characters or strings so we'll have to convert them into integer or float.

In [76]:
from sklearn.preprocessing import LabelEncoder
train_fill = train.copy()

encoder = LabelEncoder()
encoder.fit(train_null['Sex'])
train_null['Sex'] = encoder.transform(train_null['Sex'])

encoder.fit(train_null['Embarked'])
train_null['Embarked'] = encoder.transform(train_null['Embarked'])


# Normalize the Embarked into integer since the pandas median function doesnt accept characters/strings
# Convert null into 'NaN' string since the encoder doesnt accept null values

train_fill['Embarked'][pd.isnull(train_fill['Embarked'])] = 'NaN'
encoder.fit(train_fill['Embarked'])
train_fill['Embarked'] = encoder.transform(train_fill['Embarked'])

encoder.fit(train_fill['Sex'])
train_fill['Sex'] = encoder.transform(train_fill['Sex'])

train_null.head()
train_fill.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,1,22.0,1,0,2
1,1,1,0,38.0,1,0,0
2,1,3,0,26.0,0,0,2
3,1,1,0,35.0,1,0,2
4,0,3,1,35.0,0,0,2


# Populate null values
Since the Titanic Submission requires exactly 418 predictions, we cant just drop the rows containing null values. A solution for this is to replace the numerical values with the mean values, and the categorical values with the median. Another possible solution for this is to use regression to fill the values.

We will also be creating another model with the populated null values as the training data and compare the accuracy of the two models later on.

In [77]:
encoder = LabelEncoder()
encoder.fit(test['Sex'])
test['Sex'] = encoder.transform(test['Sex'])

encoder.fit(test['Embarked'])
test['Embarked'] = encoder.transform(test['Embarked'])

test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,1,34.5,0,0,1
1,3,0,47.0,1,0,2
2,2,1,62.0,0,0,1
3,3,1,27.0,0,0,2
4,3,0,22.0,1,1,2


In [78]:
from sklearn.preprocessing import Imputer


### Method 1.1: Mean Values
We will be using the mean values on the age since its a numerical value

In [79]:
#Initially used Imputer class, but turns out pandas has built in functions for this 
#imp = Imputer(strategy = 'mean', axis = 1)
#imp.fit(train_null['Age'])
#train_fill['Age'] = imp.transform(train['Age'].values.reshape(-1,1))

train_fill['Age'] = train_fill['Age'].fillna(train_fill['Age'].mean())
print(train_fill['Age'].isnull().sum())

0


### Method 1.2: Median Values

In [86]:


# At this point 'NaN' is considered as another category, and will not be considered as null later on, 
# we'll need to replace them with the actual Null value

# Look for the first index of null in the original dataset and use that as reference to find the category
# of the null value in the new dataset.
index = 0
for i in train['Embarked']:
    if pd.isnull(i):
        index = train_fill['Embarked'][index]
        break;
    else:
        index+=1
        
#Check if replace was success
train_fill['Embarked'].replace({index: np.nan}, regex=True, inplace = True)
print(train_fill['Embarked'].isnull().sum())

train_fill['Embarked'] = train_fill['Embarked'].fillna(train_fill['Embarked'].median())
print(train_fill['Embarked'].isnull().sum())

2
0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [92]:
train_fill.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,3.0
1,1,1,female,38.0,1,0,0.0
2,1,3,female,26.0,0,0,3.0
3,1,1,female,35.0,1,0,3.0
4,0,3,male,35.0,0,0,3.0


### Method 2: Regression

# Do some stuff with the train data
Split the training data to train and test to check for the accuracy

In [87]:
X_null = train_null.drop('Survived', axis=1)
y_null = train_null['Survived'].as_matrix()

X_fill = train_fill.drop('Survived', axis=1)
y_fill = train_fill['Survived'].as_matrix()


In [88]:
from sklearn.model_selection import train_test_split

# Build the model

### Model 1: Null values in the training data was removed

In [89]:
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X_null, y_null, test_size = 0.5)
#default is 2
min_samples_split = 4

#default is None
max_depth = 10

#default is None
min_impurity_split = .1

dtc = DecisionTreeClassifier(min_samples_split = min_samples_split, max_depth = max_depth, min_impurity_split = min_impurity_split)
dtc.fit(X_train,y_train)



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=0.1,
            min_samples_leaf=1, min_samples_split=4,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [90]:
pred = dtc.predict(X_test)
print(np.sum(pred == y_test)/len(y_test)*100)

80.3370786517


### Model 2: Null values in the training data was replaced with median and mode

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X_fill, y_fill, test_size = 0.5)

dtc = DecisionTreeClassifier(min_samples_split = min_samples_split, max_depth = max_depth, min_impurity_split = min_impurity_split)
dtc.fit(X_train,y_train)

ValueError: could not convert string to float: 'male'

In [73]:
pred = dtc.predict(test)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

In [74]:
pred.shape

(356,)