In [2]:
import numpy as np
import pandas as pd

# Read both CSV files; row 0 of each file supplies the column names.
train, test = (pd.read_csv(csv_name, header=0) for csv_name in ("train.csv", "test.csv"))

# 2 is written into Survived for every row that came from test.csv.
test["Survived"] = 2

# Stack train on top of test. Row labels are not renumbered, so they repeat.
result = [train, test]
titanic = pd.concat(result)

In [3]:
# Number of rows in the combined frame.
titanic.shape[0]

1309

In [4]:
# Encode Sex as an integer: male -> 0, female -> 1.
titanic.loc[titanic["Sex"].eq("male"), "Sex"] = 0
titanic.loc[titanic["Sex"].eq("female"), "Sex"] = 1

# Fill gaps: Embarked gets "S" (its most common value), Fare gets the median fare.
titanic["Embarked"] = titanic["Embarked"].fillna("S")
titanic["Fare"] = titanic["Fare"].fillna(titanic["Fare"].median())

# Encode Embarked as an integer: S -> 0, C -> 1, Q -> 2.
titanic.loc[titanic["Embarked"].eq("S"), "Embarked"] = 0
titanic.loc[titanic["Embarked"].eq("C"), "Embarked"] = 1
titanic.loc[titanic["Embarked"].eq("Q"), "Embarked"] = 2

# Derived feature: FamilySize = SibSp + Parch + 1.
titanic["FamilySize"] = 1 + titanic["SibSp"].astype(int) + titanic["Parch"].astype(int)


In [5]:
# Preview the first rows to check the encoded Sex/Embarked columns and the new FamilySize column.
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize
0,22,,0,7.25,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2
1,38,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2
2,26,,0,7.925,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1
3,35,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2
4,35,,0,8.05,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1


In [1]:
import re

# A function to get the title from a name.
def get_title(name):
    """Return the title (e.g. "Mr", "Miss") embedded in a passenger name.

    The title is the first run of ASCII letters that is preceded by a space
    and immediately followed by a period, as in
    "Braund, Mr. Owen Harris" -> "Mr".

    Parameters
    ----------
    name : str
        Passenger name to search.

    Returns
    -------
    str
        The matched title without its trailing period, or "" when the name
        contains no such pattern.
    """
    # Raw string: "\." inside a plain literal is an invalid escape sequence
    # that newer Python versions warn about. The pattern itself is unchanged.
    title_search = re.search(r' ([A-Za-z]+)\.', name)
    if title_search is None:
        return ""
    return title_search.group(1)

In [7]:
# Extract the title from every passenger name and print how often each one occurs.
titles = titanic["Name"].apply(get_title)
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
print(titles.value_counts())

Mr          757
Miss        260
Mrs         197
Master       61
Dr            8
Rev           8
Col           4
Major         2
Mlle          2
Ms            2
Don           1
Capt          1
Lady          1
Sir           1
Countess      1
Jonkheer      1
Dona          1
Mme           1
dtype: int64


In [8]:
# Fold the rare titles into broader groups so that each group has a few members.
titles.loc[titles == "Mme"] = "Mlle"

titles.loc[titles.isin(["Capt", "Major", "Don"])] = "Sir"

# NOTE(review): "Jonkheer" is grouped with the female titles here although it
# appears to be a male honorific - confirm whether "Sir" was intended.
titles.loc[titles.isin(["Dona", "Countess", "Jonkheer"])] = "Lady"

# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
print(titles.value_counts())

Mr        757
Miss      260
Mrs       197
Master     61
Dr          8
Rev         8
Sir         5
Col         4
Lady        4
Mlle        3
Ms          2
dtype: int64


In [9]:
# Map each title to its own integer code. The rare titles were already folded
# into these eleven groups before this step.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Sir": 7,
                 "Col": 8, "Lady": 9, "Mlle": 10, "Ms": 11}
for k, v in title_mapping.items():
    titles.loc[titles == k] = v

# Verify that we converted everything: only integer codes should be listed.
# Series.value_counts replaces the deprecated top-level pd.value_counts helper.
print(titles.value_counts())

1     757
2     260
3     197
4      61
6       8
5       8
7       5
9       4
8       4
10      3
11      2
dtype: int64


In [10]:
# Attach the integer title codes as a new column, then save the frame without its row labels.
titanic["Title"] = titles
titanic.to_csv(path_or_buf="titanic1.csv", index=False)

In [11]:
# Preview the first rows to confirm the new Title column is present.
titanic.head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22,,0,7.25,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2,1
1,38,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26,,0,7.925,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
4,35,,0,8.05,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1,1


In [2]:
def simpleCleanData(titanic):
    """Encode, fill and extend a passenger frame; the frame is changed in place.

    Steps, in order:
      1. Sex is recoded: "male" -> 0, "female" -> 1.
      2. Missing Embarked becomes "S"; missing Fare becomes the median Fare.
      3. Embarked is recoded: "S" -> 0, "C" -> 1, "Q" -> 2.
      4. FamilySize is added as SibSp + Parch + 1.
      5. Title is added: the title found in Name by get_title, with rare
         titles merged into broader groups and every group mapped to an
         integer code. Titles that are in neither table are left unchanged.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Frame with the columns Sex, Embarked, Fare, SibSp, Parch and Name.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.
    """
    # 1. Sex -> integer code.
    for label, code in (("male", 0), ("female", 1)):
        titanic.loc[titanic["Sex"] == label, "Sex"] = code

    # 2. Fill the gaps before Embarked is recoded, so filled rows get a code too.
    titanic["Embarked"] = titanic["Embarked"].fillna("S")
    fare_median = titanic["Fare"].median()
    titanic["Fare"] = titanic["Fare"].fillna(fare_median)

    # 3. Embarked -> integer code.
    for port, code in (("S", 0), ("C", 1), ("Q", 2)):
        titanic.loc[titanic["Embarked"] == port, "Embarked"] = code

    # 4. FamilySize = SibSp + Parch + 1.
    sibsp = titanic["SibSp"].astype(int)
    parch = titanic["Parch"].astype(int)
    titanic["FamilySize"] = sibsp + parch + 1

    # 5. Title taken from the passenger name.
    titles = titanic["Name"].apply(get_title)

    # Rare titles are folded into a broader group first ...
    merged_titles = (
        ("Mme", "Mlle"),
        ("Capt", "Sir"),
        ("Major", "Sir"),
        ("Don", "Sir"),
        ("Dona", "Lady"),
        ("Countess", "Lady"),
        ("Jonkheer", "Lady"),
    )
    for rare_title, group_title in merged_titles:
        titles.loc[titles == rare_title] = group_title

    # ... then every remaining group gets its integer code.
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Dr": 5, "Rev": 6, "Sir": 7,
                     "Col": 8, "Lady": 9, "Mlle": 10, "Ms": 11}
    for title_name, title_code in title_mapping.items():
        titles.loc[titles == title_name] = title_code

    titanic["Title"] = titles

    return titanic

# predict age using other variables

In [12]:
# Keep only the rows whose Age is known; they are the training data for predicting Age.
titanic_age = titanic.loc[titanic["Age"].notnull()]

In [13]:
import numpy as np
# sklearn.cross_validation was removed in scikit-learn 0.20. model_selection
# provides the same cross_val_score, and it is imported under the old name so
# that later cells calling cross_validation.cross_val_score keep working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn older than 0.18
    from sklearn import cross_validation
from sklearn.tree import DecisionTreeRegressor

# Columns used as inputs when predicting Age.
predictors = ["Pclass", "Sex", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "Title"]

alg_decisionTree = DecisionTreeRegressor(random_state=1, max_depth=6)

# 5-fold cross-validated R^2 on the rows whose Age is known.
scores = cross_validation.cross_val_score(alg_decisionTree, titanic_age[predictors], titanic_age["Age"], cv=5, scoring="r2")

print(scores.mean())


0.383652235217


In [178]:
from sklearn.ensemble import RandomForestRegressor

# Random forest regressor for Age, scored with 5-fold cross-validated R^2.
alg_rf = RandomForestRegressor(
    n_estimators=150,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1,
)

scores = cross_validation.cross_val_score(
    alg_rf,
    titanic_age[predictors],
    titanic_age["Age"],
    scoring="r2",
    cv=5,
)

print(scores.mean())


0.393073028905


# fill in missing ages with mean values within each title group

In [14]:
# Rows whose Age is known; the per-title mean ages are computed from these.
titanic_age = titanic.loc[titanic["Age"].notnull()]

In [15]:
# How many rows have no Age recorded.
int(titanic["Age"].isnull().sum())

263

In [3]:
def getMeanAges(titles, data):
    """Compute the mean Age for each requested title code.

    Parameters
    ----------
    titles : iterable
        Title codes to look up in the "Title" column of `data`.
    data : pandas.DataFrame
        Frame holding at least the "Title" and "Age" columns.

    Returns
    -------
    dict
        Maps each title code to the mean of "Age" over the rows carrying that
        code. A code that matches no rows maps to NaN.
    """
    return {
        title_code: data.loc[data["Title"] == title_code, "Age"].mean()
        for title_code in titles
    }

In [22]:
# The integer title codes 1 through 11.
titleNumbers = list(range(1, 12))

In [23]:
# Mean known Age for every title code.
meanAges = getMeanAges(titles=titleNumbers, data=titanic_age)

In [4]:
def fillAges(meanAges, data):
    """Fill missing Age values with the mean age of the row's title.

    Parameters
    ----------
    meanAges : dict
        Maps a title code to the age written into rows that carry that code
        and have no Age.
    data : pandas.DataFrame
        Frame holding at least the "Title" and "Age" columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `data` in which every missing Age whose Title is a key of
        `meanAges` has been replaced. Ages that were already present are left
        as they are.
    """
    # Work on a copy: callers pass a filtered slice of a larger frame, and
    # writing into such a slice is unreliable.
    filled = data.copy()
    missing_age = filled["Age"].isnull()
    # Iterate over the mapping that was passed in instead of relying on a
    # notebook-level list of title codes being defined.
    for title_code, mean_age in meanAges.items():
        filled.loc[missing_age & (filled["Title"] == title_code), "Age"] = mean_age
    return filled

In [27]:
# Copy the rows with a missing Age first, so that fillAges writes into an
# independent frame rather than into a slice of `titanic`.
restData = fillAges(meanAges, titanic.loc[titanic["Age"].isnull()].copy())

In [28]:
# Number of rows that had their Age filled in.
restData.shape[0]

263

In [29]:
# Recombine the rows with a known Age and the rows whose Age was just filled.
updateTitanic = pd.concat(objs=[titanic_age, restData], axis=0)

In [30]:
# Preview the first 10 rows of the recombined frame (known-Age rows come first).
updateTitanic.head(10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22,,0,7.25,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2,1
1,38,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26,,0,7.925,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
4,35,,0,8.05,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1,1
6,54,E46,0,51.8625,"McCarthy, Mr. Timothy J",0,7,1,0,0,0,17463,1,1
7,2,,0,21.075,"Palsson, Master. Gosta Leonard",1,8,3,0,3,0,349909,5,4
8,27,,0,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1,347742,3,3
9,14,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1,237736,2,3
10,4,G6,0,16.7,"Sandstrom, Miss. Marguerite Rut",1,11,3,1,1,1,PP 9549,3,2


In [31]:
# Restore passenger order. DataFrame.sort was removed from pandas; sort_values replaces it.
updateTitanic = updateTitanic.sort_values("PassengerId")

In [32]:
# Preview the first 10 rows again, now ordered by PassengerId.
updateTitanic.head(10)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22.0,,0,7.25,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2,1
1,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26.0,,0,7.925,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35.0,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
4,35.0,,0,8.05,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1,1
5,32.252151,,2,8.4583,"Moran, Mr. James",0,6,3,0,0,0,330877,1,1
6,54.0,E46,0,51.8625,"McCarthy, Mr. Timothy J",0,7,1,0,0,0,17463,1,1
7,2.0,,0,21.075,"Palsson, Master. Gosta Leonard",1,8,3,0,3,0,349909,5,4
8,27.0,,0,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1,347742,3,3
9,14.0,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1,237736,2,3


In [33]:
# Save the frame with filled ages, without its row labels.
updateTitanic.to_csv(path_or_buf="fillAgeWithMeans.csv", index=False)

In [None]:
def fillAgeWithMeansAccordingToTitle(data):
    """Clean a passenger frame and fill missing ages with per-title mean ages.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame; it is handed to simpleCleanData first.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with missing Age values filled, ordered by
        PassengerId.
    """
    # Pre-clean the original data (encodings, fills, FamilySize, Title).
    titanic = simpleCleanData(data)

    # Split into rows with and without a recorded Age.
    has_age = titanic["Age"].notnull()
    titanic_age = titanic[has_age]

    # Mean known Age for each of the title codes 1..11.
    titleNumbers = list(range(1, 12))
    meanAges = getMeanAges(titleNumbers, titanic_age)

    # Copy the missing-Age rows so that fillAges works on an independent frame
    # instead of a slice of `titanic`.
    restData = fillAges(meanAges, titanic[~has_age].copy())

    updateTitanic = pd.concat([titanic_age, restData])

    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    updateTitanic = updateTitanic.sort_values("PassengerId")

    return updateTitanic

# Balancing the number of surviving and non-surviving passengers

In [34]:
# Draw 20 rows at random, with replacement, to spot-check the frame.
# No random_state is passed here, so the rows can differ between runs.
updateTitanic.sample(20, replace=True)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
243,32.252151,,0,7.8875,"Hyman, Mr. Abraham",0,1135,3,0,0,2,3470,1,1
838,32.0,,0,56.4958,"Chip, Mr. Chang",0,839,3,0,0,1,1601,1,1
487,58.0,B37,1,29.7,"Kent, Mr. Edward Austin",0,488,1,0,0,0,11771,1,1
66,18.0,,2,7.8792,"Burns, Miss. Mary Delia",0,958,3,1,0,2,330963,1,2
324,39.0,,0,211.3375,"Kreuchen, Miss. Emilie",0,1216,1,1,0,2,24160,1,2
92,46.0,E31,0,61.175,"Chaffee, Mr. Herbert Fuller",0,93,1,0,1,0,W.E.P. 5734,2,1
57,25.0,F G63,0,7.65,"Abelseth, Mr. Olaus Jorgensen",0,949,3,0,0,2,348122,1,1
334,36.994118,,0,133.65,"Frauenthal, Mrs. Henry William (Clara Heinshei...",0,335,1,1,1,1,PC 17611,2,3
168,32.252151,,0,25.925,"Baumann, Mr. John D",0,169,1,0,0,0,PC 17318,1,1
597,49.0,,0,0.0,"Johnson, Mr. Alfred",0,598,3,0,0,0,LINE,1,1


In [35]:
# Class gap: rows with Survived == 0 minus rows with Survived == 1.
int((updateTitanic["Survived"] == 0).sum()) - int((updateTitanic["Survived"] == 1).sum())

207

In [36]:
# Preview the first rows with Survived == 1, the group that is oversampled next.
updateTitanic.loc[updateTitanic["Survived"]==1].head()

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
1,38,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26,,0,7.925,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
8,27,,0,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1,347742,3,3
9,14,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1,237736,2,3


In [37]:
# Oversample the Survived == 1 rows, with replacement, by exactly the class gap
# (rows with Survived == 0 minus rows with Survived == 1) instead of a
# hard-coded count. The fixed seed makes the draw repeatable.
sampleDfTrain = updateTitanic.loc[updateTitanic["Survived"] == 1].sample(
    n=int((updateTitanic["Survived"] == 0).sum() - (updateTitanic["Survived"] == 1).sum()),
    replace=True,
    random_state=1
)

In [38]:
# First 20 of the resampled rows.
sampleDfTrain.head(20)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
311,18.0,B57 B59 B63 B66,1,262.375,"Ryerson, Miss. Emily Borie",2,312,1,1,2,1,PC 17608,5,2
346,40.0,,0,13.0,"Smith, Miss. Marion Elsie",0,347,2,1,0,1,31418,1,2
472,33.0,,0,27.75,"West, Mrs. Edwy Arthur (Ada Mary Worth)",2,473,2,1,1,1,C.A. 34651,4,3
166,36.994118,E33,0,55.0,"Chibnall, Mrs. (Edith Martha Bowerman)",1,167,1,1,0,1,113505,2,3
518,36.0,,0,26.0,"Angle, Mrs. William A (Florence ""Mary"" Agnes H...",0,519,2,1,1,1,226875,2,3
720,6.0,,0,33.0,"Harper, Miss. Annie Jessie ""Nina""",1,721,2,1,0,1,248727,2,2
39,14.0,,1,11.2417,"Nicola-Yarred, Miss. Jamila",0,40,3,1,1,1,2651,2,2
141,22.0,,0,7.75,"Nysten, Miss. Anna Sofia",0,142,3,1,0,1,347081,1,2
489,9.0,,0,15.9,"Coutts, Master. Eden Leslie ""Neville""",1,490,3,0,1,1,C.A. 37671,3,4
669,36.994118,C126,0,52.0,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",0,670,1,1,1,1,19996,2,3


In [39]:
# Rows with a real label (Survived is 2 only for rows loaded from test.csv).
# "!=" replaces "<>", which is not valid syntax in Python 3.
len(updateTitanic.loc[updateTitanic["Survived"] != 2])

891

In [40]:
# Show the rows with a real label. "!=" replaces "<>", which is not valid syntax in Python 3.
updateTitanic.loc[updateTitanic["Survived"] != 2]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22.000000,,0,7.2500,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2,1
1,38.000000,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26.000000,,0,7.9250,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35.000000,C123,0,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
4,35.000000,,0,8.0500,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1,1
5,32.252151,,2,8.4583,"Moran, Mr. James",0,6,3,0,0,0,330877,1,1
6,54.000000,E46,0,51.8625,"McCarthy, Mr. Timothy J",0,7,1,0,0,0,17463,1,1
7,2.000000,,0,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,0,3,0,349909,5,4
8,27.000000,,0,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1,347742,3,3
9,14.000000,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1,237736,2,3


In [41]:
# Labeled rows plus the resampled survivors.
# "!=" replaces "<>", which is not valid syntax in Python 3.
evenResponseTrain = pd.concat([updateTitanic.loc[updateTitanic["Survived"] != 2], sampleDfTrain])

In [42]:
# Display the balanced training frame (labeled rows followed by the resampled survivors).
evenResponseTrain

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket,FamilySize,Title
0,22.000000,,0,7.2500,"Braund, Mr. Owen Harris",0,1,3,0,1,0,A/5 21171,2,1
1,38.000000,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,1,1,1,PC 17599,2,3
2,26.000000,,0,7.9250,"Heikkinen, Miss. Laina",0,3,3,1,0,1,STON/O2. 3101282,1,2
3,35.000000,C123,0,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,1,1,1,113803,2,3
4,35.000000,,0,8.0500,"Allen, Mr. William Henry",0,5,3,0,0,0,373450,1,1
5,32.252151,,2,8.4583,"Moran, Mr. James",0,6,3,0,0,0,330877,1,1
6,54.000000,E46,0,51.8625,"McCarthy, Mr. Timothy J",0,7,1,0,0,0,17463,1,1
7,2.000000,,0,21.0750,"Palsson, Master. Gosta Leonard",1,8,3,0,3,0,349909,5,4
8,27.000000,,0,11.1333,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",2,9,3,1,0,1,347742,3,3
9,14.000000,,1,30.0708,"Nasser, Mrs. Nicholas (Adele Achem)",0,10,2,1,1,1,237736,2,3


In [43]:
# Save the balanced training frame, without its row labels.
evenResponseTrain.to_csv(path_or_buf="evenResponseTrain.csv", index=False)

In [None]:
def evenNumOfReponseTrain(data, random_state=None):
    """Build a training frame with equally many Survived == 0 and == 1 rows.

    The frame is cleaned and its ages are filled by
    fillAgeWithMeansAccordingToTitle. Rows whose Survived value is 2 are then
    dropped (the notebook writes 2 into rows loaded from test.csv), and the
    smaller class is oversampled with replacement until both classes have the
    same number of rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw passenger frame.
    random_state : int or numpy.random.RandomState, optional
        Passed to DataFrame.sample so the oversampling can be made repeatable.
        The default, None, draws a different sample on every call.

    Returns
    -------
    pandas.DataFrame
        The labeled rows followed by the extra resampled rows.
    """
    updateTitanic = fillAgeWithMeansAccordingToTitle(data)

    # "!=" replaces "<>", which is not valid syntax in Python 3.
    labeled = updateTitanic.loc[updateTitanic["Survived"] != 2]
    survivors = labeled.loc[labeled["Survived"] == 1]
    non_survivors = labeled.loc[labeled["Survived"] == 0]

    # Size of the class gap, measured on the data instead of a fixed count.
    shortfall = len(non_survivors) - len(survivors)
    if shortfall == 0:
        return labeled

    # Oversample whichever class is smaller.
    minority = survivors if shortfall > 0 else non_survivors
    sampleDfTrain = minority.sample(abs(shortfall), replace=True, random_state=random_state)
    evenResponseTrain = pd.concat([labeled, sampleDfTrain])

    return evenResponseTrain