In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression

# 1. Introduction:
- Titanic survival prediction is a challenge in Kaggle which predict a person survived or not provided his or her feature
- Link: https://www.kaggle.com/c/titanic/data
- In this exercise we will use pandas to processing data (in format of a table) and LogisticRegression to train on this data
- First you need to follow some basic tutorial of pandas: https://www.kaggle.com/learn/pandas (Don't need to learn all, in this exercise, we recommend you to learn part 1: Creating, Reading, Writing workbook and part 2: Indexing, Selecting and Assigning)


# 2. Load data:
- Load titanic_train from csv file
- After processing, data contains 183 rows and 7 columns
- Training data is stored in a dataframe variable called training_data and training label is stored in a variable called training_label

In [4]:
titanic_df = pd.read_csv("titanic_train.csv", index_col=None)
pd.set_option("display.max_rows", 10)

titanic_data = titanic_df.dropna().drop(['Survived','Name', 'PassengerId','Ticket', 'Cabin'], axis=1)
titanic_label = titanic_df.dropna()['Survived']

titanic_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,female,38.0,1,0,71.2833,C
3,1,female,35.0,1,0,53.1000,S
6,1,male,54.0,0,0,51.8625,S
10,3,female,4.0,1,1,16.7000,S
11,1,female,58.0,0,0,26.5500,S
...,...,...,...,...,...,...,...
871,1,female,47.0,1,1,52.5542,S
872,1,male,33.0,0,0,5.0000,S
879,1,female,56.0,0,1,83.1583,C
887,1,female,19.0,0,0,30.0000,S


In [5]:
titanic_label

1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: Survived, Length: 183, dtype: int64

# 3. Converting Sex and Emabarked to numeric
- Unfortunately, machine learning models in sklearn don't accept data which is not number, such as: Sex (female, male), Embark (S,C,Q)
- You need to convert them to number with the following rules:
    - Sex:
        - Female: 0
        - Male : 1
    - Embarked:
        - C: 0
        - Q: 1
        - S: 2
            
- Output will look like picture "titanic_data_numeric.png"
- Python dictionary is a list that you can access element by its key

In [6]:
sex_dictionary = {'male':1, 'female':0}
embark_dictionary = {'C':0, 'Q':1, 'S':2}

- Iterate over each row of titanic_data where index variable is index of each row at each loop and row variable is data of each row
- Read this link: https://stackoverflow.com/questions/16476924/how-to-iterate-over-rows-in-a-dataframe-in-pandas
- Convert data of Sex and Embarked columns to numeric using sex_dictionary and embark_dictionary and assign to corresponding row of titanic_data_numeric variable
- Hint: Use dataframe.loc[] to access to an element at each row

In [7]:
titanic_data_numeric = titanic_data.copy()

for index, row in titanic_data.iterrows():
    titanic_data_numeric.loc[index, 'Sex'] = sex_dictionary[row['Sex']]
    titanic_data_numeric.loc[index, 'Embarked'] = embark_dictionary[row['Embarked']]



In [8]:
titanic_data_numeric

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
1,1,0,38.0,1,0,71.2833,0
3,1,0,35.0,1,0,53.1000,2
6,1,1,54.0,0,0,51.8625,2
10,3,0,4.0,1,1,16.7000,2
11,1,0,58.0,0,0,26.5500,2
...,...,...,...,...,...,...,...
871,1,0,47.0,1,1,52.5542,2
872,1,1,33.0,0,0,5.0000,2
879,1,0,56.0,0,1,83.1583,0
887,1,0,19.0,0,0,30.0000,2


# 4. Train Logistic Regression
- Define model
- Fit titanic_data_numeric, titanic_label into this model

In [9]:
logistic_reg = LogisticRegression()
logistic_reg.fit(titanic_data_numeric, titanic_label)
print(logistic_reg.score(titanic_data_numeric, titanic_label))

0.75956284153


In [10]:
# make prediction on first 10 training examples
data_first_ten = titanic_data_numeric.head(10)
true_label_first_ten = titanic_label.head(10)
predicted_label_first_ten = logistic_reg.predict(data_first_ten)

data_first_ten['True Label'] = true_label_first_ten
data_first_ten['Predicted Label'] = predicted_label_first_ten

data_first_ten

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,True Label,Predicted Label
1,1,0,38.0,1,0,71.2833,0,1,1
3,1,0,35.0,1,0,53.1,2,1,1
6,1,1,54.0,0,0,51.8625,2,0,0
10,3,0,4.0,1,1,16.7,2,1,1
11,1,0,58.0,0,0,26.55,2,1,1
21,2,1,34.0,0,0,13.0,2,1,0
23,1,1,28.0,0,0,35.5,2,1,0
27,1,1,19.0,3,2,263.0,2,0,1
52,1,0,49.0,1,0,76.7292,0,1,1
54,1,1,65.0,0,1,61.9792,0,0,0
