In [1]:
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import ensemble, model_selection, decomposition, preprocessing

In [2]:
# Load the raw training data and preview the first rows.
data = pd.read_csv('./data/train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Number of rows and columns in the raw data.
data.shape

(891, 12)

In [4]:
# Column dtypes and non-null counts, to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
# Cabin is dropped for two reasons: most of its values are missing, and
# survival is assumed here to be unrelated to the cabin.
data.drop(columns='Cabin', inplace=True)

In [6]:
# Confirm that the Cabin column is gone.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 76.6+ KB


In [7]:
# Embarked has only two missing values, so those rows are simply removed.
index = data.index[data['Embarked'].isnull()]
data.drop(index=index, inplace=True)

In [8]:
# Confirm that the rows with a missing Embarked value were removed.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 889 entries, 0 to 890
Data columns (total 11 columns):
PassengerId    889 non-null int64
Survived       889 non-null int64
Pclass         889 non-null int64
Name           889 non-null object
Sex            889 non-null object
Age            712 non-null float64
SibSp          889 non-null int64
Parch          889 non-null int64
Ticket         889 non-null object
Fare           889 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(4)
memory usage: 83.3+ KB


In [9]:
# Count the remaining missing values per column.
data.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

In [11]:
# Check for outliers by comparing a wide range of percentiles per numeric column.
percentiles = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
data.describe(percentiles=percentiles).T

Unnamed: 0,count,mean,std,min,1%,5%,10%,25%,50%,75%,90%,99%,max
PassengerId,889.0,446.0,256.998173,1.0,9.88,45.4,90.8,224.0,446.0,668.0,801.2,882.12,891.0
Survived,889.0,0.382452,0.48626,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
Pclass,889.0,2.311586,0.8347,1.0,1.0,1.0,1.0,2.0,3.0,3.0,3.0,3.0,3.0
Age,712.0,29.642093,14.492933,0.42,1.0,4.0,14.0,20.0,28.0,38.0,50.0,65.89,80.0
SibSp,889.0,0.524184,1.103705,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,5.0,8.0
Parch,889.0,0.382452,0.806761,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,4.0,6.0
Fare,889.0,32.096681,49.697504,0.0,0.0,7.225,7.55,7.8958,14.4542,31.0,77.2875,249.303304,512.3292


In [12]:
# Separate the target (Survived) from the feature columns, then hold out
# 30% of the rows as a test set using a fixed random seed.
y = data['Survived']
X = data.drop(columns='Survived')
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=66
)

In [29]:
# The classes are slightly imbalanced.
y.value_counts()

0    549
1    340
Name: Survived, dtype: int64

In [13]:
# The split keeps the original row labels; replace them with 0..n-1 on every part.
for part in (X_train, X_test, y_train, y_test):
    part.index = pd.RangeIndex(len(part))

In [14]:
# Preview the training features after the index reset.
X_train.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,485,1,"Bishop, Mr. Dickinson H",male,25.0,1,0,11967,91.0792,C
1,289,2,"Hosono, Mr. Masabumi",male,42.0,0,0,237798,13.0,S
2,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,S
3,498,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,S
4,814,3,"Andersson, Miss. Ebba Iris Alfrida",female,6.0,4,2,347082,31.275,S


In [15]:
# Fill the missing Age values.
# The mean is computed once, from the training set only, and reused for the
# test set so that no information from the test rows is used.
age_mean = X_train['Age'].mean()

# `assign` builds new frames instead of writing into the frames returned by
# train_test_split, which is what raised SettingWithCopyWarning before.
X_train = X_train.assign(Age=X_train['Age'].fillna(age_mean))
X_test = X_test.assign(Age=X_test['Age'].fillna(age_mean))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [16]:
# Check that Age has no missing values left in the training set.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 10 columns):
PassengerId    622 non-null int64
Pclass         622 non-null int64
Name           622 non-null object
Sex            622 non-null object
Age            622 non-null float64
SibSp          622 non-null int64
Parch          622 non-null int64
Ticket         622 non-null object
Fare           622 non-null float64
Embarked       622 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 48.7+ KB


In [17]:
# Check that Age has no missing values left in the test set.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 267 entries, 0 to 266
Data columns (total 10 columns):
PassengerId    267 non-null int64
Pclass         267 non-null int64
Name           267 non-null object
Sex            267 non-null object
Age            267 non-null float64
SibSp          267 non-null int64
Parch          267 non-null int64
Ticket         267 non-null object
Fare           267 non-null float64
Embarked       267 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 20.9+ KB


In [28]:
# Save the processed splits so the following steps can pick them up from disk.
# Each frame is the feature columns with the target column appended last.
train_data = pd.concat([X_train, y_train], axis=1, sort=False)
test_data = pd.concat([X_test, y_test], axis=1, sort=False)

# to_csv does not create missing directories, so make sure ./temp exists
# before writing; otherwise a fresh run fails here.
os.makedirs('./temp', exist_ok=True)

# Temporary files for the next processing stage. The row index is written as
# the first (unnamed) column, exactly as before.
train_data.to_csv('./temp/train_data.csv')
test_data.to_csv('./temp/test_data.csv')