# Preprocessing

In [None]:
# Load train.csv and test.csv, then inspect the training set's schema.
import pandas as pd

df_train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Feature-only view of the training data: same rows, label column removed.
df_train_nosurvived = df_train.drop(columns='Survived')

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [None]:
# Inspect the test set's columns, dtypes and non-null counts.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
# The test set's Fare column contains missing values; fill them with the
# rounded mean of the observed fares.
test_na = test.isna()

# Compute the fill value once, before assigning anything, so that every
# missing row receives the same value and the mean is never influenced by
# already-imputed entries (Series.mean skips NaN by default).
fare_fill_value = round(test['Fare'].mean())

# A boolean-mask assignment replaces the per-row Python loop and does not
# depend on the index being 0..n-1.
test.loc[test_na['Fare'], 'Fare'] = fare_fill_value
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         418 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [None]:
# Stack the training features on top of the test set so that encoding and
# standardization are applied to both at once; ignore_index gives the
# combined frame a fresh 0..n-1 index.
df_all = pd.concat([df_train_nosurvived, test], axis=0, ignore_index=True)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Pclass       1309 non-null   int64  
 2   Name         1309 non-null   object 
 3   Sex          1309 non-null   object 
 4   Age          1046 non-null   float64
 5   SibSp        1309 non-null   int64  
 6   Parch        1309 non-null   int64  
 7   Ticket       1309 non-null   object 
 8   Fare         1309 non-null   float64
 9   Cabin        295 non-null    object 
 10  Embarked     1307 non-null   object 
dtypes: float64(2), int64(4), object(5)
memory usage: 112.6+ KB


In [None]:
# Rank the absolute pairwise correlations to see which features track Age,
# in order to decide how its missing values should be imputed.
#
# Only numeric columns are correlated: df_all also holds object columns
# (Name, Sex, Ticket, Cabin, Embarked), and pandas >= 2.0 raises on
# DataFrame.corr() when non-numeric columns are present.
df_all_corr = (
    df_all.select_dtypes(include='number')
    .corr()
    .abs()
    .unstack()
    .sort_values(kind="quicksort", ascending=False)
    .reset_index()
    .rename(columns={"level_0": "Feature 1", "level_1": "Feature 2", 0: 'Correlation Coefficient'})
)
df_all_corr[df_all_corr['Feature 1'] == 'Age']

Unnamed: 0,Feature 1,Feature 2,Correlation Coefficient
3,Age,Age,1.0
9,Age,Pclass,0.408106
12,Age,SibSp,0.243699
17,Age,Fare,0.178328
20,Age,Parch,0.150917
30,Age,PassengerId,0.028814


In [None]:
# Fill missing Age values using the median age of each (Sex, Pclass) group.
age_group_cols = ['Sex', 'Pclass']

# Select Age *before* aggregating: taking the median of the whole frame
# would also try to aggregate the object columns, which fails on pandas 2.x.
age_by_pclass_sex = df_all.groupby(age_group_cols)['Age'].median()

for pclass in range(1, 4):
    for sex in ['female', 'male']:
        print('Median age of Pclass {} {}s: {}'.format(pclass, sex, age_by_pclass_sex[sex][pclass]))
print('Median age of all passengers: {}'.format(df_all['Age'].median()))

# Filling the missing values in Age with the medians of Sex and Pclass groups.
# transform('median') returns one value per original row, indexed like df_all,
# so the fill aligns correctly regardless of how groupby orders its groups.
df_all['Age'] = df_all['Age'].fillna(df_all.groupby(age_group_cols)['Age'].transform('median'))

Median age of Pclass 1 females: 36.0
Median age of Pclass 1 males: 42.0
Median age of Pclass 2 females: 28.0
Median age of Pclass 2 males: 29.5
Median age of Pclass 3 females: 22.0
Median age of Pclass 3 males: 25.0
Median age of all passengers: 28.0


In [None]:
# Preprocess the combined frame: one-hot encode, standardize, then split it
# back into the labelled training rows and the unlabelled test rows.
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics, preprocessing

# One-hot encoding (free-text / high-cardinality columns are dropped first).
df_all2 = df_all.drop(['Name','Ticket','Cabin'],axis=1)
data_dum = pd.get_dummies(df_all2)

# Standardization. PassengerId is an identifier, so it is kept out of the
# scaler and re-attached unscaled afterwards.
# NOTE(review): the scaler is fit on training and test rows together, so
# test-set statistics influence the training features — confirm this is
# intended.
data_dum2 = data_dum.drop('PassengerId',axis=1)
standardscaler = preprocessing.StandardScaler()
data_standard = pd.DataFrame(
    standardscaler.fit_transform(data_dum2),
    # Take labels from the frame that was actually scaled instead of a
    # hand-typed list, so columns can never be silently mislabelled.
    columns=data_dum2.columns,
    index=data_dum2.index,
)
data_standard['PassengerId'] = data_dum['PassengerId']

# The first len(df_train) rows of the combined frame are the training rows;
# everything after them is the test set. Deriving the boundary from the data
# avoids hard-coded row counts.
n_train = len(df_train)

# Store the test-set rows in the variable `test`.
test = data_standard.iloc[n_train:].reset_index(drop=True)
# .copy() makes the training part an independent frame so that the Survived
# column can be added below without touching a slice of the combined frame.
data_standard = data_standard.iloc[:n_train].copy()

xs = data_standard[['Pclass', 'Sex_female', 'Sex_male', 'Age', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'SibSp', 'Parch', 'Fare']]
y = df_train['Survived']

train_X, test_X, train_y, test_y = train_test_split(xs, y, test_size = 0.2, random_state=0)
data_standard['Survived'] = df_train['Survived']
data_standard

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,PassengerId,Survived
0,0.841916,-0.549555,0.481288,-0.445000,-0.503634,-0.743497,0.743497,-0.50977,-0.322040,0.657394,1,0
1,-1.546098,0.661353,0.481288,-0.445000,0.734462,1.344995,-1.344995,1.96167,-0.322040,-1.521159,2,1
2,0.841916,-0.246828,-0.479087,-0.445000,-0.490583,1.344995,-1.344995,-0.50977,-0.322040,0.657394,3,1
3,-1.546098,0.434307,0.481288,-0.445000,0.382884,1.344995,-1.344995,-0.50977,-0.322040,0.657394,4,1
4,0.841916,0.434307,-0.479087,-0.445000,-0.488166,-0.743497,0.743497,-0.50977,-0.322040,0.657394,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...
886,-0.352091,-0.171147,-0.479087,-0.445000,-0.392457,-0.743497,0.743497,-0.50977,-0.322040,0.657394,887,0
887,-1.546098,-0.776601,-0.479087,-0.445000,-0.063759,1.344995,-1.344995,-0.50977,-0.322040,0.657394,888,1
888,0.841916,-0.549555,0.481288,1.866526,-0.190404,1.344995,-1.344995,-0.50977,-0.322040,0.657394,889,0
889,-1.546098,-0.246828,-0.479087,-0.445000,-0.063759,-0.743497,0.743497,1.96167,-0.322040,-1.521159,890,1


# Model

In [None]:
# Inspect the correlation between each variable and Survived to decide which features to use.
data_standard.corr()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,PassengerId,Survived
Pclass,1.0,-0.417667,0.083081,0.018443,-0.5495,-0.1319,0.1319,-0.243292,0.221009,0.08172,-0.035144,-0.338481
Age,-0.417667,1.0,-0.249747,-0.176733,0.124061,-0.101354,0.101354,0.042469,-0.090488,0.011861,0.038742,-0.058635
SibSp,0.083081,-0.249747,1.0,0.414838,0.159651,0.114631,-0.114631,-0.059528,-0.026354,0.070941,-0.057527,-0.035322
Parch,0.018443,-0.176733,0.414838,1.0,0.216225,0.245489,-0.245489,-0.011069,-0.081228,0.063036,-0.001652,0.081629
Fare,-0.5495,0.124061,0.159651,0.216225,1.0,0.182333,-0.182333,0.269335,-0.117216,-0.166603,0.012658,0.257307
Sex_female,-0.1319,-0.101354,0.114631,0.245489,0.182333,1.0,-1.0,0.082853,0.074115,-0.125722,-0.042939,0.543351
Sex_male,0.1319,0.101354,-0.114631,-0.245489,-0.182333,-1.0,1.0,-0.082853,-0.074115,0.125722,0.042939,-0.543351
Embarked_C,-0.243292,0.042469,-0.059528,-0.011069,0.269335,0.082853,-0.082853,1.0,-0.148258,-0.778359,-0.001205,0.16824
Embarked_Q,0.221009,-0.090488,-0.026354,-0.081228,-0.117216,0.074115,-0.074115,-0.148258,1.0,-0.496624,-0.033606,0.00365
Embarked_S,0.08172,0.011861,0.070941,0.063036,-0.166603,-0.125722,0.125722,-0.778359,-0.496624,1.0,0.022148,-0.15566


In [None]:
# Linear SVM: estimate hold-out accuracy, then refit on all labelled rows
# and write the test-set predictions to svm_predict3.csv.
from sklearn.svm import SVC
from sklearn import metrics

# Each feature is listed exactly once. The previous selection repeated
# 'Sex_female', which handed the model two identical columns.
# NOTE(review): if 'Sex_male' was meant as the second entry, it is perfectly
# anti-correlated with 'Sex_female' (-1.0 in the correlation table) and would
# add no information.
svm_features = ['Sex_female', 'Pclass', 'Fare']

# Hold-out evaluation on the 80/20 split.
svm = SVC(kernel='linear', probability=True)
svm.fit(train_X[svm_features], train_y)

test_y_predicted = svm.predict(test_X[svm_features])
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

# Final model: refit on every labelled row before predicting the test set.
svm = SVC(kernel='linear', probability=True)
svm.fit(xs[svm_features], y)

test_model = test.drop('PassengerId',axis=1)
test_set_predicted = svm.predict(test_model[svm_features])
svm_predict = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_set_predicted})
svm_predict.to_csv('svm_predict3.csv', index=False)

0.7877094972067039


In [None]:
# Logistic regression: estimate hold-out accuracy, then refit on all labelled
# rows and write the test-set predictions to logistic_predict3.csv.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Each feature is listed exactly once. The previous selection repeated
# 'Sex_female', which handed the model two identical columns.
logistic_features = ['Sex_female', 'Pclass', 'Fare']

# Hold-out evaluation on the 80/20 split.
clf = LogisticRegression(random_state=0).fit(train_X[logistic_features], train_y)

test_y_predicted = clf.predict(test_X[logistic_features])
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

# Final model: refit on every labelled row before predicting the test set.
clf = LogisticRegression(random_state=0).fit(xs[logistic_features], y)

test_model = test.drop('PassengerId',axis=1)
test_set_predicted = clf.predict(test_model[logistic_features])
logistic_predict = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_set_predicted})
logistic_predict.to_csv('logistic_predict3.csv', index=False)

0.7877094972067039


In [None]:
# Random forest: estimate hold-out accuracy, then refit on all labelled rows
# and write the test-set predictions to rf_predict3.csv.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Each feature is listed exactly once. The previous selection repeated
# 'Sex_female', which handed the model two identical columns.
rf_features = ['Sex_female', 'Pclass', 'Fare']

# random_state pins the bootstrap samples and feature draws so the reported
# accuracy and the submission file are reproducible across runs.
forest = RandomForestClassifier(n_estimators = 250,min_samples_split=20, random_state=0)

# Hold-out evaluation on the 80/20 split.
forest_fit = forest.fit(train_X[rf_features], train_y)

test_y_predicted = forest_fit.predict(test_X[rf_features])
accuracy = metrics.accuracy_score(test_y, test_y_predicted)
print(accuracy)

# Final model: refit the same estimator on every labelled row.
forest_fit = forest.fit(xs[rf_features], y)

test_model = test.drop('PassengerId',axis=1)
test_set_predicted = forest_fit.predict(test_model[rf_features])
rf_predict = pd.DataFrame({'PassengerId': test['PassengerId'], 'Survived': test_set_predicted})
rf_predict.to_csv('rf_predict3.csv', index=False)
rf_predict

0.8212290502793296


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
