<a href="https://colab.research.google.com/github/YukiTsukisaka/plotly_demo/blob/main/titanic_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# データ分析デモ

## 準備

In [None]:
!pip install japanize-matplotlib # matplotlibの日本語（文字化け対策）
!pip install plotly              # plotlyのインストール

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## インポート

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px

## データ読込

In [None]:
# Single source of truth for the data directory, so the three reads below cannot
# drift apart (the path is relative to the notebook's working directory).
# NOTE(review): this directory presumably only exists after Google Drive has been
# mounted under ./drive; no mount step is visible in this notebook -- confirm it
# runs before this cell on a fresh kernel.
DATA_DIR = './drive/MyDrive/Colab Notebooks/titanic'

train = pd.read_csv(f'{DATA_DIR}/train.csv')                              # training data
test = pd.read_csv(f'{DATA_DIR}/test.csv')                                # test data
gender_submission = pd.read_csv(f'{DATA_DIR}/gender_submission.csv')      # sample submission data

## 簡単な可視化

In [None]:
# Preview only the first ten rows of the training data.
train.iloc[:10]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [None]:
# Column overview: one line per column with its non-null count and dtype.
# verbose=True asks for the per-column listing explicitly; for a frame this small
# that is also what the default settings print.
train.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB



|  カラム名  | カラム名（日本語）  | 補足 |
| ---- | ---- | ---- |
| PassengerId | 乗客ID | |
| Survived | 生存 | 0が死亡、1が生存 |
| Pclass | チケットクラス | 1st 2nd 3rdの順のグレード|
| Name |名前 |
| Sex | 性別 |
| Age | 年齢 | 
| SibSp | 同乗した兄弟または配偶者の人数 |
| Parch |同乗した親または子供の人数 |
| Ticket | チケット番号 |
| Fare | 運賃 |
| Cabin | キャビン番号 |
| Embarked | 乗船港 |


In [None]:
# Summary statistics (only the numeric columns are reported for a mixed-dtype frame).
# The quartiles are spelled out; they are the same ones describe() uses by default.
train.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [None]:
# Summary statistics for the object-dtype columns: count / unique / top / freq.
train.describe(include=[object])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


## plotlyによる可視化

In [None]:
# Passenger counts by outcome (Survived: 0 = died, 1 = survived).
# text_auto=True labels each bar with its exact count; a '.2s' format would round
# the label to two significant digits (e.g. 549 would be shown as 550).
px.histogram(
    train,
    x='Survived',
    color='Survived',
    title='死亡者と生存者の数',
    text_auto=True,
)

In [None]:
# Share of passengers per outcome: count of each Survived value divided by the
# total number of rows.
display(train['Survived'].value_counts().div(len(train)))

0    0.616162
1    0.383838
Name: Survived, dtype: float64

### グラフからわかる事
|死亡者 | 生存者 |
| --- | --- |
| 549人 | 342人 | 
| 約61.6% | 約38.4% |

#### 6割の人が亡くなっている事がわかる。

In [None]:
# Passenger counts by outcome, one facet column per sex.
# text_auto=True labels each bar with its exact count; a '.2s' format would round
# the label to two significant digits (e.g. 468 would be shown as 470).
px.histogram(
    train,
    x='Survived',
    color='Survived',
    facet_col='Sex',
    title='男女別の死亡者と生存者の数',
    text_auto=True,
)

In [None]:
# Passenger counts with sex as rows and survival outcome as columns.
display(
    pd.crosstab(
        index=train['Sex'],
        columns=train['Survived'],
    )
)

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,81,233
male,468,109


In [None]:
# Survival outcome as a proportion within each sex: normalize='index' divides
# every row by its own total, so each sex row sums to 1.
display(
    pd.crosstab(
        index=train['Sex'],
        columns=train['Survived'],
        normalize='index',
    )
)

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.257962,0.742038
male,0.811092,0.188908


### グラフからわかる事
| 男性死亡者 | 男性生存者 | 女性死亡者 | 女性生存者 |
| --- | --- | --- |--- |
| 468人 | 109人 | 81人 | 233人 |
| 81.1% | 18.9% | 25.8% | 74.2% |

#### 男性の死亡者が多く、女性の生存者が多い

In [None]:
# Passenger counts by outcome, faceted by ticket class (columns) and sex (rows).
# text_auto=True labels each bar with its exact count; a '.2s' format would round
# any three-digit count to two significant digits.
px.histogram(
    train,
    x='Survived',
    color='Survived',
    facet_col='Pclass',
    facet_row='Sex',
    title='チケットクラス別と男女別の死亡者と生存者の数',
    text_auto=True,
)

In [None]:
# Passenger counts: (sex, survival outcome) pairs as rows, ticket class as columns.
display(
    pd.crosstab(
        index=[train['Sex'], train['Survived']],
        columns=train['Pclass'],
    )
)

Unnamed: 0_level_0,Pclass,1,2,3
Sex,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0,3,6,72
female,1,91,70,72
male,0,77,91,300
male,1,45,17,47


In [None]:
# Ticket class as rows, (sex, survival outcome) pairs as columns.
# normalize='index' divides every row by its own total, so each value is a share
# of that ticket class (each class row sums to 1).
display(
    pd.crosstab(
        index=train['Pclass'],
        columns=[train['Sex'], train['Survived']],
        normalize='index',
    )
)

Sex,female,female,male,male
Survived,0,1,0,1
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,0.013889,0.421296,0.356481,0.208333
2,0.032609,0.380435,0.494565,0.092391
3,0.14664,0.14664,0.610998,0.095723


In [None]:
# Sunburst hierarchy from the centre outwards: survival outcome -> sex -> ticket class.
px.sunburst(
    data_frame=train,
    path=['Survived', 'Sex', 'Pclass'],
    title="チケットクラス別と男女別の死亡者と生存者の割合",
)

### グラフからわかる事
<!-- | 男性死亡者 | 男性生存者 | 女性死亡者 | 女性生存者 |
| --- | --- | --- |--- |
| 468人 | 109人 | 81人 | 233人 |
| 81.1% | 18.8% | 25.7% | 74.2% | -->

#### 1stの女性が最も生存している。3rdの男性が最も多く亡くなっている。クラスによって生存率が変わる事がわかる。

In [None]:
# Age distribution split by survival outcome (colour), with one facet column per
# sex and one facet row per ticket class; the outcome histograms are overlaid.
px.histogram(
    data_frame=train,
    x='Age',
    color='Survived',
    facet_col='Sex',
    facet_row='Pclass',
    barmode='overlay',
    nbins=30,
    title='乗船者の年齢と性別の分布',
)

In [None]:
# Bucket ages into 10-year bands (0, 10, 20, ...) in a new Age_Range column.
# Missing ages are left as NaN (np.trunc propagates NaN) instead of being filled
# with 0: a 0 fill would overwrite the raw Age column and put passengers of
# unknown age into the 0 band together with real infants.
# NOTE(review): any later cell that requires Age without NaN must now handle the
# missing values explicitly.
train['Age_Range'] = np.trunc(train['Age'] / 10) * 10
train[['Age_Range', 'Age']]

Unnamed: 0,Age_Range,Age
0,20.0,22.0
1,30.0,38.0
2,20.0,26.0
3,30.0,35.0
4,30.0,35.0
...,...,...
886,20.0,27.0
887,10.0,19.0
888,0.0,0.0
889,20.0,26.0


In [None]:
# Scatter-plot matrix of the numeric features.
# The dimensions are listed explicitly: passing only the frame would also plot the
# PassengerId row identifier and the free-text Name / Ticket / Cabin columns,
# which makes the grid very large and unreadable.
px.scatter_matrix(
    train,
    dimensions=['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare'],
    title='数値項目の散布図行列',
)