In [1]:
import pandas as pd

In [3]:
# Series 序列，用来存储任意类型一维数组，包含数值和索引，默认索引0，1，2，3...
s = pd.Series([1,2,3,4], index = ['a', 'b', 'c', 'd'])

In [7]:
s

a    1
b    2
c    3
d    4
dtype: int64

In [12]:
# DataFrame 二维数据表格
# 可以通过列表、字典、二维数组创建

# 1.列表创建
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]])
print(df)

   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9


In [17]:
# 定义列索引和行索引
df = pd.DataFrame([[1,2,3],[4,5,6],[7,8,9]], 
                  columns=["fir","sec","thir"],
                  index = ["一","二","三"])
print(df)

   fir  sec  thir
一    1    2     3
二    4    5     6
三    7    8     9


In [19]:
# 通过字典创建
dic = {'姓名':["Alan","Bob","Chole"],
       '性别':["female","male","male"]}
df = pd.DataFrame(dic, index = ["一","二","三"])
print(df)

      姓名      性别
一   Alan  female
二    Bob    male
三  Chole    male


In [21]:
# 二维数组创建DataFrame
import numpy as np

arr = np.arange(12).reshape(3,4)
df = pd.DataFrame(arr)
print(df)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


### 文件读取和写入

**读取excel、csv、txt**     
pd.read_excel()  
pd.read_csv()  
pd.read_table()    
**写入excel、csv、txt**  
df.to_excel("", index = False)  # index = False可以去除索引写入。  
df.to_csv("", sep = '\t', index = False) # sep可以自定义分割符，默认","。  
df.to_csv()也可以写入txt  

In [30]:
titanic = pd.read_csv("tatanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [31]:
titanic.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [32]:
titanic.to_excel("titanic.xlsx", sheet_name = "Passengers", index = False)

In [36]:
titanic = pd.read_excel("titanic.xlsx", sheet_name="Passengers")

In [38]:
titanic.info

<bound method DataFrame.info of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                   

In [45]:
ages = titanic['Age']
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [51]:
age_sex = titanic[['Age','Sex']]
age_sex.head()

Unnamed: 0,Age,Sex
0,22.0,male
1,38.0,female
2,26.0,female
3,35.0,female
4,35.0,male


In [52]:
age_sex.shape

(891, 2)

In [53]:
type(age_sex)

pandas.core.frame.DataFrame

In [78]:
# 大于35岁的乘客筛选
above_35 = titanic[titanic['Age'] > 35]
above_35.shape

(217, 12)

In [77]:
# 来自2，3等级仓的乘客
class_23 = titanic[titanic["Pclass"].isin([2, 3])]
class_23.shape

(675, 12)

In [76]:
# 去除空值，notna()
age_no_null = titanic[titanic['Age'].notna()]
age_no_null.shape

(714, 12)

In [81]:
# how do i select specific rows and columns 
# 如何筛选行和列？选取年龄大于35岁的所有人名
adult_names = titanic.loc[titanic['Age'] > 35, "Name"]
adult_names.head()

1     Cumings, Mrs. John Bradley (Florence Briggs Th...
6                               McCarthy, Mr. Timothy J
11                             Bonnell, Miss. Elizabeth
13                          Andersson, Mr. Anders Johan
15                     Hewlett, Mrs. (Mary D Kingcome) 
Name: Name, dtype: object

In [82]:
adult_names.shape

(217,)

在需要同时筛选行和列时，需要用到loc/iloc方法，loc/iloc[行, 列]，iloc一般传入数字当作筛选索引，loc传入具体的index字符

In [90]:
# 选取指定范围内dataframe子集
# 比如选取10-13行，3-5列
titanic.iloc[9:13, 2:5] = "a"
titanic.iloc[9:13, 2:5]

Unnamed: 0,Pclass,Name,Sex
9,a,a,a
10,a,a,a
11,a,a,a
12,a,a,a


In [89]:
titanic.iloc[0:1, 3] = "AAA"
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,AAA,male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


> **REMEMBER**    
> * When selecting subsets of data, square brackets [] are used.
> * Inside these brackets, you can use a single column/row label, a list of column/row labels, a slice of labels, a conditional expression or a colon.
> * Select specific rows and/or columns using loc when using the row and column names
> * Select specific rows and/or columns using iloc when using the positions in the table
> * You can assign new values to a selection based on loc/iloc.

In [None]:
https://pandas.pydata.org/docs/getting_started/intro_tutorials/04_plotting.html