In [1]:
import numpy as np
import pandas as pd

## 文件操作

In [25]:
# Build a five-row demo frame with integer, single-character string, float,
# colour-name string and daily datetime columns.
data = pd.DataFrame(dict(
    c1=np.arange(5),
    c2=list('abcde'),
    c3=np.linspace(0, 1, num=5),
    c4=['red', 'yellow', 'blue', 'green', 'black'],
    c5=np.arange('2021-12-25', '2021-12-30', dtype=np.datetime64),
))
# Write the frame to disk without the row index; the read examples load this file.
data.to_csv('test.csv', index=False)

data

Unnamed: 0,c1,c2,c3,c4,c5
0,0,a,0.0,red,2021-12-25
1,1,b,0.25,yellow,2021-12-26
2,2,c,0.5,blue,2021-12-27
3,3,d,0.75,green,2021-12-28
4,4,e,1.0,black,2021-12-29


文件读取

- `pd.read_csv(filepath_or_buffer, sep=',', header='infer', names=None, index_col=None, usecols=None, dtype=None, skiprows=None, nrows=None, parse_dates=False, encoding=None)`
- `pd.read_table(...)`:参数与 `pd.read_csv` 基本相同,但默认分隔符为制表符(`sep='\t'`)
- `pd.read_excel(io, sheet_name=0, header=0, names=None, index_col=None, usecols=None, dtype=None, skiprows=None, nrows=None, parse_dates=False)`

In [46]:
# Read the CSV back with default options; the first line supplies the column names.
df = pd.read_csv('test.csv')
df

Unnamed: 0,c1,c2,c3,c4,c5
0,0,a,0.0,red,2021-12-25
1,1,b,0.25,yellow,2021-12-26
2,2,c,0.5,blue,2021-12-27
3,3,d,0.75,green,2021-12-28
4,4,e,1.0,black,2021-12-29


In [47]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   c1      5 non-null      int64  
 1   c2      5 non-null      object 
 2   c3      5 non-null      float64
 3   c4      5 non-null      object 
 4   c5      5 non-null      object 
dtypes: float64(1), int64(1), object(3)
memory usage: 328.0+ bytes


In [29]:
# Do not treat the first row as the header: the header line is read as an
# ordinary data row and the columns get integer labels instead.
pd.read_csv('test.csv', header=None)

Unnamed: 0,0,1,2,3,4
0,c1,c2,c3,c4,c5
1,0,a,0.0,red,2021-12-25
2,1,b,0.25,yellow,2021-12-26
3,2,c,0.5,blue,2021-12-27
4,3,d,0.75,green,2021-12-28
5,4,e,1.0,black,2021-12-29


In [30]:
# Use several columns as the index (here c1 and c2 together form the row index).
pd.read_csv('test.csv', index_col=['c1', 'c2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,c3,c4,c5
c1,c2,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,a,0.0,red,2021-12-25
1,b,0.25,yellow,2021-12-26
2,c,0.5,blue,2021-12-27
3,d,0.75,green,2021-12-28
4,e,1.0,black,2021-12-29


In [32]:
# Read only a subset of the columns.
pd.read_csv('test.csv', usecols=['c1', 'c2'])

Unnamed: 0,c1,c2
0,0,a
1,1,b
2,2,c
3,3,d
4,4,e


In [40]:
# Parse timestamps: convert column c5 to datetime while reading.
# NOTE(review): this rebinds `df`, so every later cell that uses `df` sees the
# date-parsed frame when the notebook is run top to bottom.
df = pd.read_csv('test.csv', parse_dates=['c5'])
df

Unnamed: 0,c1,c2,c3,c4,c5
0,0,a,0.0,red,2021-12-25
1,1,b,0.25,yellow,2021-12-26
2,2,c,0.5,blue,2021-12-27
3,3,d,0.75,green,2021-12-28
4,4,e,1.0,black,2021-12-29


In [44]:
# Summarize the re-read frame to check the dtype of c5 after parse_dates.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   c1      5 non-null      int64         
 1   c2      5 non-null      object        
 2   c3      5 non-null      float64       
 3   c4      5 non-null      object        
 4   c5      5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1), object(2)
memory usage: 328.0+ bytes


In [48]:
# Read only the first few rows (nrows=2 stops after two data rows).
pd.read_csv('test.csv', nrows=2)

Unnamed: 0,c1,c2,c3,c4,c5
0,0,a,0.0,red,2021-12-25
1,1,b,0.25,yellow,2021-12-26


文件写入

- `DataFrame.to_csv(path_or_buf=None, sep=',', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, mode='w', encoding=None, compression='infer', quoting=None, quotechar='"', line_terminator=None, chunksize=None, date_format=None, doublequote=True, escapechar=None, decimal='.', errors='strict', storage_options=None)`
- `DataFrame.to_excel(excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, merge_cells=True, encoding=None, inf_rep='inf', verbose=True, freeze_panes=None, storage_options=None)`
- `DataFrame.to_markdown(buf=None, mode='wt', index=True)`
- `DataFrame.to_latex(buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, bold_rows=False, column_format=None, longtable=None, escape=None, encoding=None, decimal='.', multicolumn=None, multicolumn_format=None, multirow=None, caption=None, label=None, position=None)`

In [None]:
# Do not save the index.
# Note: this writes to 'test.csv', the same file the read examples load.
df.to_csv('test.csv', index=False)

In [None]:
# Save as a .txt file, using a tab as the field separator.
df.to_csv('test.txt', sep='\t', index=False)

In [53]:
# Convert to a Markdown table (index column omitted) and print the resulting text.
print(df.to_markdown(index=False))

|   c1 | c2   |   c3 | c4     | c5         |
|-----:|:-----|-----:|:-------|:-----------|
|    0 | a    | 0    | red    | 2021-12-25 |
|    1 | b    | 0.25 | yellow | 2021-12-26 |
|    2 | c    | 0.5  | blue   | 2021-12-27 |
|    3 | d    | 0.75 | green  | 2021-12-28 |
|    4 | e    | 1    | black  | 2021-12-29 |


In [55]:
# Convert to a LaTeX tabular (index column omitted) and print the resulting text.
print(df.to_latex(index=False))

\begin{tabular}{rlrll}
\toprule
 c1 & c2 &   c3 &     c4 &         c5 \\
\midrule
  0 &  a & 0.00 &    red & 2021-12-25 \\
  1 &  b & 0.25 & yellow & 2021-12-26 \\
  2 &  c & 0.50 &   blue & 2021-12-27 \\
  3 &  d & 0.75 &  green & 2021-12-28 \\
  4 &  e & 1.00 &  black & 2021-12-29 \\
\bottomrule
\end{tabular}



## 数据结构

### Series

### DataFrame