In [1]:
import numpy as np
import pandas as pd

In [3]:
# Build a DataFrame from the dictionary below.
# Each column deliberately contains one missing value so later cells can
# demonstrate NaN handling. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
data = {
    "grammer": ['Python', 'C', 'Java', 'GO', np.nan, 'SQL', 'PHP', 'Python'],
    "score": [1.0, 2.0, np.nan, 4.0, 5.0, 6.0, 7.0, 10.0],
}
df = pd.DataFrame(data)
df

Unnamed: 0,grammer,score
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [4]:
# Select the rows whose 'grammer' value is the string 'Python'.
# This is an exact equality test; rows with a missing 'grammer' compare
# unequal and are therefore left out.
df.loc[
    df['grammer'] == 'Python'
]

Unnamed: 0,grammer,score
0,Python,1.0
7,Python,10.0


In [5]:
# Method 2: substring match through the .str accessor.
# na=False makes missing values count as "no match", so the mask comes back
# as plain booleans and can index the frame directly, with no separate
# fillna step on the mask.
# NOTE: the variable name (sic) is kept unchanged in case other cells use it.
resluts = df['grammer'].str.contains('Python', na=False)
df[resluts]

Unnamed: 0,grammer,score
0,Python,1.0
7,Python,10.0


In [6]:
# Show all column names of df.
df.columns

Index(['grammer', 'score'], dtype='object')

In [7]:
# Rename the second column ('score') to 'popularity'.
# rename() returns a new frame, which is bound back to `df`; this avoids
# mutating the object in place.
df = df.rename(columns={'score': 'popularity'})

In [8]:
# Display the current contents of df.
df

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [9]:
# Count how many times each programming language appears in the 'grammer' column.
# value_counts() leaves out missing values by default, so the NaN row is not counted.
df['grammer'].value_counts()

Python    2
C         1
Java      1
GO        1
SQL       1
PHP       1
Name: grammer, dtype: int64

In [10]:
# Fill the missing 'popularity' values by linear interpolation between the
# neighbouring rows.
# Series.interpolate() returns the existing (non-missing) values unchanged and
# only fills the gaps, so its result can be assigned directly; wrapping it in
# fillna() adds nothing.
df['popularity'] = df['popularity'].interpolate()
df

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [11]:
# Select the rows whose 'popularity' is strictly greater than 3.
df.loc[
    df['popularity'] > 3
]


Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [12]:
# Remove rows whose 'grammer' value repeats an earlier row, keeping the first
# occurrence. The result is a new frame; df itself is not modified.
df.drop_duplicates(subset=['grammer'], keep='first')


Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0


In [13]:
# Compute the mean of the 'popularity' column.
df['popularity'].mean()


4.75

In [14]:
# Convert the 'grammer' column to a Python list (missing values stay as nan).
df['grammer'].to_list()


['Python', 'C', 'Java', 'GO', nan, 'SQL', 'PHP', 'Python']

In [15]:
# Show the number of rows and columns as a (rows, columns) tuple.
df.shape

(8, 2)

In [16]:
# Select the rows whose 'popularity' lies strictly between 3 and 7
# (both bounds excluded).
df.loc[
    (df['popularity'] > 3)
    & (df['popularity'] < 7)
]


Unnamed: 0,grammer,popularity
3,GO,4.0
4,,5.0
5,SQL,6.0


In [17]:
# Swap the positions of the two columns.
# Method 1, step 1: take the 'popularity' column out of the frame.
# pop() returns the column and removes it from df in place. That is the same
# effect as reading df['popularity'] and then calling
# drop(labels=['popularity'], axis=1, inplace=True), where axis=1 means the
# labels refer to columns (axis=0 would refer to rows).
temp = df.pop('popularity')

In [18]:
# Method 1, step 2: put the saved column back as the first column (position 0).
# insert() modifies df in place and raises if 'popularity' already exists,
# so this cell must run after the column has been removed.
df.insert(loc=0, column='popularity', value=temp)
df

Unnamed: 0,popularity,grammer
0,1.0,Python
1,2.0,C
2,3.0,Java
3,4.0,GO
4,5.0,
5,6.0,SQL
6,7.0,PHP
7,10.0,Python


In [19]:
# Method 2: build the column labels in swapped order (position 1 first, then
# position 0) and select the columns in that order.
cols = df.columns[[1, 0]]
df = df.loc[:, cols]
df

Unnamed: 0,grammer,popularity
0,Python,1.0
1,C,2.0
2,Java,3.0
3,GO,4.0
4,,5.0
5,SQL,6.0
6,PHP,7.0
7,Python,10.0


In [20]:
# Select the row(s) where 'popularity' equals the column maximum.
# If several rows share the maximum, all of them are returned.
df.loc[
    df['popularity'] == df['popularity'].max()
]

Unnamed: 0,grammer,popularity
7,Python,10.0
