In [2]:
import numpy as np
import pandas as pd

  from pandas.core.computation.check import NUMEXPR_INSTALLED


# 扩展数据类型

In [3]:
# A None among integers is stored as NaN, so the Series falls back to float64.
s = pd.Series([1,2,3,None])
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [4]:
# Pass an *instance* of the nullable-integer extension dtype. Handing pandas the
# bare class (pd.Int64Dtype) makes it instantiate the dtype itself and emit a warning.
# With this dtype the values stay integers and the missing entry is shown as <NA>.
s = pd.Series([1, 2, 3, None], dtype=pd.Int64Dtype())
s

  s = pd.Series([1,2,3,None],dtype=pd.Int64Dtype)


0       1
1       2
2       3
3    <NA>
dtype: Int64

In [5]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [6]:
s[3] is pd.NA

True

In [7]:
# The string alias 'Int64' (capital I) selects the same nullable integer dtype.
s = pd.Series([1,2,3,None],dtype='Int64')
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [8]:
# NOTE(review): 'None' here is a literal string, not a missing value, so it is
# displayed as None rather than <NA> — confirm whether a real None was intended.
s = pd.Series(['one','two','None','three'],dtype=pd.StringDtype())
s

0      one
1      two
2     None
3    three
dtype: string

# 字符串操作

## Python内置方法

In [9]:
val = "a,b, guido"

In [10]:
val.split(",")

['a', 'b', ' guido']

In [11]:
# Splitting on "," leaves surrounding whitespace (e.g. ' guido'); strip each piece.
pieces = [x.strip() for x in val.split(",")]

pieces

['a', 'b', 'guido']

In [12]:
# Unpack the three pieces and glue them together by hand with "::" as the separator.
a,b,c = pieces
a+"::"+b+"::"+c

'a::b::guido'

In [13]:
"::".join(pieces)

'a::b::guido'

In [14]:
val.index(",")

1

In [15]:
# ":" does not occur in val, so find reports -1 instead of a position.
val.find(":")

-1

In [16]:
val.count(",")

2

In [17]:
val.replace(",","::")

'a::b:: guido'

In [18]:
val.replace(","," ")

'a b  guido'

## 正则表达式

In [19]:
import re

In [20]:
text = "foo     bar\t baz   \tqux"

In [21]:
# Split on runs of one or more whitespace characters (spaces and tabs in this text).
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

In [22]:
# Compile the whitespace pattern once so the compiled object can be reused (regex.split below).
regex = re.compile(r"\s+")

In [23]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [24]:
text = """Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com"""

In [25]:
# Email matcher: local part, "@", domain, then a 2-4 letter suffix.
# The character classes only list A-Z, so the pattern is compiled case-insensitively.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"
regex = re.compile(pattern, re.IGNORECASE)

In [26]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [27]:
# search yields a match object for the first address found in the text only.
m = regex.search(text)
m

<re.Match object; span=(5, 20), match='dave@google.com'>

In [28]:
text[m.start():m.end()]

'dave@google.com'

In [29]:
print(regex.sub("REAA",text))

Dave REAA
Steve REAA
Rob REAA
Ryan REAA


In [30]:
# Same email matcher, now with three capture groups: username, domain name, suffix.
# Compiled case-insensitively because the character classes only list A-Z.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
regex = re.compile(pattern, re.IGNORECASE)

In [31]:
# groups() returns the text captured by the three parenthesised groups as a tuple.
m = regex.match("wesm@bright.net")
m.groups()

('wesm', 'bright', 'net')

In [32]:
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [33]:
# \1, \2 and \3 in the replacement insert the text captured by groups 1, 2 and 3.
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3",text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


## pandas的字符串函数

In [34]:
# Build the Series straight from the name -> email mapping, so `data` is only
# ever bound to a Series. Wes has no address, recorded as NaN.
data = pd.Series({
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
})
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [35]:
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [36]:
# On this object-dtype Series the missing entry comes back as NaN, so the result
# is an object Series rather than a pure boolean one.
data.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [37]:
# Convert to the 'string' extension dtype; the missing value is then shown as <NA>.
data_as_string_ext = data.astype('string')

data_as_string_ext

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [38]:
data_as_string_ext.str.contains("gmail")

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [39]:
# Three-group email pattern applied element-wise: each row gets a list of
# (username, domain, suffix) tuples, and the missing row stays NaN.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

In [40]:
# .str[0] takes the first match tuple from each row's findall list; .str.get(1)
# then picks element 1 of that tuple (the domain group). Rows with no list stay NaN.
# A bare `matches` in the middle of the cell displayed nothing, so it is dropped.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str[0]
matches.str.get(1)

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [41]:
data.str[:5]

Dave     dave@
Steve    steve
Rob      rob@g
Wes        NaN
dtype: object

In [42]:
# extract returns a DataFrame with one column per capture group (0, 1, 2);
# the row without an address is entirely missing.
dddd = data.str.extract(pattern, flags=re.IGNORECASE)

In [43]:
dddd

Unnamed: 0,0,1,2
Dave,dave,google,com
Steve,steve,gmail,com
Rob,rob,gmail,com
Wes,,,


# 分类数据

In [44]:
values = pd.Series(['apple','orange','apple','apple']*2)
values

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
dtype: object

In [45]:
pd.unique(values)

array(['apple', 'orange'], dtype=object)

In [46]:
values = pd.Series([0,1,0,0]*2)

In [47]:
dim = pd.Series(['apple','orange'])

In [48]:
values

0    0
1    1
2    0
3    0
4    0
5    1
6    0
7    0
dtype: int64

In [49]:
dim

0     apple
1    orange
dtype: object

In [51]:
# Use each integer in `values` as a position into `dim`, recovering the fruit labels.
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [52]:
fruits = ['apple','orange','apple','apple'] * 2

In [53]:
N = len(fruits)

In [54]:
rng = np.random.default_rng(seed=12345)

In [55]:
# Keys are written in the final column order, so no separate `columns=` list is needed.
# `count` is still drawn before `weight`, keeping the sequence of draws from `rng`.
df = pd.DataFrame(
    {
        "basket_id": np.arange(N),
        "fruit": fruits,
        "count": rng.integers(3, 15, size=N),
        "weight": rng.uniform(0, 4, size=N),
    }
)

In [56]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [57]:
# Convert the fruit column to the categorical dtype (two categories: apple, orange).
fruit_cat = df['fruit'].astype('category')

In [58]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [59]:
# Grab the array backing the categorical Series; it exposes .categories and .codes.
c = fruit_cat.array

In [60]:
c.categories

Index(['apple', 'orange'], dtype='object')

In [61]:
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [62]:
# Map each integer code to the category label it stands for.
dict(enumerate(c.categories))

{0: 'apple', 1: 'orange'}