In [1]:
import pandas as pd
import numpy as np

In [2]:
# The None in an otherwise integer list is stored as NaN, so the Series
# comes out with dtype float64 rather than an integer dtype.
s = pd.Series([1, 2, 3, None])

In [3]:
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [4]:
s.dtype

dtype('float64')

In [5]:
# Same values, this time requesting the nullable integer extension dtype
# through its string alias: the integers stay integers and the missing
# entry is shown as <NA>.
s = pd.Series([1, 2, 3, None], dtype="Int64")
s

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [6]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [7]:
# String extension dtype, requested through its string alias; the None
# entry becomes a missing value instead of a Python object.
s = pd.Series(['one', 'two', None, 'three'], dtype="string")

In [8]:
s

0      one
1      two
2     <NA>
3    three
dtype: string

Extension types can be passed to the Series `astype` method, allowing you to convert
easily as part of your data cleaning process:

In [9]:
# Small demo frame with exactly one missing value in each column.
df = pd.DataFrame(
    {
        "A": [1, 2, None, 4],
        "B": ["one", "two", "three", None],
        "C": [False, None, False, True],
    }
)

In [10]:
# Convert column A to the nullable Int64 extension dtype (given by its
# string alias), overwriting the column on the existing frame.
df['A'] = df['A'].astype('Int64')

## String Manipulation

The `re` module functions fall into three categories: pattern matching, substitution,
and splitting.


In [11]:
import re

In [12]:
# Sample string whose words are separated by irregular runs of spaces and tabs.
text = "foo    bar\t baz  \tqux"

In [14]:
re.split(r"\s+", text)

['foo', 'bar', 'baz', 'qux']

Creating a regex object with `re.compile` is highly recommended if you intend to
apply the same expression to many strings.

In [16]:
# Compile the whitespace pattern once; the compiled object can then be
# reused for splitting without passing the pattern again.
regex = re.compile(r"\s+")

In [17]:
regex.split(text)

['foo', 'bar', 'baz', 'qux']

In [19]:
# Sample block of "name address" lines to search.
# NOTE: the second to fourth lines begin with a space that is part of the
# string itself; it shows up in any output derived from `text`.
text = """Dave dave@google.com
 Steve steve@gmail.com
 Rob rob@gmail.com
 Ryan ryan@yahoo.com"""
# Email pattern: local part, "@", domain, ".", then a 2-4 letter suffix.
# The character classes list only uppercase letters, so the pattern is
# meant to be compiled with re.IGNORECASE.
pattern = r"[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}"

In [20]:
# Compile the email pattern case-insensitively. This rebinds `regex`,
# which previously held the whitespace-splitting pattern.
regex = re.compile(pattern, flags=re.IGNORECASE)

In [21]:
regex.findall(text)

['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']

In [23]:
# search() yields a match object for the first address found in the text only.
first = regex.search(text)

In [24]:
text[first.start():first.end()]

'dave@google.com'

In [25]:
# Replace every matched address with the literal REDACTED; print() is used
# so the newlines in the result are rendered as separate lines.
print(regex.sub("REDACTED", text))

Dave REDACTED
 Steve REDACTED
 Rob REDACTED
 Ryan REDACTED


##  String Functions in pandas

In [26]:
# Name -> email lookup; "Wes" has no address and is recorded as NaN.
data = {
    "Dave": "dave@google.com",
    "Steve": "steve@gmail.com",
    "Rob": "rob@gmail.com",
    "Wes": np.nan,
}

In [29]:
# Rebind `data` from the dict to a Series; the dict keys become the index.
data = pd.Series(data)

In [30]:
data.isna()

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

In [31]:
data.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes        NaN
dtype: object

In [32]:
# Cast to the string extension dtype; the missing entry is then shown
# as <NA> instead of NaN.
data_as_str = data.astype('string')
data_as_str

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                 <NA>
dtype: string

In [33]:
data_as_str.str.contains('gmail')

Dave     False
Steve     True
Rob       True
Wes       <NA>
dtype: boolean

In [34]:
# Email pattern with three capture groups: the part before "@", the domain
# name, and the 2-4 letter suffix after the final dot.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [35]:
data.str.findall(pattern, flags=re.IGNORECASE)

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

![str methods](Assets/str_methods.png)

##  Categorical Data

In [37]:
# Eight integer positions (each 0 or 1) to be looked up in a separate
# Series of distinct values.
values = pd.Series([0, 1, 0, 0] * 2)

In [38]:
# The distinct values: position 0 is 'apple', position 1 is 'orange'.
dim = pd.Series(['apple', 'orange'])

In [39]:
dim.take(values)

0     apple
1    orange
0     apple
0     apple
0     apple
1    orange
0     apple
0     apple
dtype: object

In [42]:
# Eight baskets of fruit; counts and weights come from a seeded generator
# so the frame is reproducible.
fruits = ['apple', 'orange', 'apple', 'apple'] * 2
N = len(fruits)
rng = np.random.default_rng(seed=12345)

# Keys are listed in the final column order. 'count' is still drawn from
# the generator before 'weight', which keeps the random values unchanged.
df = pd.DataFrame(
    {
        'basket_id': np.arange(N),
        'fruit': fruits,
        'count': rng.integers(3, 15, size=N),
        'weight': rng.uniform(0, 4, size=N),
    }
)

In [43]:
df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [44]:
df['fruit']

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: object

In [46]:
# Convert the fruit column to categorical dtype; the result has two
# categories, 'apple' and 'orange'.
fruit_cat = df['fruit'].astype('category')
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']