In [1]:
import csv
import re

import numpy as np
import pandas as pd

In [3]:
# 6 x 3 frame of standard-normal draws. No seed is set in the visible cells,
# so the values may differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [4]:
# Inject missing values in place: column position 1, from row position 2 onward.
df.iloc[2:, 1] = np.nan

In [5]:
# Inject missing values in place: column position 2, from row position 4 onward.
df.iloc[4:, 2] = np.nan

In [6]:
# Show the frame with the NaN gaps that were just introduced.
df

Unnamed: 0,0,1,2
0,-0.511687,-1.372841,-0.20262
1,0.600023,0.01018,1.492413
2,-0.552378,,-0.547733
3,0.375947,,-0.206487
4,1.367625,,
5,-2.240437,,


In [7]:
# Forward-fill down each column: every NaN takes the last valid value above it.
# `ffill()` replaces `fillna(method="ffill")`, whose `method=` argument is
# deprecated in recent pandas. A new frame is returned; `df` is not modified.
df.ffill()

Unnamed: 0,0,1,2
0,-0.511687,-1.372841,-0.20262
1,0.600023,0.01018,1.492413
2,-0.552378,0.01018,-0.547733
3,0.375947,0.01018,-0.206487
4,1.367625,0.01018,-0.206487
5,-2.240437,0.01018,-0.206487


In [8]:
# `df` still contains its NaNs: the fill call returned a new frame and the
# result was not assigned back.
df

Unnamed: 0,0,1,2
0,-0.511687,-1.372841,-0.20262
1,0.600023,0.01018,1.492413
2,-0.552378,,-0.547733
3,0.375947,,-0.206487
4,1.367625,,
5,-2.240437,,


In [9]:
# Forward-fill across each row (left to right): a NaN takes the value from the
# nearest non-missing column to its left.
# `ffill()` replaces `fillna(method="ffill")`, whose `method=` argument is
# deprecated in recent pandas.
df.ffill(axis="columns")

Unnamed: 0,0,1,2
0,-0.511687,-1.372841,-0.20262
1,0.600023,0.01018,1.492413
2,-0.552378,-0.552378,-0.547733
3,0.375947,0.375947,-0.206487
4,1.367625,1.367625,1.367625
5,-2.240437,-2.240437,-2.240437


In [2]:
# 1000 x 4 frame of standard-normal draws. No seed is set in the visible cells,
# so the values may differ from run to run.
data = pd.DataFrame(np.random.standard_normal(size=(1000, 4)))

In [3]:
# Display the frame; the repr elides the middle rows with "...".
data

Unnamed: 0,0,1,2,3
0,-2.257305,-1.308817,0.311177,-0.202300
1,-1.097795,-0.729065,-0.716832,0.455603
2,-0.769573,-1.585383,-0.080424,1.109786
3,-1.433135,-1.665224,0.327516,0.029996
4,1.622097,0.575633,1.091361,0.720993
...,...,...,...,...
995,-2.621759,-0.100597,0.711006,-1.445024
996,0.818218,0.275038,0.214087,2.282824
997,0.044944,-0.345687,0.975721,0.467557
998,-0.245695,-0.059463,1.099623,-1.000808


In [6]:
# Comparing a DataFrame against a scalar yields another DataFrame.
type(data.abs() > 3)

pandas.core.frame.DataFrame

In [9]:
# Draw a random row; called with no arguments, a single row comes back.
data.sample()

Unnamed: 0,0,1,2,3
603,0.485164,-0.09558,-0.689332,0.52437


In [2]:
# One "name address" pair per line; there is no newline after the last line.
text = (
    "Dave dave@google.com\n"
    "Steve steve@gmail.com\n"
    "Rob rob@gmail.com\n"
    "Ryan ryan@yahoo.com"
)

In [3]:
# E-mail pattern with three capture groups:
#   1. username  - the part before "@"
#   2. domain    - the part between "@" and the final "."
#   3. suffix    - 2 to 4 letters after the final "."
# The character classes list uppercase letters only, so lowercase addresses
# match only when the pattern is used with re.IGNORECASE.
pattern = r"([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})"

In [6]:
# Compile once for reuse; IGNORECASE lets the uppercase-only classes in
# `pattern` match lowercase addresses too.
regex = re.compile(pattern, flags=re.IGNORECASE)

In [8]:
# Because the pattern has groups, each match comes back as a tuple of the
# captured pieces rather than as the whole matched string.
regex.findall(text)

[('dave', 'google', 'com'),
 ('steve', 'gmail', 'com'),
 ('rob', 'gmail', 'com'),
 ('ryan', 'yahoo', 'com')]

In [9]:
# \1, \2 and \3 in the replacement refer to the captured groups. Only the
# matched address is rewritten; the name before it is left as is.
regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text)

'Dave Username: dave, Domain: google, Suffix: com\nSteve Username: steve, Domain: gmail, Suffix: com\nRob Username: rob, Domain: gmail, Suffix: com\nRyan Username: ryan, Domain: yahoo, Suffix: com'

In [11]:
# print() is used here on purpose: it renders the embedded "\n" characters as
# real line breaks, whereas the bare expression shows the escaped repr.
print(regex.sub(r"Username: \1, Domain: \2, Suffix: \3", text))

Dave Username: dave, Domain: google, Suffix: com
Steve Username: steve, Domain: gmail, Suffix: com
Rob Username: rob, Domain: gmail, Suffix: com
Ryan Username: ryan, Domain: yahoo, Suffix: com


In [14]:
# Name -> e-mail address, built directly as a Series so `data` is never bound
# to a plain dict first. "Wes" has a missing value (NaN) instead of an address.
data = pd.Series(
    {
        "Dave": "dave@google.com",
        "Steve": "steve@gmail.com",
        "Rob": "rob@gmail.com",
        "Wes": np.nan,
    }
)

In [15]:
# Show the Series; the entry without an address appears as NaN.
data

Dave     dave@google.com
Steve    steve@gmail.com
Rob        rob@gmail.com
Wes                  NaN
dtype: object

In [29]:
# findall gives, per element, a list of (username, domain, suffix) tuples;
# keep the first tuple of each list. The NaN entry stays NaN.
matches = data.str.findall(pattern, flags=re.IGNORECASE).str.get(0)
# Position 1 of each tuple is the domain.
matches.str[1]

Dave     google
Steve     gmail
Rob       gmail
Wes         NaN
dtype: object

In [2]:
# Eight baskets: the four-fruit sequence repeated twice.
fruits = ["apple", "orange", "apple", "apple"] * 2
N = len(fruits)

# Fixed seed so the random columns come out the same on every run.
rng = np.random.default_rng(seed=12345)

# Keys are listed in the final column order, so no separate `columns=`
# argument is needed. "count" is still drawn before "weight", which keeps
# the sequence of values taken from `rng` unchanged.
df = pd.DataFrame(
    {
        "basket_id": np.arange(N),
        "fruit": fruits,
        "count": rng.integers(3, 15, size=N),
        "weight": rng.uniform(0, 4, size=N),
    }
)

df

Unnamed: 0,basket_id,fruit,count,weight
0,0,apple,11,1.564438
1,1,orange,5,1.331256
2,2,apple,12,2.393235
3,3,apple,6,0.746937
4,4,apple,5,2.691024
5,5,orange,12,3.767211
6,6,apple,10,0.992983
7,7,apple,11,3.795525


In [3]:
# Convert the string column to categorical dtype; the original column in `df`
# is left untouched because astype returns a new Series.
fruit_cat = df["fruit"].astype("category")
fruit_cat

0     apple
1    orange
2     apple
3     apple
4     apple
5    orange
6     apple
7     apple
Name: fruit, dtype: category
Categories (2, object): ['apple', 'orange']

In [6]:
# The converted column is still an ordinary Series; only its dtype changed.
type(fruit_cat)

pandas.core.series.Series

In [7]:
# `.array` exposes the Categorical object that backs the Series.
type(fruit_cat.array)

pandas.core.arrays.categorical.Categorical

In [9]:
# Show the backing Categorical: the values plus the list of categories.
fruit_cat.array

['apple', 'orange', 'apple', 'apple', 'apple', 'orange', 'apple', 'apple']
Categories (2, object): ['apple', 'orange']

In [11]:
# Keep a handle on the backing Categorical for the cells that follow.
c = fruit_cat.array
# The distinct category labels are held in a pandas Index.
type(c.categories)

pandas.core.indexes.base.Index

In [14]:
# The distinct labels: two categories for eight rows.
c.categories

Index(['apple', 'orange'], dtype='object')

In [12]:
# The per-row codes are a plain NumPy array.
type(c.codes)

numpy.ndarray

In [15]:
# One small integer per row, indexing into `c.categories`
# (0 -> 'apple', 1 -> 'orange').
c.codes

array([0, 1, 0, 0, 0, 1, 0, 0], dtype=int8)

In [16]:
# The dtype carries the category list and whether the categories are ordered
# (here they are not).
c.dtype

CategoricalDtype(categories=['apple', 'orange'], ordered=False)