### Data

In [1]:
import pandas as pd 
url = 'https://github.com/arunadas/effective-pandas/raw/main/data/vehicles.csv.zip'
df = pd.read_csv(url,dtype='unicode')
city_mpg = df.city08.astype(int)
highway_mpg = df.highway08.astype(int)
make = df.make

In [2]:
make

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [3]:
make.astype('string')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: string

In [4]:
# saves memory and performance 
make.astype('category')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: category
Categories (136, object): ['AM General', 'ASC Incorporated', 'Acura', 'Alfa Romeo', ..., 'Volvo', 'Wallace Environmental', 'Yugo', 'smart']

### .str Accessor 

In [5]:
'Ford'.lower()

'ford'

In [6]:
make.str.lower()

0        alfa romeo
1           ferrari
2             dodge
3             dodge
4            subaru
            ...    
41139        subaru
41140        subaru
41141        subaru
41142        subaru
41143        subaru
Name: make, Length: 41144, dtype: object

In [7]:
'Alfa Romeo'.find('A')

0

In [8]:
make.str.find('A')

0        0
1       -1
2       -1
3       -1
4       -1
        ..
41139   -1
41140   -1
41141   -1
41142   -1
41143   -1
Name: make, Length: 41144, dtype: int64

In [11]:
data = pd.Series(['suz','john','fred','george'],
                 index = [1,2,3,4])

In [12]:
data.str.capitalize()

1       Suz
2      John
3      Fred
4    George
dtype: object

In [13]:
data.str.find('e')

1   -1
2   -1
3    2
4    1
dtype: int64

In [18]:
data.str.extract(r'([a-e])',expand=False)

1    NaN
2    NaN
3      e
4      e
dtype: object

In [19]:
data.str.startswith('f')

1    False
2    False
3     True
4    False
dtype: bool

### Searching

In [20]:
make.str.extract(r'([^a-z A-Z])')

Unnamed: 0,0
0,
1,
2,
3,
4,
...,...
41139,
41140,
41141,
41142,


In [24]:
(make
   .str.extract(r'([^a-z A-Z])', expand = False)
   .value_counts())
# use r'([^0-9.])

make
-    1727
.      46
,       9
Name: count, dtype: int64

### Splitting

In [26]:
age = pd.Series(['0-10', '11-15', '11-15', '61-65', '46-50'])
age

0     0-10
1    11-15
2    11-15
3    61-65
4    46-50
dtype: object

In [27]:
age.str.split('-')

0     [0, 10]
1    [11, 15]
2    [11, 15]
3    [61, 65]
4    [46, 50]
dtype: object

In [30]:
# lower end of the age 
age.str.split('-' , expand = True).iloc[:,0].astype(int)

0     0
1    11
2    11
3    61
4    46
Name: 0, dtype: int64

In [31]:
age.str.slice(-2).astype(int)

0    10
1    15
2    15
3    65
4    50
dtype: int64

In [34]:
age.str[-2:].astype(int)

0    10
1    15
2    15
3    65
4    50
dtype: int64

In [36]:
age.str.split('-', expand = True).astype(int).mean(axis='columns')

0     5.0
1    13.0
2    13.0
3    63.0
4    48.0
dtype: float64

In [37]:
import random 
def between(row):
    return random.randint(*row.values)

age.str.split('-', expand=True).astype(int).apply(between, axis='columns')

0     2
1    15
2    12
3    61
4    50
dtype: int64

### Optimizing apply with cython

In [39]:
%load_ext Cython

In [41]:
%%cython
import random
cpdef int between_cy3(int x , int y):
    return random.randint(x,y)

Content of stderr:
 3964 |                 module = PyImport_ImportModuleLevelObject(
      |                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 3964 |                 module = PyImport_ImportModuleLevelObject(
      |                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [42]:
age.str.split('-', expand=True).astype(int).apply(lambda row: between_cy3(row[0], row[1]), axis=1)

0    10
1    15
2    15
3    63
4    50
dtype: int64

In [43]:
%prun -l 10 (age.str.split('-', expand=True).astype(int).apply(lambda row: between_cy3(row[0], row[1]), axis=1))

 

         1086 function calls (1070 primitive calls) in 0.003 seconds

   Ordered by: internal time
   List reduced from 249 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
      2/1    0.001    0.000    0.001    0.001 series.py:389(__init__)
        1    0.000    0.000    0.003    0.003 {built-in method builtins.exec}
        1    0.000    0.000    0.000    0.000 object_array.py:46(_str_map)
      251    0.000    0.000    0.000    0.000 {built-in method builtins.isinstance}
        1    0.000    0.000    0.001    0.001 accessor.py:255(_wrap_result)
        1    0.000    0.000    0.000    0.000 frame.py:694(__init__)
       10    0.000    0.000    0.000    0.000 series.py:1095(__getitem__)
        4    0.000    0.000    0.000    0.000 construction.py:517(sanitize_array)
        1    0.000    0.000    0.000    0.000 {method 'reduce' of 'numpy.ufunc' objects}
        1    0.000    0.000    0.000    0.000 apply.py:1070(apply_series_gen

In [49]:
%%cython
cimport numpy as np
import numpy as np
import random
cpdef np.ndarray[int] apply_between_cy4(np.ndarray[int] x, np.ndarray[int] y):
    cdef np.ndarray[int] res = np.empty(len(x), dtype='int32') 
    for i in range(len(x)):
       res[i] = random.randint(x[i], y[i])
    return res    

Content of stderr:
In file included from /Users/arunadas/.ipython/cython/_cython_magic_b696d7cb266d93584e388dc56942712c65ff4a27.c:1254:
In file included from /Users/arunadas/Library/Python/3.9/lib/python/site-packages/numpy/_core/include/numpy/arrayobject.h:5:
In file included from /Users/arunadas/Library/Python/3.9/lib/python/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12:
In file included from /Users/arunadas/Library/Python/3.9/lib/python/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1909:
      |  ^
 7713 |                 module = PyImport_ImportModuleLevelObject(
      |                          ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /Users/arunadas/.ipython/cython/_cython_magic_b696d7cb266d93584e388dc56942712c65ff4a27.c:1254:
In file included from /Users/arunadas/Library/Python/3.9/lib/python/site-packages/numpy/_core/include/numpy/arrayobject.h:5:
In file included from /Users/arunadas/Library/Python/3.9/lib/python/site-packages/numpy/_core/in

In [50]:
(age
  .str.split('-', expand=True)
  .astype(int)
  .pipe(lambda df_ : apply_between_cy4(df_.iloc[:,0].to_numpy(dtype='int32'),
               df_.iloc[:,1].to_numpy(dtype='int32')
                        ))
)

array([ 8, 14, 11, 63, 48], dtype=int32)

### Replacing Text

In [51]:
make.str.replace('A', 'Á')

0        Álfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [52]:
make.replace('A' , 'Á')

0        Alfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [53]:
make.replace({'Audi' : 'Áudi', 'Acura':'Ácura', 'Ashton Martin' : 'Áshton Martin',
              'Alfa Romeo':'Álfa Romeo'})

0        Álfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

In [55]:
#.str.replace for substring , use .replace replace complete string
make.replace('A', 'Á', regex=True)

0        Álfa Romeo
1           Ferrari
2             Dodge
3             Dodge
4            Subaru
            ...    
41139        Subaru
41140        Subaru
41141        Subaru
41142        Subaru
41143        Subaru
Name: make, Length: 41144, dtype: object

### Exercise 11.9

In [60]:
# Using a string column , lowecase the values 
brand = make[:10]
brand.str.lower()

0    alfa romeo
1       ferrari
2         dodge
3         dodge
4        subaru
5        subaru
6        subaru
7        toyota
8        toyota
9        toyota
Name: make, dtype: object

In [65]:
# Using a string column , slice out the first character [start:stop:step]
brand.str.slice(0,1)

0    A
1    F
2    D
3    D
4    S
5    S
6    S
7    T
8    T
9    T
Name: make, dtype: object

In [66]:
# Using a string column , slice out the last three character
brand.str.slice(-3)

0    meo
1    ari
2    dge
3    dge
4    aru
5    aru
6    aru
7    ota
8    ota
9    ota
Name: make, dtype: object

In [76]:
# Using a string column , create a series extracting the numeric value
data = pd.Series(["Product 123", "Item456", "Code 78X", "NoNumber"])
#data.str.extract(r'([0-9]+)', expand=False)
data.str.extract(r'(\d+)', expand=False)

0    123
1    456
2     78
3    NaN
dtype: object

In [78]:
# Using a string column , create a series extracting the non-ASCII value
data = pd.Series(["Hello!", "Café", "123", "你好", "ASCII only"])
# Extract non-ASCII values
data.str.extract(r'([^\x00-\x7F]+)', expand=False)

0    NaN
1      é
2    NaN
3     你好
4    NaN
dtype: object

In [87]:
# Using a string column , create a dataframe with the dummy columns for every character in the column
#brand.str.split(expand=True)

data = pd.Series(["abc", "bcd", "cde"])

# Split each string into characters and create a DataFrame
char_df = data.apply(lambda x: pd.Series(list(x)))

char_df
# Use get_dummies to create dummy columns
dummy_df = pd.get_dummies(char_df, prefix='', prefix_sep='')
dummy_df.astype(int)

Unnamed: 0,a,b,c,b.1,c.1,d,c.2,d.1,e
0,1,0,0,1,0,0,1,0,0
1,0,1,0,0,1,0,0,1,0
2,0,0,1,0,0,1,0,0,1
