# 12. Python Advanced Modules

## 1. Counter

A dictionary subclass that counts the number of occurrences of each element in an iterable!

In [1]:
from collections import Counter

In [2]:
l = [1,11,2,3,4,5,6,3]

In [3]:
Counter(l)

Counter({1: 1, 11: 1, 2: 1, 3: 2, 4: 1, 5: 1, 6: 1})

In [4]:
s = 'dsghfahstugahvkadshjthgsjbvcal'
Counter(s)

Counter({'d': 2,
         's': 4,
         'g': 3,
         'h': 5,
         'f': 1,
         'a': 4,
         't': 2,
         'u': 1,
         'v': 2,
         'k': 1,
         'j': 2,
         'b': 1,
         'c': 1,
         'l': 1})

In [6]:
# How many times does each word show up in sentence?
s = "How many times does each word show up in sentence up up"
Counter(s.split())

Counter({'How': 1,
         'many': 1,
         'times': 1,
         'does': 1,
         'each': 1,
         'word': 1,
         'show': 1,
         'up': 3,
         'in': 1,
         'sentence': 1})

In [7]:
c = Counter(s.split())

In [8]:
c.most_common(2)

[('up', 3), ('How', 1)]

### Common patterns when using the Counter() object

In [None]:
sum(c.values())                   # total count
c.clear()                         # reset counts
list(c), set(c), dict(c)          # Convert to list, set or dict
c.items()                         # Convert to list of (elem, cnt) pair
Counter(dict(list_of_pairs))
c.most_common()[:-n-1:-1]         # n least common elements
c += Counter()                    # Remove zero and negative counts

## 2. defaultdict

A dictionary-like object that provides all the methods of a regular dictionary, but takes a callable (`default_factory`) as its first argument. That callable is used to produce the default value for missing keys.

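**Looking up a missing key with `d[key]` on a defaultdict does not raise a KeyError (as long as a default factory was given). Any non-existing key gets the value returned by the default factory, and that key is added to the dictionary**. This is a faster approach than using dict.setdefault()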

In [15]:
from collections import defaultdict

In [11]:
d = {'k1':1}

In [12]:
d['k1']

1

In [13]:
# Ask for a non-existing key
d['k2']

KeyError: 'k2'

In [16]:
# Using default dictionary
d = defaultdict(object)

In [17]:
d['one']

<object at 0x1004ecbd0>

In [18]:
for item in d:
    print(item)

one


In [22]:
# Using a factory that returns 0 means every missing key
# gets the value 0 the first time it is looked up
d = defaultdict(lambda : 0)

In [23]:
d['one']

0

In [24]:
d['two']

0

In [25]:
d

defaultdict(<function __main__.<lambda>()>, {'one': 0, 'two': 0})

## 3. OrderedDict

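It's a dictionary subclass that **remembers the order in which its contents are added**. Since Python 3.7 a regular `dict` also preserves insertion order, which is why both examples below print the keys in the same order; the remaining practical difference is the order-sensitive equality described in the note.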

#### NOTE: two normal dictionaries are the same iff they hold the same key-value pairs, but in order for two OrderedDict to be equal, they must retain the same order too.

In [42]:
d = {}
d['a'] = 1
d['b'] = 2
d['c'] = 3
d['d'] = 4
d['e'] = 5

In [43]:
d

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}

In [44]:
for k,v in d.items():
    print (k,v)

a 1
b 2
c 3
d 4
e 5


In [45]:
from collections import OrderedDict

In [46]:
d = OrderedDict()

In [47]:
d['a'] = 1
d['b'] = 2
d['c'] = 3
d['d'] = 4
d['e'] = 5

for k,v in d.items():
    print (k,v)

a 1
b 2
c 3
d 4
e 5


## 4. namedtuple

It's always difficult to remember exactly **which index** refers to a specific element and this may lead to errors. **namedtuple** assigns *names* as well as numerical index to **each member in the tuple**.

In [50]:
t = (1,2,3)

In [51]:
t[0]

1

In [52]:
from collections import namedtuple

With the **namedtuple** factory function (from the `collections` module) we create a simple class by using the following syntax:

```
Class = namedtuple('Class', 'attribute1 attribute2 ... attribute3')
```

In [53]:
Dog = namedtuple('Dog','age breed name')

In [54]:
sam = Dog(age=2,breed='Lab', name='Canela')

In [55]:
sam

Dog(age=2, breed='Lab', name='Canela')

In [56]:
sam.age

2

In [57]:
Cat = namedtuple('Cat','fur claws name')

In [58]:
cat1 = Cat(fur = "Fuzzy", claws = False, name="Luna")

In [60]:
cat1.name

'Luna'

In [61]:
cat1[2]

'Luna'

## 5. Datetime

In [62]:
import datetime

We create a time object with the syntax `datetime.time(hours, mins, secs, ...)`

In [66]:
t = datetime.time(5,25,1)

In [67]:
print(t)

05:25:01


In [68]:
t.hour

5

In [69]:
datetime.time

datetime.time

In [70]:
print(datetime.time.min)

00:00:00


In [71]:
print(datetime.time.max)

23:59:59.999999


In [73]:
print (datetime.time.resolution)

0:00:00.000001


In [74]:
# Now let's take a look at dates

In [75]:
today = datetime.date.today()

In [76]:
print(today)

2019-02-04


In [78]:
today.timetuple()

time.struct_time(tm_year=2019, tm_mon=2, tm_mday=4, tm_hour=0, tm_min=0, tm_sec=0, tm_wday=0, tm_yday=35, tm_isdst=-1)

In [79]:
d1 = datetime.date(2015,3,11)

In [80]:
print(d1)

2015-03-11


In [81]:
d2 = d1.replace(year=1990)

In [82]:
print(d2)

1990-03-11


In [83]:
print(d1-d2)

9131 days, 0:00:00


## 6. Python Debugger

In [84]:
# Python Debugger
import pdb

In [90]:
x = [1,2,3]
y = 2
z = 3

result = y+z
print (result)

# Sets breakpoints to identify errors!
# Hint: character 'q' quits the PDB
pdb.set_trace()

result2 = y+x
print (result2)

5
--Return--
> <ipython-input-90-356b7694a757>(10)<module>()->None
-> pdb.set_trace()
(Pdb) x
[1, 2, 3]
(Pdb) y+z
5
(Pdb) y+x
*** TypeError: unsupported operand type(s) for +: 'int' and 'list'
(Pdb) x**2
*** TypeError: unsupported operand type(s) for ** or pow(): 'list' and 'int'
(Pdb) x*2
[1, 2, 3, 1, 2, 3]
(Pdb) q


BdbQuit: 

## 7. Time your code: timeit

In [91]:
import timeit

In [95]:
"-".join(str(n) for n in range(100))

'0-1-2-3-4-5-6-7-8-9-10-11-12-13-14-15-16-17-18-19-20-21-22-23-24-25-26-27-28-29-30-31-32-33-34-35-36-37-38-39-40-41-42-43-44-45-46-47-48-49-50-51-52-53-54-55-56-57-58-59-60-61-62-63-64-65-66-67-68-69-70-71-72-73-74-75-76-77-78-79-80-81-82-83-84-85-86-87-88-89-90-91-92-93-94-95-96-97-98-99'

We pass the expression to be timed as a string, where we also specify the number of times we want to execute it. Syntax is the following:

In [99]:
timeit.timeit('"-".join([str(n) for n in range(100)])',number=10000)

0.3427601230032451

In [100]:
timeit.timeit('"-".join(map(str,range(100)))',number = 10000)

0.3053589759983879

In [101]:
# Magic function
%timeit "-".join([str(n) for n in range(100)])

32 µs ± 6.01 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


## 8. Regular expression: re

Regular expressions are text-matching patterns described with a formal syntax. They are used for finding repetitions, matching text, etc.

In [102]:
import re

In [103]:
patterns = ['term1', 'term2']

In [104]:
text = 'This is a string with term1, but not the other term'

In [105]:
re.search('hello','hello world!')

<_sre.SRE_Match object; span=(0, 5), match='hello'>

In [108]:
for pattern in patterns:
    
    print (f'Searching for {pattern} in {text}')
    
    # Check for match
    if re.search(pattern, text):
        print ('\nMatch was found\n')
    else:
        print('\nNo match was found')

Searching for term1 in This is a string with term1, but not the other term

Match was found

Searching for term2 in This is a string with term1, but not the other term

No match was found


In [109]:
match = re.search(patterns[0],text)

In [110]:
type(match)

_sre.SRE_Match

In [112]:
# Index where the match starts
match.start()

22

In [113]:
match.end()

27

#### Now let's see how to split a string on a regular expression pattern (like when we used the string method split)

In [116]:
split_term = '@'
phrase = 'What is your email, it is hello@gmail.com?'

In [117]:
re.split(split_term,phrase)

['What is your email, it is hello', 'gmail.com?']

In [118]:
# Returns a list of all the matches
re.findall('match','Here is one match, here is another match, and the the last match')

['match', 'match', 'match']

Now let's learn how to use **metacharacters** in order to create our own patterns.

In [119]:
def multi_re_find(patterns,phrase):
    """
    Run every regex in `patterns` against `phrase` and print the results.

    For each pattern this prints a header line containing the pattern's
    repr, then the list returned by re.findall, then a blank separator.
    Nothing is returned.
    """
    for regex in patterns:
        # The header is printed before searching, so it is already on screen
        # if the pattern turns out to be invalid.
        print('Searching the phrase using the re check: %r' %(regex))
        found = re.findall(regex,phrase)
        print(found)
        print('\n')

### Repetition Syntax

There are five ways to express repetition in a pattern:

   1. A pattern followed by the meta-character <code>*</code> is repeated zero or more times. 
   2. Replace the <code>*</code> with <code>+</code> and the pattern must appear at least once. 
   3. Using <code>?</code> means the pattern appears zero or one time. 
   4. For a specific number of occurrences, use <code>{m}</code> after the pattern, where **m** is replaced with the number of times the pattern should repeat. 
   5. Use <code>{m,n}</code> where **m** is the minimum number of repetitions and **n** is the maximum. Leaving out **n** <code>{m,}</code> means the value appears at least **m** times, with no maximum.
    
Now we will see an example of each of these using our multi_re_find function:

In [120]:
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = [ 'sd*',     # s followed by zero or more d's
                'sd+',          # s followed by one or more d's
                'sd?',          # s followed by zero or one d's
                'sd{3}',        # s followed by three d's
                'sd{2,3}',      # s followed by two to three d's
                ]

multi_re_find(test_patterns,test_phrase)

Searching the phrase using the re check: 'sd*'
['sd', 'sd', 's', 's', 'sddd', 'sddd', 'sddd', 'sd', 's', 's', 's', 's', 's', 's', 'sdddd']


Searching the phrase using the re check: 'sd+'
['sd', 'sd', 'sddd', 'sddd', 'sddd', 'sd', 'sdddd']


Searching the phrase using the re check: 'sd?'
['sd', 'sd', 's', 's', 'sd', 'sd', 'sd', 'sd', 's', 's', 's', 's', 's', 's', 'sd']


Searching the phrase using the re check: 'sd{3}'
['sddd', 'sddd', 'sddd', 'sddd']


Searching the phrase using the re check: 'sd{2,3}'
['sddd', 'sddd', 'sddd', 'sddd']




### Character Sets

Character sets are used when you wish to **match any one of a group of characters** at a point in the input. Brackets are used to construct character set inputs. For example: the input ```[ab]``` searches for occurrences of either **a** or **b**.

In [121]:
test_phrase = 'sdsd..sssddd...sdddsddd...dsds...dsssss...sdddd'

test_patterns = ['[sd]',    # either s or d
                's[sd]+']   # s followed by one or more s or d

multi_re_find(test_patterns,test_phrase)

Searching the phrase using the re check: '[sd]'
['s', 'd', 's', 'd', 's', 's', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 's', 'd', 'd', 'd', 'd', 's', 'd', 's', 'd', 's', 's', 's', 's', 's', 's', 'd', 'd', 'd', 'd']


Searching the phrase using the re check: 's[sd]+'
['sdsd', 'sssddd', 'sdddsddd', 'sds', 'sssss', 'sdddd']




### Exclusion

Use the character ```^``` at the start of a ```[]``` character set to exclude the characters listed inside it.

In [122]:
test_phrase = 'This is a string! But it has punctuation. How can we remove it?'

In [124]:
re.findall('[^!.? ]+',test_phrase)

['This',
 'is',
 'a',
 'string',
 'But',
 'it',
 'has',
 'punctuation',
 'How',
 'can',
 'we',
 'remove',
 'it']

### Character ranges

Define a char set to include all of the **contiguous chars** between a start and stop point.

In [125]:
test_phrase = 'This is an example sentence. Lets see if we can find some letters.'

test_patterns=['[a-z]+',      # sequences of lower case letters
               '[A-Z]+',      # sequences of upper case letters
               '[a-zA-Z]+',   # sequences of lower or upper case letters
               '[A-Z][a-z]+'] # one upper case letter followed by lower case letters
                
multi_re_find(test_patterns,test_phrase)

Searching the phrase using the re check: '[a-z]+'
['his', 'is', 'an', 'example', 'sentence', 'ets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z]+'
['T', 'L']


Searching the phrase using the re check: '[a-zA-Z]+'
['This', 'is', 'an', 'example', 'sentence', 'Lets', 'see', 'if', 'we', 'can', 'find', 'some', 'letters']


Searching the phrase using the re check: '[A-Z][a-z]+'
['This', 'Lets']




### Special escape codes:

You can use special escape codes to find specific types of patterns in your data, such as digits, non-digits, whitespace, and more. For example:

<table border="1" class="docutils">
<colgroup>
<col width="14%" />
<col width="86%" />
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Code</th>
<th class="head">Meaning</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\d</span></tt></td>
<td>a digit</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\D</span></tt></td>
<td>a non-digit</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\s</span></tt></td>
<td>whitespace (tab, space, newline, etc.)</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\S</span></tt></td>
<td>non-whitespace</td>
</tr>
<tr class="row-even"><td><tt class="docutils literal"><span class="pre">\w</span></tt></td>
<td>alphanumeric (letters, digits, and underscore)</td>
</tr>
<tr class="row-odd"><td><tt class="docutils literal"><span class="pre">\W</span></tt></td>
<td>non-alphanumeric (anything except letters, digits, and underscore)</td>
</tr>
</tbody>
</table>

In [126]:
test_phrase = 'This is a string with some numbers 1233 and a symbol #hashtag'

test_patterns=[ r'\d+', # sequence of digits
                r'\D+', # sequence of non-digits
                r'\s+', # sequence of whitespace
                r'\S+', # sequence of non-whitespace
                r'\w+', # alphanumeric characters
                r'\W+', # non-alphanumeric
                ]

multi_re_find(test_patterns,test_phrase)

Searching the phrase using the re check: '\\d+'
['1233']


Searching the phrase using the re check: '\\D+'
['This is a string with some numbers ', ' and a symbol #hashtag']


Searching the phrase using the re check: '\\s+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']


Searching the phrase using the re check: '\\S+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', '#hashtag']


Searching the phrase using the re check: '\\w+'
['This', 'is', 'a', 'string', 'with', 'some', 'numbers', '1233', 'and', 'a', 'symbol', 'hashtag']


Searching the phrase using the re check: '\\W+'
[' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' #']




## 9. StringIO

Implements an in-memory file-like object

In [1]:
# `import StringIO` is the Python 2 module name and raises
# ModuleNotFoundError on Python 3, where the class lives in `io`.
# Create an in-memory text buffer with StringIO("some text").
from io import StringIO