In [68]:
import sys
sys.path.append('C:\Anaconda3\Lib\site-packages') # Link to my local libraries
# import warnings
# warnings.filterwarnings('ignore')
import numpy as np
import os
import glob
import json
import matplotlib.pyplot as plt
plt.style.use('seaborn-whitegrid')
%matplotlib inline
sys.version

'3.5.1 |Continuum Analytics, Inc.| (default, Jan 29 2016, 15:01:46) [MSC v.1900 64 bit (AMD64)]'

Author: Alex Galea   
Date: September, 2016

Fuzzywuzzy can be installed with

`pip install fuzzywuzzy`

To check whether the package is already installed (or has installed properly), start the Python interpreter and try to import it.

## Fuzzywuzzy Tutorial

Description (from the repository readme):

Fuzzy __string matching__ like a boss. It uses __Levenshtein Distance__ to calculate the differences between sequences in a simple-to-use package.

__What is the Levenshtein Distance metric for comparing strings?__

Here is the algorithm:

![](pictures/levenshtein-distance-1.png)

For example we start with:

![](pictures/levenshtein-distance-2.png)

and end with

![](pictures/levenshtein-distance-3.png)

The Levenshtein distance in this example is 2.

### The data

I set up a folder named `source_text` and put a `.json` file in there containing a bunch of hockey tweets. It is available for download on GitHub along with this notebook.

In [2]:
os.listdir('source_text')

['#nhl_2016-05-07.json']

In this tutorial we will see:
 - simple examples of fuzzy matching scores
 - given a search phrase, find closely matching items from a list of strings

In [3]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



![](modules.png)

### The fuzz module

Ratio vs. partial ratio

In [4]:
fuzz.ratio('gumbo', 'gambol')

73

In [5]:
fuzz.partial_ratio('gumbo', 'gambol')

80

In [6]:
fuzz.ratio('gumbo', 'gambol extra')

47

In [7]:
fuzz.partial_ratio('gumbo', 'gambol extra')

80

In [8]:
fuzz.partial_ratio('gumbo', 'gumbo extra')

100

Compare strings regardless of word order

In [9]:
# The tokens (words) of each string are sorted before comparing,
# so the order of the words does not affect the score
fuzz.token_sort_ratio('Calgary AB',
                      'ab calgary')

100

In [10]:
fuzz.token_sort_ratio('Calgary AB',
                      'ab calgary canada')

74

In [11]:
fuzz.partial_token_sort_ratio('Calgary AB',
                              'ab calgary canada')

100

In [12]:
fuzz.token_sort_ratio('Calgary AB CA',
                      'ab calgary canada')

67

In [13]:
fuzz.token_sort_ratio('Calgary AB CAN',
                      'ab calgary canada')

90

In [14]:
fuzz.partial_token_sort_ratio('Calgary AB CAN',
                              'ab calgary canada')

100

In [15]:
# Score one pair of strings with several fuzz scorers, side by side.
str_1, str_2 = 'Calgary AB CA is cold', 'cold: ab calgary canada'

names = [
    'QRatio',
    'WRatio',
    'ratio',
    'token_sort_ratio',
    'partial_token_sort_ratio',
]

# Look up each scorer on the fuzz module by name and apply it to the pair.
scores = [getattr(fuzz, scorer_name)(str_1, str_2) for scorer_name in names]

list(zip(names, scores))

[('QRatio', 50),
 ('WRatio', 80),
 ('ratio', 41),
 ('token_sort_ratio', 70),
 ('partial_token_sort_ratio', 67)]

### Processing lists of strings

Load our NHL data

In [85]:
def reduce_file(imod):
    """Write every ``imod``-th line of the raw tweet file to a smaller file.

    Reads 'source_text/#nhl_2016-05-07.json' and copies the lines whose
    0-based index is a multiple of ``imod`` (0, imod, 2*imod, ...) to
    'source_text/#nhl-tweets.json', replacing any existing content there.

    Parameters
    ----------
    imod : int
        Sampling step; 1 copies every line.

    Raises
    ------
    FileNotFoundError
        If the input file does not exist. The output file is left untouched
        in that case.
    ZeroDivisionError
        If ``imod`` is 0 and the input has at least one line.
    """
    # Open the input first so that a missing input fails before the output
    # file is created or truncated. The `with` block closes both handles even
    # if an exception interrupts the copy.
    with open('source_text/#nhl_2016-05-07.json', 'r') as f, \
            open('source_text/#nhl-tweets.json', 'w') as f_new:
        # Iterate lazily instead of loading every line into memory at once.
        for i, line in enumerate(f):
            if i % imod == 0:
                f_new.write(line)

def load_tweets(file):
    """Load a file containing one JSON document per line.

    Parameters
    ----------
    file : str
        Path of the text file to read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.
        A list (rather than a one-shot generator) can be iterated any number
        of times, so cells that consume the result stay re-runnable.

    Raises
    ------
    ValueError
        If a non-blank line is not valid JSON (``json.JSONDecodeError`` is a
        subclass of ``ValueError``). The error is raised here, at load time.
    """
    with open(file, 'r') as f:
        # Decode eagerly while the file is open; whitespace-only lines carry
        # no document and are skipped instead of crashing json.loads.
        tweets = [json.loads(line) for line in f if line.strip()]
    return tweets

tweets = load_tweets('source_text/#nhl_2016-05-07.json')

In [86]:
text = [t['text'] for t in tweets]
len(text)

8716

In [87]:
text[-3:]

['#BostonBruins #NHL Boston Bruins 1.5oz. Shot Glass:  $5.99End Date: Sunday Jun-5-2016 16:32:0... https://t.co/pGnAPQTYH0 #Boston #Bruins',
 'Braden Holtby / NHLPA names Kane, Benn and Holtby as finalists for Ted Lindsay Award https://t.co/xGAMqnRRMM #NHL',
 "Matt Carle / Carle is in the lineup for Friday's Game 4 against the Islanders, https://t.co/zTtLfeCgJU #NHL"]

Finding matches to a given phrase in a list of items

In [92]:
# List comprehension method
search = ['san jose sharks', 'SAN JOSE SHARKS', 'san jose', 'San Jose', 'sharks']
%time len([t for t in text if sum([s in t for s in search])])

Wall time: 15.6 ms


127

In [57]:
process.extract?

In [94]:
search_phrase = 'san jose sharks'
%time process.extract(search_phrase, text)

Wall time: 16.3 s


[('#NHL #hockey NHL San Jose Sharks 68114091 Plastic Sign, 11 x 17", Black https://t.co/iBPszOpupK https://t.co/YNCYibJwJP',
  90),
 ('San Jose Sharks ‘No Goal’ A Major NHL Problem - https://t.co/kCS7D6mtqD #NHL https://t.co/DLUTmpjZXF',
  90),
 ("San Jose Sharks 'No Goal' A Major NHL Problem - The Hockey Writers https://t.co/63DN2UUfIM #NHL",
  90),
 ("Who will win tonight's game? Nashville Predators or San Jose Sharks? #Preds #SJSharks #SJSvsNSH #NHL",
  90),
 ('Picks Combinado: #MLB San Francisco, #NBA Heat,  #NHL San Jose Sharks, #SerieA Inter.  Todos a ganar',
  90)]

In [104]:
%time SJS_fuzzy = process.extract(search_phrase, text, limit=len(text))

Wall time: 18.3 s


In [96]:
len(SJS_fuzzy), len(text)

(8716, 8716)

How are the scores for each item distributed?

In [None]:
# Distribution of the fuzzy scores (second element of each result tuple).
fuzzy_scores = [match[1] for match in SJS_fuzzy]

# Explicit figure/axes objects instead of the implicit pyplot state, with
# labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(fuzzy_scores, bins=30, alpha=0.5)
ax.set_xlabel('Fuzzy score')
ax.set_ylabel('Number of tweets')
ax.set_title('Distribution of fuzzy match scores');

Let's filter out the low scores.

In [98]:
SJS_fuzzy = [t for t in SJS_fuzzy if t[1] > 50]
len(SJS_fuzzy)

267

In [99]:
SJS_fuzzy[-5:]

[('#NHL #Devils 5 Best Exercises For Successful Weight Loss https://t.co/YNkGahROPv #Share',
  51),
 ('I wonder if the keeper of the #NHL Stanley Cup talks to it in their private time?',
  51),
 ('#NHL #Hockey Dallas #Stars Adjustable Snap Back Hat by Reebok https://t.co/MmUMhsO42H #Dallas https://t.co/wexa5drjFw',
  51),
 ('Spezza, Stars settled in for long series https://t.co/rWrgKqW0S6 #nhl', 51),
 ('#NHL #Devils Losing Weight Without Dieting - The Chongqing Way… https://t.co/0pR5KgKyY3 #Share https://t.co/ivrEkCixNS',
  51)]

More filtering is clearly needed

In [None]:
SJS_fuzzy = [t for t in SJS_fuzzy if t[1] > 80]
len(SJS_fuzzy)

In [106]:
SJS_fuzzy[-5:]

[('May 06, 2016 at 09:01PM Support the San Jose Earthquakes #NHL #Soccer team',
  86),
 ('#SanJose San Jose #Sharks Authentic Hooded Sweatshirt https://t.co/qSBFCOiBEA #NHL #Hockey https://t.co/LIjyAkfCcL',
  86),
 ('#SanJose SAN JOSE #Sharks FAN PACK https://t.co/AbmdHyIzPB #NHL #Hockey https://t.co/3Vek3okLVW',
  86),
 ('The #rinnewall is going to San Jose to watch game 5. #letsgopreds beat the #sharks. #Predators #hockey #nhl #playoffs',
  86),
 ('May 06, 2016 at 08:02PM Support the San Jose Earthquakes #NHL #Soccer team',
  86)]

In [113]:
process.dedupe?

In [110]:
# does nothing different ...
# process.extractBests('san jose sharks', text)

[('#NHL #hockey NHL San Jose Sharks 68114091 Plastic Sign, 11 x 17", Black https://t.co/iBPszOpupK https://t.co/YNCYibJwJP',
  90),
 ('San Jose Sharks ‘No Goal’ A Major NHL Problem - https://t.co/kCS7D6mtqD #NHL https://t.co/DLUTmpjZXF',
  90),
 ("San Jose Sharks 'No Goal' A Major NHL Problem - The Hockey Writers https://t.co/63DN2UUfIM #NHL",
  90),
 ("Who will win tonight's game? Nashville Predators or San Jose Sharks? #Preds #SJSharks #SJSvsNSH #NHL",
  90),
 ('Picks Combinado: #MLB San Francisco, #NBA Heat,  #NHL San Jose Sharks, #SerieA Inter.  Todos a ganar',
  90)]

In [111]:
process.extract('san jose sharks', text)

[('#NHL #hockey NHL San Jose Sharks 68114091 Plastic Sign, 11 x 17", Black https://t.co/iBPszOpupK https://t.co/YNCYibJwJP',
  90),
 ('San Jose Sharks ‘No Goal’ A Major NHL Problem - https://t.co/kCS7D6mtqD #NHL https://t.co/DLUTmpjZXF',
  90),
 ("San Jose Sharks 'No Goal' A Major NHL Problem - The Hockey Writers https://t.co/63DN2UUfIM #NHL",
  90),
 ("Who will win tonight's game? Nashville Predators or San Jose Sharks? #Preds #SJSharks #SJSvsNSH #NHL",
  90),
 ('Picks Combinado: #MLB San Francisco, #NBA Heat,  #NHL San Jose Sharks, #SerieA Inter.  Todos a ganar',
  90)]