# Spacy SSR Model
### Goal:
This notebook creates Custom Named Entity Recognition model for Service Shop Reports.\
List of defined Tags:
- SNs
- Report Issue Date
- Repair Job Number
- Report Type (Final/Preliminary)

### Approach Description:
To better understand this notebook please refer to spacy documentation and check what are the patterns for implementing NERs


In [1]:
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy import displacy
import pandas as pd
import json

# from spacy.language import Language
# from spacy_langdetect import LanguageDetector

## 1. Turbine Serial Numbers
- to create tags for SNs we need a list of SNs that we want to implement. It can be taken from FleetMaster Extract.
- example file 'sn_eq_oem_gas_turbines.csv;

In [2]:
# sns_eq_oem = pd.read_csv('sns_eq_oem_full.csv', encoding = "utf-8", sep = ';', on_bad_lines='skip')      # Full 150K
sns_eq_oem = pd.read_csv('sn_eq_oem_gas_turbines.csv', encoding = "utf-8", sep = ';', on_bad_lines='skip')            # Filter GT
sns_eq_oem = sns_eq_oem.dropna(how='all').reset_index()
sns_eq_oem

Unnamed: 0,index,E_Equipment Serial Number,E_OEM Nameplate SN
0,0,G06655,G06655
1,1,299688,299688
2,2,299687,299687
3,3,299686,299686
4,4,299685,299685
...,...,...,...
28682,28682,807372,807372UK
28683,28683,807364,807364UK
28684,28684,807363,807363UK
28685,28685,820035,7216589


In [3]:
sns_eq_oem = sns_eq_oem[['E_Equipment Serial Number', 'E_OEM Nameplate SN']]

In [4]:
sns_oem_full = sns_eq_oem['E_OEM Nameplate SN'].dropna().str.lower().tolist()
len(sns_oem_full)

28625

In [7]:
sns_eq_full = sns_eq_oem['E_Equipment Serial Number'].dropna().str.lower().tolist()
len(sns_eq_full)

28687

In [8]:
def create_training_data(tag_list, tag_name):
    '''
    This function creates list of patterns of tags, that can be implemented into spacy NER model.
    
    Args:
        tag list - listt of tags (for example list of SNs)
        
    Returns:
        list of patterns for NER model. Pattern are in the spacy NER format {}
    
    '''
    data = tag_list
    patterns = []
    for item in data:
        pattern = {
            'label' : tag_name,
            'pattern': item
        }
        patterns.append(pattern)
    return patterns

In [9]:
# full extract from fleetmaster
patterns_sns_oem = create_training_data(sns_oem_full, "SN_OEM")
patterns_sns_eq = create_training_data(sns_eq_full, "SN_EQ")

## 2. Dates
- dates patterns are defines using spacy standard

In [10]:
patterns_date = [{'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_ALPHA': True},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4}]},
  
#to ma blokowac 125-28-10, wczesniej bylo bez length
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': {'==':[2, 4]}},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 2, 'TEXT': {'REGEX': '[012].'}}]},
                 
 
    #5-July-2020             
    {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': {'<=': 2}},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'TEXT': {'REGEX': '[a-z]+[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},
                 
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 1},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 2, 'TEXT': {'REGEX': '[012].'}}]},
 {'label': 'DATE',
  'pattern': [{'IS_ALPHA': True},
   {'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'TEXT': {'REGEX': '[a-z]+-[0-9]{4}'}}]},
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9]{2}[.-/][0-9]{2}[-./]2[0-9]{3}'}}]},
{'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9][.-/][0-9]{1,2}[-./]2[0-9]{3}'}}]},
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9]{2}[.-/][0-9]{2}[-./][012][0-9]'}}]},
                {'label': 'DATE',
#   'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
      'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'TEXT': {'REGEX': '[a-z]+[-][012][0-9]'}}]},
                {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},
     {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},                
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX':'[0-9][0-9][a-z]{2}'}},
      {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},
{'label': 'DATE',
'pattern': [{'IS_ALPHA': True},
            {'TEXT': {'REGEX':'[0-9][0-9]?[a-z]{2}'}},
            {'IS_PUNCT': True},
            {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]},
                 
                 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 1},
   {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '2[0-9]{3}'}}]}
           ]

In [11]:
patterns_date = [{'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_ALPHA': True},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4}]},
  
#to ma blokowac 125-28-10, wczesniej bylo bez length
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': {'==':[2, 4]}},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 2, 'TEXT': {'REGEX': '[012].'}}]},
                 
 
    #5-July-2020             
    {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': {'<=': 2}},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'TEXT': {'REGEX': '[a-z]+[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},
                 
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 1},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 2, 'TEXT': {'REGEX': '[012].'}}]},
 {'label': 'DATE',
  'pattern': [{'IS_ALPHA': True},
   {'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},
 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'TEXT': {'REGEX': '[a-z]+-[0-9]{4}'}}]},
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9]{2}[.-/][0-9]{2}[-./]2[0-9]{3}'}}]},
{'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9][.-/][0-9]{1,2}[-./]2[0-9]{3}'}}]},
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX': '[0-9]{2}[.-/][0-9]{2}[-./][012][0-9]'}}]},
                {'label': 'DATE',
#   'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
      'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True},
   {'TEXT': {'REGEX': '[a-z]+[-][012][0-9]'}}]},
                {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},
     {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True},
   {'IS_PUNCT': True, 'TEXT': {'REGEX': '[-/]'}},
   {'IS_DIGIT': True, 'LENGTH': 2},
   {'IS_PUNCT': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},                
 {'label': 'DATE',
  'pattern': [{'TEXT': {'REGEX':'[0-9][0-9][a-z]{2}'}},
      {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},
{'label': 'DATE',
'pattern': [{'IS_ALPHA': True},
            {'TEXT': {'REGEX':'[0-9][0-9]?[a-z]{2}'}},
            {'IS_PUNCT': True},
            {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]},
                 
                 {'label': 'DATE',
  'pattern': [{'IS_DIGIT': True, 'LENGTH': 1},
   {'IS_ALPHA': True},
   {'IS_DIGIT': True, 'LENGTH': 4, 'TEXT': {'REGEX': '20[0-9]{2}'}}]}
           ]

## 3. Job Numbers

In [12]:
job_patterns = [{'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                           {'IS_DIGIT': True}]}
               
               ]

In [13]:
job_patterns_regex = [{'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                      
                      
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'job'}},
                           {'IS_ALPHA': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]}                      
                      
               
               ]

In [14]:
#Italian job nr:
# Commesa

commesa_patterns_regex = [{'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_ALPHA': True},
                           {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]},
                      
                      
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
               {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_ALPHA': True},
                            {'IS_ALPHA': True, "LENGTH": {"<=": 3}},
                           {'IS_DIGIT': True}]},
                {'label': 'JOB #',
                'pattern': [{'TEXT': {'REGEX': 'comme?s?s?a?'}},
                           {'IS_ALPHA': True},
                            {'IS_PUNCT': True},
                            {'IS_PUNCT': True},
                            {'TEXT': {'REGEX': '[a-z]{0,4}[0-9/-]+'}}]}
                              
               ]

## 4. Report Type
- Report Type: Final
- Report Type: INSPECTION
- Final Repair Report
- PRELIMINARY REPORT
Condition Report
Final report

- REPORT FINALE
- REPORT PRELIMINARE

In [15]:
report_patterns = [{'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'report'}},
                           {'TEXT': {'REGEX': 'type'}},
                            {'IS_PUNCT': True, 'OP': '?'},
                           {'TEXT': {'REGEX': 'final'}}]},
                   {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'report'}},
                           {'TEXT': {'REGEX': 'type'}},
                            {'IS_PUNCT': True, 'OP': '?'},
                           {'TEXT': {'REGEX': 'inspection'}}]},
                {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'final'}},
                           {'TEXT': {'REGEX': 'repair'}},
                           {'TEXT': {'REGEX': 'report'}}]},
                {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'preliminary'}},
                           {'TEXT': {'REGEX': 'report'}}]},
                {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'condition'}},
                           {'TEXT': {'REGEX': 'report'}}]},
                {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'final'}},
                           {'TEXT': {'REGEX': 'report'}}]},
                                      
                   
                   
                   #IT
                                   {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'report'}},
                           {'TEXT': {'REGEX': 'finale'}}]},
                                   {'label': 'REPORT',
                'pattern': [{'TEXT': {'REGEX': 'report'}},
                           {'TEXT': {'REGEX': 'preliminare'}}]},
                  
                  
                  ]

## 2*. Transforming Dates into Report Dates
- in point 2 we defined date patterns, no we want to add extra words to narrow our result only for dates that matters.
- Issuing Date: 10-02-12
- Report Date: 20 Dec 2017
- Report Emission Date:
		Po wlosku data
Data: 18/05/2016!

words: issuing, report, emission

In [18]:
def generate_report_dates(date_patterns, extra_word, new_label):
    '''
    This function creates list of patterns of tags for report dates, that can be implemented into spacy NER model.
    
    Args:
        date patterns - list of date patterns created before
        extra_word - like 'issuing', 'emission'
        new_label - new NER tag/label
        
    Returns:
        updated list of patterns for NER model. Pattern are in the spacy NER format {}
    
    '''
    
    new_patterns = []
    
    # from 20-apr-2012   - bez ':'
    for pattern_date in date_patterns:
        new_pattern = {
            'label': new_label,
            'pattern': [{'TEXT': {'REGEX': extra_word}}] + [{'LOWER': 'date'}] + pattern_date['pattern'] 
        }
        new_patterns.append(new_pattern)
    
    # from: 20-apr-2012    - z ':'
    for pattern_date in date_patterns:
        new_pattern = {
            'label': new_label,
            'pattern': [{'TEXT': {'REGEX': extra_word}}] + [{'LOWER': 'date'}] + [{'IS_PUNCT': True}] + pattern_date['pattern'] 
        }
        new_patterns.append(new_pattern)
    
    return new_patterns 

In [19]:
words = ['issuing', 'report', 'emission']
report_dates = []
for word in words:
    report_dates += generate_report_dates(patterns_date, word, 'REP_DATE')

In [20]:
def generate_report_dates_2(date_patterns, extra_word, new_label):
    '''
    Similar to generate_report_dates, but in generate_report_dates we have <<extra word>> + 'date' + ...
    Here our extra word can be 'date' or 'data'
    '''
    
    new_patterns = []
    
    # from 20-apr-2012
    for pattern_date in date_patterns:
        new_pattern = {
            'label': new_label,
            'pattern': [{'LOWER': extra_word}] + pattern_date['pattern'] 
        }
        new_patterns.append(new_pattern)
    
    # from: 20-apr-2012
    for pattern_date in date_patterns:
        new_pattern = {
            'label': new_label,
            'pattern': [{'LOWER': extra_word}] + [{'IS_PUNCT': True}] + pattern_date['pattern'] 
        }
        new_patterns.append(new_pattern)
    
    return new_patterns 

In [21]:
words_2 = ['date', 'data']
report_dates_2 = []
for word in words_2:
    report_dates_2 += generate_report_dates_2(patterns_date, word, 'REP_DATE_2')

## 3*. Updating and merging all job patterns
- we want to get rid of sales job numbers

In [22]:
def add_forbidden_word(patterns, forbidden_word, new_label):
    '''This funtion updates job patters by adding forbidden words.
    
    Args:
        patterns - list of already created patterns
        forbidden_word - word, that will be add in the begging of pattern as forbidden
        new label
        
    Returns:
        new_patterns - updated job patterns
    '''
    new_patterns = []
    
    for pattern in patterns:
        new_pattern = {
            'label': new_label,
            'pattern': [{'LOWER': forbidden_word, 'OP': '!'}] + pattern['pattern']
        }
        new_patterns.append(new_pattern)
    
    return new_patterns

In [24]:
job_updated_patterns = add_forbidden_word(job_patterns_regex, 'sales', 'JOB #')
job_patterns = commesa_patterns_regex + job_updated_patterns

## Create and save model

In [25]:
def generate_rules_full(patterns_list):
    '''This function creates spacy model and save it into your working directory.
    
    Args:
        patterns_list - predefined list of spacy-format patterns
        
    Returns:
        Creates and saved spacy NER model into your working directory.
    '''
    nlp = English()
    ruler = nlp.add_pipe('entity_ruler')
    for patterns in patterns_list:
        ruler.add_patterns(patterns)
    nlp.to_disk('11_05_ssr_ner')  # Put the name of the model here

In [26]:
# 11_05_ssr_ner
generate_rules_full([report_dates, report_dates_2, patterns_date, patterns_sns_oem, patterns_sns_eq, job_patterns, report_patterns])