### Installation

```sh
pip install patterns-finder
```

### Find a pattern in the text

In [1]:
import sys

# Put the parent directory on the module search path so the notebook can import a
# local checkout of the package (presumably unnecessary once the package has been
# pip-installed -- confirm). The path is relative to the kernel's working directory.
# Guarded so that re-running this cell does not pile up duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [2]:
from patterns_finder.patterns.web import emoji, url, email

In [3]:
# Locate emoji in a sentence. In the output shown, each match is reported as a
# (start, end, label, matched_text) tuple, where start/end are character offsets
# into the input string and end is exclusive (the fox emoji is at 18..19).
emoji.find("the quick #A52A2A 🦊 jumped 3 times over the lazy 🐶 ")

[(18, 19, 'EMOJI', '🦊'), (49, 50, 'EMOJI', '🐶')]

In [4]:
# Locate URLs in a sentence. The output shown reports the single match as
# (start, end, label, matched_text) with an exclusive end offset (25..45).
url.find("The lazy 🐶 has a website https://lazy.dog.com ")

[(25, 45, 'URL', 'https://lazy.dog.com')]

In [5]:
# Locate e-mail addresses in a sentence. The output shown reports the single match
# as (start, end, label, matched_text) with an exclusive end offset (0..19).
email.find("quick.brown@fox.com is the email of 🦊 ")

[(0, 19, 'EMAIL', 'quick.brown@fox.com')]

### Find multiple patterns in the text

In [6]:
from patterns_finder import finder
from patterns_finder.patterns.web import emoji, url, color_hex
from patterns_finder.patterns.number import integer

In [7]:
# Built-in patterns can be mixed with user-defined ones. The user-defined pattern
# used here is a (regex, label) tuple: ("\\b[a-zA-Z]+\\b", "WORD").
# A bare regex string such as "quick|lazy" is presumably accepted as well, but it is
# NOT part of this list -- TODO confirm against the library before relying on it.
# NOTE(review): in the output shown, results are grouped in the order the patterns
# are listed, and INTEGER also fires inside the hex colour ('52' and '2' within
# '#A52A2A'), so overlapping matches are apparently not filtered out.
patterns = [emoji, color_hex, integer, ("\\b[a-zA-Z]+\\b", "WORD")]
text = "the quick #A52A2A 🦊 jumped 3 times over the lazy 🐶 "
finder.patterns_in_text(text, patterns)

[(18, 19, 'EMOJI', '🦊'),
 (49, 50, 'EMOJI', '🐶'),
 (10, 17, 'COLOR_HEX', '#A52A2A'),
 (12, 14, 'INTEGER', '52'),
 (15, 16, 'INTEGER', '2'),
 (27, 28, 'INTEGER', '3'),
 (0, 3, 'WORD', 'the'),
 (4, 9, 'WORD', 'quick'),
 (20, 26, 'WORD', 'jumped'),
 (29, 34, 'WORD', 'times'),
 (35, 39, 'WORD', 'over'),
 (40, 43, 'WORD', 'the'),
 (44, 48, 'WORD', 'lazy')]

### Sort the results

In [8]:
# Reuses `text` defined in an earlier cell and rebinds `patterns` (no INTEGER here).
# With sort_by=finder.START the output shown is ordered by each match's start
# offset instead of being grouped per pattern.
patterns = [emoji, color_hex, ('\\b[a-zA-Z]+\\b', 'WORD') ]
finder.patterns_in_text(text, patterns, sort_by=finder.START)

[(0, 3, 'WORD', 'the'),
 (4, 9, 'WORD', 'quick'),
 (10, 17, 'COLOR_HEX', '#A52A2A'),
 (18, 19, 'EMOJI', '🦊'),
 (20, 26, 'WORD', 'jumped'),
 (29, 34, 'WORD', 'times'),
 (35, 39, 'WORD', 'over'),
 (40, 43, 'WORD', 'the'),
 (44, 48, 'WORD', 'lazy'),
 (49, 50, 'EMOJI', '🐶')]

### Summarize the results

In [9]:
# Reuses `text` and `patterns` from earlier cells. With
# summary_type=finder.LABEL_TEXT the output shown is a dict mapping each label to
# the list of matched strings; repeated matches (e.g. 'the') are kept.
finder.patterns_in_text(text, patterns, summary_type=finder.LABEL_TEXT)

{'EMOJI': ['🦊', '🐶'],
 'COLOR_HEX': ['#A52A2A'],
 'WORD': ['the', 'quick', 'jumped', 'times', 'over', 'the', 'lazy']}

### pandas DataFrame support

In [10]:
import pandas as pd

# Lift pandas' display limits (rows, columns, cell width) so the small example
# frames below are rendered in full.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# One sample sentence per row, held in a single "text" column.
df = pd.DataFrame(
    {
        "text": [
            "the quick #A52A2A 🦊 jumped 3 times over the lazy 🐶",
            "quick.brown@fox.com is the email of 🦊",
            "The lazy 🐶 has a website https://lazy.dog.com",
        ]
    }
)
df

Unnamed: 0,text
0,the quick #A52A2A 🦊 jumped 3 times over the lazy 🐶
1,quick.brown@fox.com is the email of 🦊
2,The lazy 🐶 has a website https://lazy.dog.com


In [11]:
# `email` was imported in cell In [2]; `finder`, `emoji`, `url` and `df` come from
# earlier cells as well, so this cell only works after those have been run.
# Scans the "text" column and, per the output shown, reports each row's matches in
# an "extraction" column as a label -> matched-strings summary.
# NOTE(review): not visible here whether `df` is modified in place or a new frame
# is returned -- confirm before reusing `df` afterwards.
patterns = [email, emoji, url]
finder.patterns_in_df(df, "text", "extraction", patterns, summary_type=finder.LABEL_TEXT)

Unnamed: 0,text,extraction
0,the quick #A52A2A 🦊 jumped 3 times over the lazy 🐶,"{'EMOJI': ['🦊', '🐶']}"
1,quick.brown@fox.com is the email of 🦊,"{'EMAIL': ['quick.brown@fox.com'], 'EMOJI': ['🦊']}"
2,The lazy 🐶 has a website https://lazy.dog.com,"{'EMOJI': ['🐶'], 'URL': ['https://lazy.dog.com']}"


In [14]:
# Multi-line stress sample mixing many kinds of content: URLs, a mailto address,
# phone numbers, currency amounts, HTML anchors, postal-code-like strings,
# percentages, roman numerals, scientific notation, copyright notices, card-like
# numbers, times, dates and file names, plus Arabic, Bengali, Cyrillic, Korean and
# Han-script text. The embedded "match" / "no match" remarks are part of the data
# (they look like notes carried over from regex test fixtures -- unverified).
# NOTE: this rebinds `text`, replacing the short sentence used by earlier cells.
text = """
https://www.example.com/questions/3456/my-document ftp://www.example.com/questions/3456/my-document.txt www.example.com
http://example.com/questions/3456/my-document
😃 mailto:test@me.com
/example/questions/3456/my-document.tx  example.com example-com/questions/3456/my-document
عبارة
عنوان
الشبكة
non-matches:
“Uniform”  “Uni\"form” 
123 USD EUR +1 (999) 999-9999
Abcd with 200 Dollars $ 50M text me $50.2m or  $10,000,00 not $10000  yet ¥ 1M but not $ or ¥ or 10
Aenean lacinia bibendum <a href="/life">life</a> sed consectetur. <a href="/work">Work</a> quis 
risus eget urna mol ornare <a href="/about">about</a> leo. <a> me </a>
A1A 1A1 match
A1A-1A1 match
A1A1A1 match
D1D-1D1	no match
AAA-111	no match
A1-A1-A1	no match 0xG10ABG 0x10AB 
Matches     +447222555555   | +44 7222 555 555 | (0722) 5555555 #2222
Non-Matches (+447222)555555 | +44(7222)555555  | (0722) 5555555 #22
"+5.6%","1234.56 %", "±0.05 %" "-42.23%" "+5.6", "0.5", "-100", "%23.6"
100,234,23.10 +10.10 -10.10 ±10
III match MCIV match MMCDXLVI match
IIII	no match Привет мир Россия Москва
IM	no match Matches	-1.23E99 | 1E0 | -9.999e-999
Non-Matches	+10E0 | 2.3e5.4 | 9.4608 x 1015
©2015 © 2015 (c) 2014 match 
(C) 2014 , copyright 2014 match
4000 0000 0000 0000 match
5200-0000-0000-0000 match
6500000000000000 match
3400-0000-0000-0000	no match
340 000000 000000
"matches": ["1:01 AM", "1-01 aM", "23:52:01", "03.24.36 AM"],
"non-matches": ["19:31 AM", "9:9 PM", "25:60:61"]
this is 23/11/20 23/11/2020  a
video lowres.mp4
video..lowres..mp4 벵골인(Bengali people)은 벵골 지역을 
video?lowres.png
video_lowres.mp4 總人口數約300,000,000人。於人口
video_lowres.m4v বাঙ্গালী/বাঙালি 
startbanner.jpg
start banner.jpg
"""

In [15]:
from patterns_finder.patterns.language import arabic, bangali, japanese, russian

# Pull out runs of text per script/language from the multi-line `text` defined in
# the preceding cell and summarise them by label (`finder` comes from an earlier cell).
# `bangali` is the identifier exactly as imported from the library above.
# NOTE(review): in the output shown, the JAPANESE pattern captures the Han
# characters of '總人口數約300,000,000人。於人口', which looks like Chinese text --
# confirm the pattern's intended scope. The Korean text in the sample is not
# reported under any of these four labels.
patterns = [arabic, bangali, japanese, russian]
finder.patterns_in_text(text,patterns,summary_type=finder.LABEL_TEXT)

{'ARABIC': ['عبارة', 'عنوان', 'الشبكة'],
 'BANGALI': ['বাঙ্গালী', 'বাঙালি'],
 'JAPANESE': ['總人口數約', '人。於人口'],
 'RUSSIAN': ['Привет', 'мир', 'Россия', 'Москва']}