In [None]:
import nltk

from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


from nltk.tokenize import word_tokenize, wordpunct_tokenize

# Download the NLTK resources the cells below rely on.
nltk.download('punkt')      # Punkt tokenizer models used by word_tokenize on older NLTK releases
nltk.download('punkt_tab')  # tokenizer data that recent NLTK releases (3.9+) look up instead of 'punkt'
nltk.download('wordnet')    # dictionary backing WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists read through nltk.corpus.stopwords

When working with text you have to take into account families of words, e.g. dog, dogs, doggy. The most popular techniques for dealing with this issue are lemmatization and stemming. Both are special cases of normalization: they identify a canonical representative for a set of related word forms.

The goal of both stemming and lemmatization is to reduce inflectional forms and sometimes derivationally related forms of a word to a common base form.
However, the two words differ in their flavor. Stemming usually refers to a crude heuristic process that chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma.


In [None]:
# One shared instance of each normalizer, reused by the cells that follow:
# two rule-based stemmers and the dictionary-based WordNet lemmatizer.
porter, lancaster = PorterStemmer(), LancasterStemmer()
wordnet = WordNetLemmatizer()

In [None]:
# Side-by-side comparison: one row per word, one column per normalizer.
header_template = "{:15}{:15}{:15}{:15}"
row_template = "{:15}{:15}{:15}{}"
sample_words = ["was", "cats", "played", "mechanical", "friend", "friendship"]

print(header_template.format("Word","Porter","Lancaster","WordNet"))
for word in sample_words:
    # pos="v" asks the lemmatizer to treat every word as a verb.
    normalized = (
        porter.stem(word),
        lancaster.stem(word),
        wordnet.lemmatize(word, pos="v"),
    )
    print(row_template.format(word, *normalized))

In [None]:
# Sample paragraph used as input by the stemming and tokenization cells.
paragraph = """Poznań University of Technology, PUT (Polish name: Politechnika Poznańska) is a university in Poznań, 
Poland. Poznań University of Technology is known as one of the best technical universities in Poland. URAP 
ranked PUT as in top 6% of world universities and Webometrics ranked it at no. 842 in the world by Google 
citations for the year 2015. In 1995 it became the first Polish university to become a member of the Conference 
of European Schools for Advanced Engineering Education and Research (CESAER), an organization comprising the best 
technical universities in Europe. The university is also a member of the Socrates-Erasmus programme for exchange 
students from all over Europe, promoting advanced engineering and a European dimension. The university is home to 
many organizations and student circles, and the radio station Afera 98.6 MHz. The university has over 
21,000 students and over 1100 academic staffs."""

# Turn every line break of the literal into a space so the text is a single line.
text = " ".join(paragraph.split("\n"))

In [None]:
# The stemmer is handed the entire paragraph as if it were a single word.
whole_text_stem = porter.stem(text)
whole_text_stem

As a first step, we have to transform a string into a list of words. It's not as trivial as you might think. Thankfully there are libraries with such functions already implemented.

In [None]:
# Tokenize once and reuse the result for both the token list and its length.
word_tokens = word_tokenize(text)
word_tokens, len(word_tokens)

In [None]:
# Regex-based tokenizer that separates alphanumeric runs from punctuation runs.
# Tokenize once and reuse the result for both the token list and its length.
wordpunct_tokens = wordpunct_tokenize(text)
wordpunct_tokens, len(wordpunct_tokens)

In [None]:
# Print every token next to its Porter stem, both padded to 20 characters.
token_stem_row = "{:20} - {:20}"
for x in word_tokenize(text):
    stem = porter.stem(x)
    print(token_stem_row.format(x, stem))

In [None]:
# Show the English stop-word list that ships with NLTK.
english_stop_words = stopwords.words('english')
print(english_stop_words)

## Task
Write a function that takes a string as input and returns a list of stems for reasonable words. Filter out stop words and non-words.

In [None]:
def custom_stemmer(string):
    """Return the Porter stems of the meaningful words in ``string``.

    The text is split with ``word_tokenize``. A token is kept only when it
    consists solely of letters (this drops punctuation, numbers and mixed
    tokens such as "98.6" or "Socrates-Erasmus") and its lowercase form is
    not an English stop word. The surviving tokens are stemmed with the
    notebook-level ``porter`` stemmer.

    Parameters
    ----------
    string : str
        Raw text to process.

    Returns
    -------
    list of str
        Stems in order of appearance; an empty list for empty input.
    """
    if not string:
        return []
    # Build the set once per call so each membership test is O(1).
    stop_words = set(stopwords.words('english'))
    return [
        porter.stem(token)
        for token in word_tokenize(string)
        if token.isalpha() and token.lower() not in stop_words
    ]

## Task
Convert dates from yyyy-mm-dd format to dd.mm.yyyy format using regular expressions.

In [None]:
import re

# Inputs for the date-conversion task: a mix of strings that do and do not
# contain a date written as yyyy-mm-dd.
s = [
    "1985-12-4",
    "asd 12-132-133",
    "Afs!@#-2055-12-12",
    "02-03-2020",
    "Is it a date 2012-11-01?",
]

The following output is expected:

4.12.1985<br>
asd 12-132-133<br>
Afs!@#-12.12.2055<br>
02-03-2020<br>
Is it a date 01.11.2012?<br>

Use the `re.sub` function.

In [None]:
# A yyyy-mm-dd date: a 4-digit year followed by a month and a day of 1-2 digits.
# The lookarounds keep the pattern from matching inside a longer run of digits.
date_pattern = r'(?<!\d)(\d{4})-(\d{1,2})-(\d{1,2})(?!\d)'
# Emit the captured groups in reverse order, dot-separated: dd.mm.yyyy.
date_replacement = r'\3.\2.\1'

for x in s:
    # Strings without a yyyy-mm-dd date are printed unchanged.
    print(re.sub(date_pattern, date_replacement, x))

# OCR

In [None]:
import pytesseract
from PIL import Image
import numpy as np
import cv2
import requests
from io import BytesIO

In [None]:
url = "https://i.ibb.co/8d564bB/example-01.png"

# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))

img

In [None]:
# Run OCR on the current image with pytesseract's default settings.
example_01_text = pytesseract.image_to_string(img)
print(example_01_text)

In [None]:
url = "https://i.ibb.co/n63mZfb/example-02.png"
# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of handing an error page to PIL.
response = requests.get(url, timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))
img

In [None]:
# Run OCR on the current image with pytesseract's default settings.
example_02_text = pytesseract.image_to_string(img)
print(example_02_text)

## Task
Read the text from example_02 perfectly.

In [None]:
# Bound the wait so a stalled server cannot hang the notebook, and fail loudly
# on an HTTP error instead of handing an error page to PIL.
response = requests.get("https://i.ibb.co/1ffsFkx/vin.png", timeout=30)
response.raise_for_status()
img = Image.open(BytesIO(response.content))
img

In [None]:
# Run OCR on the current image with pytesseract's default settings.
vin_text = pytesseract.image_to_string(img)
print(vin_text)

## Task 5
Read the text from vin.png nearly perfectly: at most 1 error is allowed (a missing, additional, or wrong character); confusing 0 with O does not count as an error.