# Natural Language Processing - Text Preprocessing

## Libraries and settings

In [17]:
# Required
#!pip install nltk==3.7
#!pip install pprint

# Libraries
import os
import re
import string
import numpy as np
import pandas as pd
from pprint import pprint

import nltk
#nltk.download('stopwords')
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.chunk import tree2conlltags
from nltk.chunk import conlltags2tree
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Current working directory
print('Current working directory:', os.getcwd())

# Version abfragen von nltk
import nltk
nltk.__version__

Current working directory: C:\Workspacezhaw\data analytics\Woche 11


'3.7'

### Task 1

In [19]:
# Define your own documents Aufgabe 1b
d1 = 'The base is overtaken and some warriors died in the progress.'
d2 = 'A brave warrior was send on the main quest to win against them.'
d3 = 'the base consists of a village located by the river.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The base is overtaken and some warriors died in the progress. A brave warrior was send on the main quest to win against them. the base consists of a village located by the river.'

In [20]:
# Text to lowercase function (Grund: man möchte möglichst alles einheitlich haben im Text)
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the base is overtaken and some warriors died in the progress. a brave warrior was send on the main quest to win against them. the base consists of a village located by the river.'

In [21]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the base is overtaken and some warriors died in the progress a brave warrior was send on the main quest to win against them the base consists of a village located by the river'

In [22]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'very', 'shan', 'an', 'what', 'myself', 'itself', 'my', 'herself', 'had', "couldn't", 'yourselves', "you'll", "weren't", "you'd", 'here', 'was', 'again', 'to', 'mustn', 'the', "should've", 'then', 'same', 'down', 'before', 'they', 'than', 'we', 'have', 'those', 'am', 'out', 'ain', 'with', 'above', 'hasn', 'ma', 'a', 'his', "doesn't", "mustn't", "isn't", "won't", 'yourself', 'why', 'theirs', 'now', 'other', 're', 'while', 'ourselves', 'on', 'just', 'o', 'our', 'at', 'up', 'as', 'most', 'will', "shan't", 'your', 'them', "hasn't", 'nor', 'so', 'such', 'her', 'aren', 't', 'should', 'only', "wouldn't", 'after', 'and', 'if', "she's", 'doing', 'themselves', 'their', 'been', 'weren', 'because', 'all', 'but', 'y', 'isn', 'both', 'be', 'himself', 'this', 'below', "hadn't", 'do', "you're", 'you', 'through', 'that', 'during', 'hadn', "needn't", 'shouldn', 'does', 'once', 'who', "wasn't", "it's", 'whom', 'from', 'no', 'these', "aren't", 'it', 'its', 'not', 'haven', 'your

In [23]:
# import nltk
# nltk.download('punkt')
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['base', 'overtaken', 'warriors', 'died', 'progress', 'brave', 'warrior', 'send', 'main', 'quest', 'win', 'base', 'consists', 'village', 'located', 'river']

In [24]:
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

Before lemmatization:
['base', 'overtaken', 'warriors', 'died', 'progress', 'brave', 'warrior', 'send', 'main', 'quest', 'win', 'base', 'consists', 'village', 'located', 'river'] 

After lemmatization:
['base', 'overtake', 'warriors', 'die', 'progress', 'brave', 'warrior', 'send', 'main', 'quest', 'win', 'base', 'consist', 'village', 'locate', 'river']

In [26]:
# We will use the lemmatized words above to re-define our corpus 
# durch die Lemmatisierung wurde er Satz auf folgende Hauptwörter eingeschränkt => die Anführungsstriche sollten nur am Anfang/Ende sein (damit es erkennt, dass es ein Satz ist)
corpus = ['base overtake warriors die progress', 
          'brave warrior send main quest win', 
          'base consist village locate river']

In [27]:
# Vectorizer with ngram_range=(1,1) - Unigramm = N-Gramm
vectorizer = CountVectorizer(min_df=0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

#bycicle kommt zweimal vor | drive kommt in allen 3 Dkomenten vor etc.

Document-term matrix
   base  brave  consist  die  locate  main  overtake  progress  quest  river  \
0     1      0        0    1       0     0         1         1      0      0   
1     0      1        0    0       0     1         0         0      1      0   
2     1      0        1    0       1     0         0         0      0      1   

   send  village  warrior  warriors  win  
0     0        0        0         1    0  
1     1        0        1         0    1  
2     0        1        0         0    0  


In [28]:
# Vectorizer with with ngram_range=(2,2) mit 2- hat man zwei folgewörter (B-Gramm)
vectorizer = CountVectorizer(min_df=0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   base consist  base overtake  brave warrior  consist village  die progress  \
0             0              1              0                0             1   
1             0              0              1                0             0   
2             1              0              0                1             0   

   locate river  main quest  overtake warriors  quest win  send main  \
0             0           0                  1          0          0   
1             0           1                  0          1          1   
2             1           0                  0          0          0   

   village locate  warrior send  warriors die  
0               0             0             1  
1               0             1             0  
2               1             0             0  


In [29]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))
 
# TF=> wie oft wöter vorkommen (wörter die häufig vokommen, werden heruntergewichtet, die die weniger vorkommen werden höher gewichtet)
# am höchsten gewichtet wird das wort "bycicle"

Number of words in the corpus: 15 

The words in the corpus: 
 {'village', 'locate', 'river', 'main', 'progress', 'brave', 'warrior', 'die', 'consist', 'win', 'overtake', 'base', 'quest', 'send', 'warriors'}

Term Frequency (TF):
   village  locate  river    main  progress   brave  warrior  die  consist  \
0      0.0     0.0    0.0  0.0000       0.2  0.0000   0.0000  0.2      0.0   
1      0.0     0.0    0.0  0.1667       0.0  0.1667   0.1667  0.0      0.0   
2      0.2     0.2    0.2  0.0000       0.0  0.0000   0.0000  0.0      0.2   

      win  overtake  base   quest    send  warriors  
0  0.0000       0.2   0.2  0.0000  0.0000       0.2  
1  0.1667       0.0   0.0  0.1667  0.1667       0.0  
2  0.0000       0.0   0.2  0.0000  0.0000       0.0  


In [30]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
        village:     0.4771
         locate:     0.4771
          river:     0.4771
           main:     0.4771
       progress:     0.4771
          brave:     0.4771
        warrior:     0.4771
            die:     0.4771
        consist:     0.4771
            win:     0.4771
       overtake:     0.4771
           base:     0.1761
          quest:     0.4771
           send:     0.4771
       warriors:     0.4771


In [31]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   village  locate   river    main  progress   brave  warrior     die  \
0   0.0000  0.0000  0.0000  0.0000    0.0954  0.0000   0.0000  0.0954   
1   0.0000  0.0000  0.0000  0.0795    0.0000  0.0795   0.0795  0.0000   
2   0.0954  0.0954  0.0954  0.0000    0.0000  0.0000   0.0000  0.0000   

   consist     win  overtake    base   quest    send  warriors  
0   0.0000  0.0000    0.0954  0.0352  0.0000  0.0000    0.0954  
1   0.0000  0.0795    0.0000  0.0000  0.0795  0.0795    0.0000  
2   0.0954  0.0000    0.0000  0.0352  0.0000  0.0000    0.0000  


In [32]:
# import nltk
# nltk.download('averaged_perceptron_tagger')

text = '''The storm had felt like a rumor all day, but now, the sky was delivering. 
For a second, like a knife catching a glint of light and refracting it in multitude, everything gleamed white. 
The lightning split the whole sky in half, and in that moment, it was brighter than daylight. 
The tops of the gravestones seemed to pulse like strobe lights in a night club before blackness settled them down again.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)
#idee man sollte nun die Begriffe automatisiert auslesen können => mach tman mit tagging
# RB =>  Adverb. Examples: very, silently,
# RP => Particle. Example: give up
# PRP => Personal Pronoun. Examples: I, he, she
# VBD => Verb, Past Tense. Example: took
# NN => Noun, Singular.

[('The', 'DT', 'B-NP'),
 ('storm', 'NN', 'I-NP'),
 ('had', 'VBD', 'O'),
 ('felt', 'VBN', 'O'),
 ('like', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('rumor', 'NN', 'I-NP'),
 ('all', 'DT', 'B-NP'),
 ('day', 'NN', 'I-NP'),
 (',', ',', 'O'),
 ('but', 'CC', 'O'),
 ('now', 'RB', 'O'),
 (',', ',', 'O'),
 ('the', 'DT', 'B-NP'),
 ('sky', 'NN', 'I-NP'),
 ('was', 'VBD', 'O'),
 ('delivering', 'VBG', 'O'),
 ('.', '.', 'O'),
 ('For', 'IN', 'O'),
 ('a', 'DT', 'O'),
 ('second', 'JJ', 'O'),
 (',', ',', 'O'),
 ('like', 'IN', 'O'),
 ('a', 'DT', 'B-NP'),
 ('knife', 'NN', 'I-NP'),
 ('catching', 'VBG', 'O'),
 ('a', 'DT', 'B-NP'),
 ('glint', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('light', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('refracting', 'VBG', 'O'),
 ('it', 'PRP', 'O'),
 ('in', 'IN', 'O'),
 ('multitude', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('everything', 'NN', 'B-NP'),
 ('gleamed', 'VBD', 'O'),
 ('white', 'JJ', 'O'),
 ('.', '.', 'O'),
 ('The', 'DT', 'B-NP'),
 ('lightning', 'NN', 'I-NP'),
 ('split', 'VBD', 'O'),
 ('t

## Defining documents

In [2]:
# Defining documents (=sentenses)
d1 = 'The car is driven on the road.'
d2 = 'The truck is driven on the highway.'
d3 = 'The bicycle is driven on the bicycle path.'

corpus_01 = d1 + ' ' + d2 + ' ' + d3
corpus_01

'The car is driven on the road. The truck is driven on the highway. The bicycle is driven on the bicycle path.'

## Text preprocessing
#### Steps:
- Text to lowercase
- Removing punctuations
- Tokenization (Aufteilung in einzelne Elemente aus den Sätzen)
- Removal of stop words
- Lemmatization

### Text to lowercase

In [3]:
# Text to lowercase function (Grund: man möchte möglichst alles einheitlich haben im Text)
def text_lowercase(text):
    return text.lower()

# Text to lowercase
corpus_02 = text_lowercase(corpus_01)
corpus_02

'the car is driven on the road. the truck is driven on the highway. the bicycle is driven on the bicycle path.'

### Removing punctuation

In [4]:
# Remove punctuation function
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

# Remove punctuation
corpus_03 = remove_punctuation(corpus_02)
corpus_03

'the car is driven on the road the truck is driven on the highway the bicycle is driven on the bicycle path'

### Tokenize text & removal of stopwords

In [5]:
# Show english stopwords
eng_stopwords = set(stopwords.words('english'))
print("List of english stopwords:")
print(eng_stopwords)

List of english stopwords:
{'very', 'shan', 'an', 'what', 'myself', 'itself', 'my', 'herself', 'had', "couldn't", 'yourselves', "you'll", "weren't", "you'd", 'here', 'was', 'again', 'to', 'mustn', 'the', "should've", 'then', 'same', 'down', 'before', 'they', 'than', 'we', 'have', 'those', 'am', 'out', 'ain', 'with', 'above', 'hasn', 'ma', 'a', 'his', "doesn't", "mustn't", "isn't", "won't", 'yourself', 'why', 'theirs', 'now', 'other', 're', 'while', 'ourselves', 'on', 'just', 'o', 'our', 'at', 'up', 'as', 'most', 'will', "shan't", 'your', 'them', "hasn't", 'nor', 'so', 'such', 'her', 'aren', 't', 'should', 'only', "wouldn't", 'after', 'and', 'if', "she's", 'doing', 'themselves', 'their', 'been', 'weren', 'because', 'all', 'but', 'y', 'isn', 'both', 'be', 'himself', 'this', 'below', "hadn't", 'do', "you're", 'you', 'through', 'that', 'during', 'hadn', "needn't", 'shouldn', 'does', 'once', 'who', "wasn't", "it's", 'whom', 'from', 'no', 'these', "aren't", 'it', 'its', 'not', 'haven', 'your

In [18]:
# import nltk
# nltk.download('punkt')
# Function for tokenization and the removal of stopwords
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return filtered_text
 
# Remove stopwords
corpus_04 = remove_stopwords(corpus_03)
print(corpus_04, end="")

['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path']

### Lemmatization

In [7]:
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# Initialize Lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize string function
def lemmatize_word(text):
    word_tokens = word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return lemmas

# Lemmatize
lem = []
for i in corpus_04:
    lem.append(lemmatize_word(i))

# Nested list to list
corpus_05 = [' '.join([str(x) for x in lst]) for lst in lem]

print('Before lemmatization:')
print(corpus_04, '\n')

print('After lemmatization:')
print(corpus_05, end="")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\janin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\janin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Before lemmatization:
['car', 'driven', 'road', 'truck', 'driven', 'highway', 'bicycle', 'driven', 'bicycle', 'path'] 

After lemmatization:
['car', 'drive', 'road', 'truck', 'drive', 'highway', 'bicycle', 'drive', 'bicycle', 'path']

## Redefine the text corpus (pre-processed)

In [8]:
# We will use the lemmatized words above to re-define our corpus 
# durch die Lemmatisierung wurde er Satz auf folgende Hauptwörter eingeschränkt => die Anführungsstriche sollten nur am Anfang/Ende sein (damit es erkennt, dass es ein Satz ist)
corpus = ['car drive road', 
          'truck drive highway', 
          'bicycle drive bicycle path']


## Document-term matrix with different ngram_range=(1,1)

In [9]:
# Vectorizer with ngram_range=(1,1) - Unigramm = N-Gramm
vectorizer = CountVectorizer(min_df=0, ngram_range=(1,1))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

#bycicle kommt zweimal vor | drive kommt in allen 3 Dkomenten vor etc.

Document-term matrix
   bicycle  car  drive  highway  path  road  truck
0        0    1      1        0     0     1      0
1        0    0      1        1     0     0      1
2        2    0      1        0     1     0      0


## Document-term matrix with ngram_range=(2,2)

In [10]:
# Vectorizer with with ngram_range=(2,2) mit 2- hat man zwei folgewörter (B-Gramm)
vectorizer = CountVectorizer(min_df=0, ngram_range=(2,2))

# Transform 
count = vectorizer.fit_transform(corpus)
 
# Create dataframe
df_count = pd.DataFrame(count.toarray(),
                        columns=vectorizer.get_feature_names_out())

print('Document-term matrix')
print(df_count)

Document-term matrix
   bicycle drive  bicycle path  car drive  drive bicycle  drive highway  \
0              0             0          1              0              0   
1              0             0          0              0              1   
2              1             1          0              1              0   

   drive road  truck drive  
0           1            0  
1           0            1  
2           0            0  


## Term frequency-inverse document frequency (TF-IDF)
- For details see: https://www.learndatasci.com/glossary/tf-idf-term-frequency-inverse-document-frequency

### Term Frequency (TF)

In [11]:
# Compute Term Frequency (TF)
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))
    
print('Number of words in the corpus:',len(words_set), '\n')
print('The words in the corpus: \n', words_set)

# Number of documents in the corpus
n_docs = len(corpus)

# Number of unique words in the corpus 
n_words_set = len(words_set)

df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), 
                     columns=list(words_set))

print("\nTerm Frequency (TF):")
for i in range(n_docs):
    # Words in the document
    words = corpus[i].split(' ')
    for w in words:
        df_tf[w][i] = df_tf[w][i] + (1 / len(words))
        
print(df_tf.round(4))
 
# TF=> wie oft wöter vorkommen (wörter die häufig vokommen, werden heruntergewichtet, die die weniger vorkommen werden höher gewichtet)
# am höchsten gewichtet wird das wort "bycicle"

Number of words in the corpus: 7 

The words in the corpus: 
 {'highway', 'bicycle', 'road', 'car', 'truck', 'path', 'drive'}

Term Frequency (TF):
   highway  bicycle    road     car   truck  path   drive
0   0.0000      0.0  0.3333  0.3333  0.0000  0.00  0.3333
1   0.3333      0.0  0.0000  0.0000  0.3333  0.00  0.3333
2   0.0000      0.5  0.0000  0.0000  0.0000  0.25  0.2500


### Inverse Document Frequency (IDF)

In [12]:
# Computing Inverse Document Frequency (IDF)
print("\nInverse Document Frequency (IDF):")

idf = {}

for w in words_set:
    
    # k = number of documents that contain this word
    k = 0
    
    for i in range(n_docs):
        if w in corpus[i].split():
            k += 1
            
    idf[w] =  np.log10(n_docs / k).round(4)
    
    print(f'{w:>15}: {idf[w]:>10}')


Inverse Document Frequency (IDF):
        highway:     0.4771
        bicycle:     0.4771
           road:     0.4771
            car:     0.4771
          truck:     0.4771
           path:     0.4771
          drive:        0.0


### Term Frequency - Inverse Document Frequency (TF-IDF)

In [13]:
# Computing TF-IDF
df_tf_idf = df_tf.copy()

for w in words_set:
    for i in range(n_docs):
        df_tf_idf[w][i] = df_tf[w][i] * idf[w]

print('\nTF-IDF:')
print(df_tf_idf.round(4))


TF-IDF:
   highway  bicycle   road    car  truck    path  drive
0    0.000   0.0000  0.159  0.159  0.000  0.0000    0.0
1    0.159   0.0000  0.000  0.000  0.159  0.0000    0.0
2    0.000   0.2386  0.000  0.000  0.000  0.1193    0.0


## Part-of-Speach (POS) tagging
For meaning of POS-tags see: https://pythonexamples.org/nltk-pos-tagging

In [14]:
# import nltk
# nltk.download('averaged_perceptron_tagger')

text = '''European authorities fined Google a record $5.1 
          billion on Wednesday for abusing its power in the 
          mobile phone market.'''

def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

sent = preprocess(text)
pattern = 'NP: {<DT>?<JJ>*<NN>}'

cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)

iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)
#idee man sollte nun die Begriffe automatisiert auslesen können => mach tman mit tagging

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('.', '.', 'O')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\janin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [15]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2022-11-30 12:48:03
Python Version: 3.9.7
-----------------------------------
