# Tokenisation

#### Different examples of tokenisation in Python.

#### First, we need to import the nltk libraries and regular expression features

In [1]:
# first install the required packages
!pip3 install nltk
!pip3 install scipy
!pip3 install numpy
!pip3 install pycountry
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('crubadan')

import re

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 2.5 MB/s eta 0:00:00
Collecting joblib
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
     -------------------------------------- 301.8/301.8 KB 2.7 MB/s eta 0:00:00
Installing collected packages: joblib, nltk
Successfully installed joblib-1.4.2 nltk-3.9.1


You should consider upgrading via the 'C:\Users\cotsi\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting scipy
  Downloading scipy-1.14.1-cp310-cp310-win_amd64.whl (44.8 MB)
     ---------------------------------------- 44.8/44.8 MB 2.9 MB/s eta 0:00:00
Installing collected packages: scipy
Successfully installed scipy-1.14.1


You should consider upgrading via the 'C:\Users\cotsi\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'C:\Users\cotsi\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
     ---------------------------------------- 6.3/6.3 MB 33.6 MB/s eta 0:00:00
Installing collected packages: pycountry
Successfully installed pycountry-24.6.1


You should consider upgrading via the 'C:\Users\cotsi\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cotsi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cotsi\AppData\Roaming\nltk_data...
[nltk_data] Downloading package crubadan to
[nltk_data]     C:\Users\cotsi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\crubadan.zip.


#### Next, let's create some sample data to play with...

#### I used a couple of lines of text from a BBC sports page as an example.

In [5]:
# Sample paragraph (two sentences) used as input for the tokenisation examples.
# Adjacent string literals inside the parentheses are joined by Python, so no
# line-continuation backslashes are needed.
text = (
    'The European Super League (ESL) is on "standby" despite nine of the 12 founding '
    'teams withdrawing, says Real Madrid president Florentino Perez.  After a furious '
    'backlash against the proposed tournament that was announced on Sunday, all six '
    'Premier League clubs involved withdrew on Tuesday.'
)

#### Now, we can tokenise by word.

In [9]:
# Tokenise the sample text with NLTK's word tokeniser
# (nltk.word_tokenize is the package-level alias of nltk.tokenize.word_tokenize).
word_tokens = nltk.word_tokenize(text)
print(word_tokens)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\User/nltk_data'
    - 'c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python312\\nltk_data'
    - 'c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python312\\share\\nltk_data'
    - 'c:\\Users\\User\\AppData\\Local\\Programs\\Python\\Python312\\lib\\nltk_data'
    - 'C:\\Users\\User\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


#### You will notice in the list above it treats punctuation as individual tokens.  It's easy enough to strip this out...

In [None]:
# Keep only the tokens made up entirely of alphabetic characters
test = list(filter(str.isalpha, word_tokens))
print(test)

#### Well, the punctuation is gone but notice the 12 disappeared?  That's because it's not an alpha character.  Let's try again.


In [None]:
# Keep tokens that are either alphabetic or numeric, so numbers are retained
word_tokens = list(
    filter(lambda token: token.isalpha() or token.isnumeric(), word_tokens)
)
print(word_tokens)

#### Another quick and dirty way to tokenise is just to split on whitespace...

In [None]:
# split() with no arguments breaks the string on whitespace; punctuation stays
# attached to the neighbouring word.
word_tokens2 = text.split()
print(word_tokens2)

You'll notice that the punctuation is now attached to the individual terms; we can still strip it out.

In [None]:
# Same alphabetic-only filter as before, this time on the whitespace-split tokens
test = list(filter(str.isalpha, word_tokens2))
print(test)

#### Oops, that seems to have gotten rid of any text containing punctuation, as well as the numerical value. Let's try again.

#### We will use a regular expression so that we end up just accepting words from the list.

#### Note: the 2nd parameter, "", is what gets substituted for every character that matches the regular expression (here, any non-word character).  You'll need to use this with care in some tokenisation tasks, as punctuation on its own or other anomalies will turn into "" entries in your list (which are easy enough to strip out at any rate).

In [None]:
# Remove every character that is not a regex "word" character from each token.
# The pattern is a raw string: in a normal string literal "\w" is an invalid
# escape sequence, which newer Python versions warn about at compile time.
word_tokens2 = [re.sub(r'[^\w]', "", word) for word in word_tokens2]
print(word_tokens2)

#### That seems to have done the trick

#### You will see below that the two tokenised versions are now the same, just 2 different ways of doing it.

In [None]:
# Print both token lists one after the other so they can be compared by eye
for tokens in (word_tokens, word_tokens2):
  print(tokens)

#### The next step will be to lower case the terms...

In [None]:
# Normalise case: replace every token with its lower-cased form
word_tokens = list(map(str.lower, word_tokens))
print(word_tokens)

#### You will also undoubtedly come across other cases, like contractions or other oddities

In [None]:
# Tokenise a short sentence containing an apostrophe name and a contraction.
# NOTE: this rebinds `text` (previously the sample paragraph) to a list of tokens.
text = nltk.word_tokenize("O'Niell can't run.")
print(text)

#### So, O'Niell comes out fine, but the contraction has been split up?  This may or may not be useful.

#### Some NLP tools can deal with that type of input.  I generally avoid it which is one reason why splitting on space, in some cases, can make things easier

In [None]:
# Same sentence, tokenised by splitting on whitespace instead.
# NOTE: `text` is rebound from the sentence to its list of tokens.
text = "O'Niell can't run."
text = text.split()
print(text)
# Strip non-word characters (apostrophes, the trailing full stop) from each token.
# Raw string so that "\w" is not treated as an (invalid) string escape sequence.
text = [re.sub(r'[^\w]', "", word) for word in text]
print(text)


# Stemming and Lemmatisation

#### Let's run through some of the examples to see what happens if we stem/lemmatise them

#### First, we need to import some additional nltk libraries to work with this.

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

#### Let's look at the cases with punctuation we introduced above as well as some other "gotcha" cases

In [None]:
porter_stemmer = PorterStemmer()

# Stem each sample word in turn and print the result, one per line
for sample in ("O'Niell", "can't", "cant'", "hers'", "hers", "university", "universe"):
  print(porter_stemmer.stem(sample))

#### Let's stem the longer text we tokenized earlier

In [None]:
# Show the tokens first, then the same tokens after Porter stemming
print(word_tokens)
stemmed_text = list(map(porter_stemmer.stem, word_tokens))
print(stemmed_text)

#### Now let's apply lemmatisation.  

#### Remember, we need the part-of-speech (POS) tag for this to work properly, so let's get that first

In [None]:
# Tag every token with its part of speech; each entry pairs a token with its tag
# (entry[0] is the token, entry[1] the tag).
POSTags = nltk.pos_tag(word_tokens)
print(POSTags)

#### If you don't know what the POS tags mean, you can print out their descriptions

In [None]:
# Print the description of each distinct tag once, in order of first appearance.
# Looking the tag up once per token would print the same description repeatedly
# for every token that shares a tag.
for tag in dict.fromkeys(tag for _, tag in POSTags):
  nltk.help.upenn_tagset(tag)


#### We will need to make a translation function in order to use our POS in the WordNet Lemmatiser as it uses a different set (or subset) of POS tags.

In [None]:
def get_wordnet_post(word):
  """Translate a tagged token's POS tag into the matching WordNet POS constant.

  Args:
    word: a (token, tag) pair; word[0] is the token and word[1] its POS tag.

  Returns:
    wordnet.ADJ, wordnet.NOUN, wordnet.VERB or wordnet.ADV, chosen from the
    first letter of the tag. Any other tag (including an empty one) falls
    back to wordnet.NOUN.
  """
  # Slice rather than index so that an empty tag yields "" (and therefore the
  # NOUN default) instead of raising IndexError.
  first_letter = word[1][:1].upper()
  tag_dictionary = {
      "J": wordnet.ADJ,
      "N": wordnet.NOUN,
      "V": wordnet.VERB,
      "R": wordnet.ADV,
  }

  # retrieve value from dictionary, if not found use default of NOUN
  return tag_dictionary.get(first_letter, wordnet.NOUN)

#### Now we can lemmatise our text...

In [None]:
lemmatiser = WordNetLemmatizer()
print("Lemmatisation of the sentence: ")

# For every tagged token, print the token, its lemma and the WordNet POS used
for tagged_token in POSTags:
  term = tagged_token[0]
  wordnet_pos = get_wordnet_post(tagged_token)
  lemma = lemmatiser.lemmatize(term, pos=wordnet_pos)
  print("[" + term + "]:  " + lemma + " which is a " + wordnet_pos)

#### So, using a lemmatiser is somewhat more work - in practice I find results between stemming/lemmatisation are usually pretty similar, not much to choose between them

#### Next, an example of what happens if you *don't* use a POS tag

In [None]:
# Lemmatise the same word twice: once with an explicit POS of "a", once
# without passing a POS at all
with_pos = lemmatiser.lemmatize("better", pos="a")
without_pos = lemmatiser.lemmatize("better")
print("better :", with_pos)
print("better :", without_pos)

# Language Detection

#### To cap things off, we will look at a model that guesses the language of text.  

#### First, let's come up with some sample pieces of text

In [None]:
import pycountry

# One short sample greeting per language, assigned in a single unpacking
phrase_one, phrase_two, phrase_three, phrase_four = (
    "good morning",  # English
    "goeie more",    # Afrikaans
    "buongiorno",    # Italian
    "좋은 아침",      # Korean
)

#### Now, we will instantiate a text classification model and see what we come up with

In [None]:
tc = nltk.classify.textcat.TextCat()

# Guess a language code for each sample phrase, then print the raw codes
phrases = (phrase_one, phrase_two, phrase_three, phrase_four)
guesses = [tc.guess_language(phrase) for phrase in phrases]
guess_one, guess_two, guess_three, guess_four = guesses

for code in guesses:
  print(code)

# Translate each code into a language name via pycountry's alpha_3 lookup.
# The lookup may come back empty for a code pycountry does not know; print a
# placeholder in that case instead of failing on `.name` of None.
guess_names = []
for code in guesses:
  language = pycountry.languages.get(alpha_3=code)
  if language is None:
    guess_names.append("unknown language code: " + str(code))
  else:
    guess_names.append(language.name)
guess_one_name, guess_two_name, guess_three_name, guess_four_name = guess_names

for name in guess_names:
  print(name)