<a href="https://colab.research.google.com/github/arjunjanamatti/learn_and_practise_spacy/blob/master/sparkNLP_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os

# Install java
! apt-get update -qq
! apt-get install -y openjdk-8-jdk-headless -qq > /dev/null

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["PATH"] = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]
! java -version

# Install pyspark
! pip install --ignore-installed -q pyspark==2.4.4
! pip install --ignore-installed -q spark-nlp==2.5.4


openjdk version "1.8.0_265"
OpenJDK Runtime Environment (build 1.8.0_265-8u265-b01-0ubuntu2~18.04-b01)
OpenJDK 64-Bit Server VM (build 25.265-b01, mixed mode)
[K     |████████████████████████████████| 215.7MB 70kB/s 
[K     |████████████████████████████████| 204kB 54.4MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 133kB 3.4MB/s 
[?25h

### Start Spark NLP session

In [3]:
import sparknlp

# Start the Spark session through Spark NLP's helper and keep the handle for
# later cells.
spark = sparknlp.start()

# Report the library versions actually in use.
spark_nlp_version = sparknlp.version()
apache_spark_version = spark.version
print('Spark NLP version', spark_nlp_version)
print()
print('Apache spark version', apache_spark_version)

Spark NLP version 2.5.4

Apache spark version 2.4.4


### Using pre-trained pipelines of SparkNLP

In [4]:
from sparknlp.pretrained import PretrainedPipeline

In [4]:
# Three-sentence sample text passed to the pretrained pipelines in later cells.
# The literal starts and ends with a newline and is left exactly as written,
# including the misspelling "experiene", because it is pipeline input
# (NOTE(review): presumably intentional to exercise the spell-check stage — confirm).
sample_sentences = '''
Arjun is currently working in the field of data science.
Previously he was working in Gulf and had studied his Masters in USA.
He has diverse experiene in oil industry with knowledge of Machine learning, Deep Learning
'''


* Explain Document ML (`explain_document_ml`) steps
  * Document assembler
  * Sentence detector
  * Tokenizer
  * Lemmatizer
  * Stemmer
  * Part of speech
  * Spellcheck

In [5]:
# Fetch the pretrained English `explain_document_ml` pipeline.
pipeline = PretrainedPipeline(name='explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]


In [6]:
# Annotate the sample text with the ML pipeline, then list the annotation
# types that came back.
result = pipeline.annotate(target=sample_sentences)
result.keys()

dict_keys(['document', 'spell', 'pos', 'lemmas', 'token', 'stems', 'sentence'])

In [7]:
# Sentence-level output of the pipeline for the sample text.
result['sentence']

['Arjun is currently working in the field of data science.',
 'Previously he was working in Gulf and had studied his Masters in USA.',
 'He has diverse experiene in oil industry with knowledge of Machine learning, Deep Learning']

In [8]:
# Line up every token with its lemma and part-of-speech tag.
[(token, lemma, tag)
 for token, lemma, tag in zip(result['token'], result['lemmas'], result['pos'])]

[('Arjun', 'Arjun', 'NNP'),
 ('is', 'be', 'VBZ'),
 ('currently', 'currently', 'RB'),
 ('working', 'work', 'VBG'),
 ('in', 'in', 'IN'),
 ('the', 'the', 'DT'),
 ('field', 'field', 'NN'),
 ('of', 'of', 'IN'),
 ('data', 'data', 'NNS'),
 ('science', 'science', 'NN'),
 ('.', '.', '.'),
 ('Previously', 'Previously', 'RB'),
 ('he', 'he', 'PRP'),
 ('was', 'be', 'VBD'),
 ('working', 'work', 'VBG'),
 ('in', 'in', 'IN'),
 ('Gulf', 'Gulf', 'NNP'),
 ('and', 'and', 'CC'),
 ('had', 'have', 'VBD'),
 ('studied', 'study', 'VBN'),
 ('his', 'he', 'PRP$'),
 ('Masters', 'Masters', 'NNP'),
 ('in', 'in', 'IN'),
 ('USA', 'USA', 'NNP'),
 ('.', '.', '.'),
 ('He', 'He', 'PRP'),
 ('has', 'have', 'VBZ'),
 ('diverse', 'diverse', 'JJ'),
 ('experiene', 'experience', 'NN'),
 ('in', 'in', 'IN'),
 ('oil', 'oil', 'NN'),
 ('industry', 'industry', 'NN'),
 ('with', 'with', 'IN'),
 ('knowledge', 'knowledge', 'NN'),
 ('of', 'of', 'IN'),
 ('Machine', 'Machine', 'NNP'),
 ('learning', 'learn', 'VBG'),
 (',', ',', ','),
 ('Deep', '

In [9]:
import pandas as pd

In [10]:
# Tabulate the per-token annotations; columns appear in token, pos, lemmas order.
pd.DataFrame(data={column: result[column]
                   for column in ('token', 'pos', 'lemmas')})

Unnamed: 0,token,pos,lemmas
0,Arjun,NNP,Arjun
1,is,VBZ,be
2,currently,RB,currently
3,working,VBG,work
4,in,IN,in
5,the,DT,the
6,field,NN,field
7,of,IN,of
8,data,NNS,data
9,science,NN,science


* Explain Document DL (`explain_document_dl`) steps
  * Document assembler
  * Sentence detector
  * Tokenizer
  * NER (Named Entity Recognition with GloVe 100D embeddings, CoNLL2003 dataset)
  * Lemmatizer
  * Stemmer
  * Part of speech
  * Spellcheck

In [11]:
# Fetch the pretrained English `explain_document_dl` pipeline.
pipeline_dl = PretrainedPipeline(name='explain_document_dl', lang='en')

explain_document_dl download started this may take some time.
Approx size to download 168.4 MB
[OK!]


In [12]:
# Annotate the same sample text with the DL pipeline, then list the annotation
# types that came back.
result_dl = pipeline_dl.annotate(target=sample_sentences)
result_dl.keys()

dict_keys(['entities', 'stem', 'checked', 'lemma', 'document', 'pos', 'token', 'ner', 'embeddings', 'sentence'])

In [13]:
# Entity strings returned by the DL pipeline for the sample text.
result_dl['entities']

['Arjun', 'Gulf', 'Masters', 'USA', 'Machine', 'Deep Learning']

In [14]:
# Tabulate token / NER tag / embeddings output per token.
# Note: in the rendered output the 'embeddings' column shows the token text,
# not numeric vectors.
pd.DataFrame(data={column: result_dl[column]
                   for column in ('token', 'ner', 'embeddings')})

Unnamed: 0,token,ner,embeddings
0,Arjun,B-PER,Arjun
1,is,O,is
2,currently,O,currently
3,working,O,working
4,in,O,in
5,the,O,the
6,field,O,field
7,of,O,of
8,data,O,data
9,science,O,science


### Recognize entities DL

In [15]:
# Fetch the pretrained English `recognize_entities_dl` pipeline.
recognize_entities_pipeline = PretrainedPipeline(name='recognize_entities_dl',
                                                 lang='en')

recognize_entities_dl download started this may take some time.
Approx size to download 159 MB
[OK!]


In [16]:
# Run entity recognition on the sample text, then list the annotation types
# that came back.
result_recognize_entities = recognize_entities_pipeline.annotate(target=sample_sentences)
result_recognize_entities.keys()

dict_keys(['entities', 'document', 'token', 'ner', 'embeddings', 'sentence'])

In [17]:
# Entity strings found by the recognize-entities pipeline.
result_recognize_entities['entities']

['Arjun', 'Gulf', 'Masters', 'USA', 'Machine', 'Deep Learning']

In [18]:
# Tabulate token / NER tag / embeddings output per token.
# Note: in the rendered output the 'embeddings' column shows the token text,
# not numeric vectors.
pd.DataFrame(data={column: result_recognize_entities[column]
                   for column in ('token', 'ner', 'embeddings')})

Unnamed: 0,token,ner,embeddings
0,Arjun,B-PER,Arjun
1,is,O,is
2,currently,O,currently
3,working,O,working
4,in,O,in
5,the,O,the
6,field,O,field
7,of,O,of
8,data,O,data
9,science,O,science


### Cleaning Stop Words

In [19]:
# Fetch the pretrained English `clean_stop` pipeline.
clean_stop_words_pipeline = PretrainedPipeline(name='clean_stop', lang='en')

clean_stop download started this may take some time.
Approx size to download 12.4 KB
[OK!]


In [20]:
# Run stop-word cleaning on the sample text, then list the annotation types
# that came back.
result_stop_words = clean_stop_words_pipeline.annotate(target=sample_sentences)
result_stop_words.keys()

dict_keys(['document', 'sentence', 'token', 'cleanTokens'])

In [21]:
# Show the sentences before and after stop-word removal, then list the tokens
# that were dropped.
print('Original sentence: \n', result_stop_words['sentence'])
print()
cleaned_sentence = [' '.join(result_stop_words['cleanTokens'])]
print('Cleaned sentence: \n', cleaned_sentence)
print()
print('Stop Words in the sample sentence: ')
# Set membership avoids rescanning the whole cleanTokens list for every token;
# the original token order (including repeats) is preserved in the result.
kept_tokens = set(result_stop_words['cleanTokens'])
[token for token in result_stop_words['token'] if token not in kept_tokens]

Original sentence: 
 ['Arjun is currently working in the field of data science.', 'Previously he was working in Gulf and had studied his Masters in USA.', 'He has diverse experiene in oil industry with knowledge of Machine learning, Deep Learning']

Cleaned sentence: 
 ['Arjun currently working field data science . Previously working Gulf studied Masters USA . diverse experiene oil industry knowledge Machine learning , Deep Learning']

Stop Words in the sample sentence: 


['is',
 'in',
 'the',
 'of',
 'he',
 'was',
 'in',
 'and',
 'had',
 'his',
 'in',
 'He',
 'has',
 'in',
 'with',
 'of']

### Cleaning Slang

In [22]:
# Fetch the pretrained English `clean_slang` pipeline and, in the same cell,
# run it on a short informal sentence.
clean_slang = PretrainedPipeline(name='clean_slang', lang='en')

result_clean_slang = clean_slang.annotate('Yo baby, call me ASAP')
result_clean_slang.keys()

clean_slang download started this may take some time.
Approx size to download 21.8 KB
[OK!]


dict_keys(['document', 'token', 'normal'])

In [23]:
# Normalized tokens produced by the slang pipeline.
result_clean_slang['normal']

['hey', 'baby', 'call', 'me', 'as', 'soon', 'as', 'possible']

In [24]:
# Join the normalized tokens back into one space-separated sentence
# (wrapped in a single-element list).
[' '.join(result_clean_slang['normal'])]

['hey baby call me as soon as possible']

### Spell Checker

In [25]:
# Fetch the pretrained English `check_spelling` pipeline.
spell_checker_pipeline = PretrainedPipeline(name='check_spelling', lang='en')

check_spelling download started this may take some time.
Approx size to download 892.6 KB
[OK!]


In [26]:
# Deliberately misspelled input shared by both spell-checker demos below.
sample_sentence_check_spelling = 'I em goong to perty tonight'

In [27]:
# Run the spell checker on the misspelled sentence, then list the annotation
# types that came back.
result_spell_check = spell_checker_pipeline.annotate(target=sample_sentence_check_spelling)
result_spell_check.keys()

dict_keys(['document', 'sentence', 'token', 'checked'])

In [28]:
# Print the input next to the spell-checked tokens joined back into a sentence.
print('Original sentence: \n', sample_sentence_check_spelling)
print()
checked_tokens = result_spell_check['checked']
corrected = [' '.join(checked_tokens)]
print('Spell corrected sentence: \n', corrected)

Original sentence: 
 I em goong to perty tonight

Spell corrected sentence: 
 ['I em gon to party tonight']


* In the output above, `em` is left uncorrected (expected `am`) and `goong` is changed to `gon` rather than `going`; only `perty` is corrected as intended (to `party`).

### Spell Checker DL

In [29]:
# Fetch the pretrained English `check_spelling_dl` pipeline.
spell_checker_pipeline_dl = PretrainedPipeline(name='check_spelling_dl', lang='en')

check_spelling_dl download started this may take some time.
Approx size to download 112.1 MB
[OK!]


In [30]:
# Run the DL spell checker on the same misspelled sentence, then list the
# annotation types that came back.
result_spell_check_dl = spell_checker_pipeline_dl.annotate(target=sample_sentence_check_spelling)
result_spell_check_dl.keys()

dict_keys(['document', 'sentence', 'token', 'checked'])

In [31]:
# Print the input next to the DL spell-checked tokens joined back into a sentence.
print('Original sentence: \n', sample_sentence_check_spelling)
print()
checked_tokens_dl = result_spell_check_dl['checked']
corrected = [' '.join(checked_tokens_dl)]
print('Spell corrected sentence: \n', corrected)

Original sentence: 
 I em goong to perty tonight

Spell corrected sentence: 
 ['I me going to Berty tonight']


### Pipeline for lists

In [33]:
# The sample sentences as separate list items, one string per sentence.
sample_sentences_list = [
    'Arjun is currently working in the field of data science.',
    'Previously he was working in Gulf and had studied his Masters in USA.',
    'He has diverse experiene in oil industry with knowledge of Machine learning, Deep Learning',
]


In [34]:
# Fetch the pretrained English `explain_document_ml` pipeline for the
# list-input example.
pipeline = PretrainedPipeline(name='explain_document_ml', lang='en')

explain_document_ml download started this may take some time.
Approx size to download 9.4 MB
[OK!]


In [43]:
# Annotating a list gives back one result per input string (indexed by
# position below). Keep it under its own name instead of overwriting the
# single-text `result` dict, so cells that index `result` by annotation name
# (e.g. result['token']) still work when re-run.
results_per_sentence = pipeline.annotate(target=sample_sentences_list)
second_sentence = results_per_sentence[1]
print('Original sentence: \n', second_sentence['document'])
print()
print('Lemma or normalized words: \n', [' '.join(second_sentence['lemmas'])])

Original sentence: 
 ['Previously he was working in Gulf and had studied his Masters in USA.']

Lemma or normalized words: 
 ['Previously he be work in Gulf and have study he Masters in USA .']


### Using fullAnnotate

In [44]:
# Short sentence used as input for the fullAnnotate example.
sample_text = 'Peter Parker is a nice guy and lives in New York'

In [46]:
# fullAnnotate on the DL pipeline; the result is indexed by position and then
# by annotation name in the next cell.
detailed_result = pipeline_dl.fullAnnotate(target=sample_text)

In [48]:
# Entity annotations for the first (only) input text; the rendered output shows
# Annotation objects carrying offsets, the chunk text and metadata.
detailed_result[0]['entities']

[Annotation(chunk, 0, 11, Peter Parker, {'entity': 'PER', 'sentence': '0', 'chunk': '0'}),
 Annotation(chunk, 40, 47, New York, {'entity': 'LOC', 'sentence': '0', 'chunk': '1'})]

### Use the pre-trained `match_chunks` pipeline to extract individual noun phrases
* Stages
  * Document Assembler
  * Sentence Detector
  * Tokenizer
  * Part of speech
  * Chunker

In [49]:
# Fetch the pretrained English `match_chunks` pipeline.
pipeline_chunk = PretrainedPipeline(name='match_chunks', lang='en')

match_chunks download started this may take some time.
Approx size to download 4.3 MB
[OK!]


In [50]:
# Input sentence for the chunking example.
sample_text_for_chunk = 'This book has many chapters'

# Annotate it, then list the annotation types that came back.
result_chunk = pipeline_chunk.annotate(target=sample_text_for_chunk)
result_chunk.keys()

dict_keys(['chunk', 'document', 'pos', 'token', 'sentence'])

In [51]:
# Chunks matched in the sample sentence.
result_chunk['chunk']

['This book']

### Extract dates

In [5]:
# Fetch the pretrained English `match_datetime` pipeline.
pipeline_dates = PretrainedPipeline(name='match_datetime', lang='en')

match_datetime download started this may take some time.
Approx size to download 12.9 KB
[OK!]


In [8]:
# Input for the date-extraction example; note that the text names a day and
# month but no year.
sample_text_dates = 'I have a flight on 20th June'

In [9]:
# Extract dates from the sample text and display the whole result.
# The input has no year, yet the rendered 'date' value includes one
# (NOTE(review): presumably filled in from the run date — confirm; a later
# re-run may therefore show a different year).
result_dates = pipeline_dates.annotate(target=sample_text_dates)
result_dates

{'date': ['2020/06/20'],
 'document': ['I have a flight on 20th June'],
 'sentence': ['I have a flight on 20th June'],
 'token': ['I', 'have', 'a', 'flight', 'on', '20th', 'June']}

In [10]:
# Only the extracted date strings.
result_dates['date']

['2020/06/20']

### Sentiment Analysis

#### Vivekn algorithm

In [11]:
# Fetch the pretrained English `analyze_sentiment` pipeline.
sentiment = PretrainedPipeline(name='analyze_sentiment', lang='en')

analyze_sentiment download started this may take some time.
Approx size to download 4.9 MB
[OK!]


In [12]:
# Score a short negative review and show the predicted sentiment label.
result_sentiment = sentiment.annotate(target='The movie watched was not that good')
result_sentiment['sentiment']

['negative']

#### DL version (trained on IMDb)

In [14]:
# Fetch the pretrained English `analyze_sentimentdl_use_imdb` pipeline.
# The variable name is spelled "imbd"; it is kept as-is because the cell that
# runs the pipeline refers to it by this name.
sentiment_imbd = PretrainedPipeline(name='analyze_sentimentdl_use_imdb', lang='en')

analyze_sentimentdl_use_imdb download started this may take some time.
Approx size to download 935.8 MB
[OK!]


In [15]:
# Longer movie review used as input for the IMDB-trained sentiment pipeline
# (the literal starts and ends with a newline).
comment = '''
It's a very scary film but what impressed me was how true the film sticks to the original's tricks; it isn't filled with loud in-your-face jump scares, in fact, a lot of what makes this film scary is the slick cinematography and intricate shadow play. The use of lighting and creation of atmosphere is what makes this film so tense, which is why it's perfectly suited for those who like Horror movies but without the obnoxious gore.
'''

# Annotate the review, then list the annotation types that came back.
result_sentiment_imdb = sentiment_imbd.annotate(target=comment)
result_sentiment_imdb.keys()

dict_keys(['document', 'sentence_embeddings', 'sentiment'])

In [16]:
# Predicted sentiment label(s) for the review.
result_sentiment_imdb['sentiment']

['positive']

#### DL version (trained on a Twitter dataset)

In [17]:
# Fetch the pretrained English `analyze_sentimentdl_use_twitter` pipeline.
sentiment_twitter = PretrainedPipeline(name='analyze_sentimentdl_use_twitter',
                                       lang='en')

analyze_sentimentdl_use_twitter download started this may take some time.
Approx size to download 928.3 MB
[OK!]


In [18]:
# Score the same short negative review with the Twitter-trained pipeline and
# show the predicted sentiment label.
result_sentiment_twitter = sentiment_twitter.annotate(target='The movie watched was not that good')
result_sentiment_twitter['sentiment']

['negative']