# Coding Tasks 

In [1]:
# Setup connection to Google Drive
from google.colab import drive
drive.mount('/content/drive/')
%cd "/content/drive/My Drive/code-archive"
!ls

Mounted at /content/drive/
/content/drive/My Drive/code-archive
'“analysis corpur embeddings.ipynb”的副本'   postag.py
 assesment_paper.txt			    'sentence classification'
 cyberpunk.ipynb			     sst2.py
 dataset				     “twitter_analysis.ipynb”的副本
'document classification.ipynb'		     wandb
 logs


# loading file

In [2]:
with open ("assesment_paper.txt") as f:
  data = f.read()

# Task1: Warm-up 

In [97]:
tokens = [token for token in data.split(" ") if token!=" "]
token_start_clinic = [token for token in tokens if token.startswith("clinic")]
print(f"Count of words that start with “clinic”: {len(token_start_clinic)}")

Count of words that start with “clinic”: 10


In [98]:
token_digit = [token for token in tokens if token.isdigit()]
print(f"Count of numbers in the text: {len(token_digit)}")

Count of numbers in the text: 140


In [106]:
token_follow_t = [token for token in tokens if len(token)>1 and token[-1]=='T' and token[:-1].isdigit()]
print(f"Count of numbers followed by “T” (standing for Tesla, the unit of measure): {len(token_follow_t)}")
# An inspection of the corpus indicates there might be space between therefore:
token_follow_t_space=[' '.join([tokens[i-1], token]) for i, token in enumerate(tokens) if token=='T' and tokens[i-1].isdigit()]
print(token_follow_t_space)
token_follow_t_all = token_follow_t + token_follow_t_space
print(f"Count of numbers followed by “T” (standing for Tesla, the unit of measure): {len(token_follow_t_all)}")

Count of numbers followed by “T” (standing for Tesla, the unit of measure): 1
['3 T', '3 T', '3 T', '3 T', '3 T', '3 T', '3 T', '3 T']
Count of numbers followed by “T” (standing for Tesla, the unit of measure): 9


In [107]:
from collections import Counter
most_common = Counter(tokens).most_common(1)
print(f"The most common word is {most_common[0][0]}, with {most_common[0][1]} occurrences.")

The most common word is , with 770 occurrences.


## **Task1 results (incase the console does not show anything :) )**

Count of words that start with “clinic”: 10

Count of numbers in the text: 140

Count of numbers followed by “T” (standing for Tesla, the unit of measure): 9

The most common word is , with 770 occurrences.



# Task2: Understanding the grammar 

In [108]:
import nltk
nltk.download('averaged_perceptron_tagger')
clinic_noun = [item for item in nltk.pos_tag(token_start_clinic) if item[1]=='NN' or item[1]=='NNS']
print(f"Count of words that start with “clinic” and that are a noun/noun phrase {len(clinic_noun)}")

Count of words that start with “clinic” and that are a noun/noun phrase 3


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [113]:
import spacy
from nltk.corpus.reader.tagged import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer()
nlp = spacy.load('en_core_web_sm')
  

s1 = "Another major challenge of lung imaging is the respiratory motion that induces artifacts."
s2 = "While breath-hold acquisitions are commonly performed at end-inspiration, free-breathing  acquisitions are usually performed at tidal volumes."

def common_form(s):
    lemmatized = [lemmatizer.lemmatize(item) for item in nltk.word_tokenize(s)]
    lemmatized = ' '.join(lemmatized)
    print(f"original sentence: {s}")
    print(f"lemmatized sentence: {lemmatized}")
    return

def get_sub_obj(s):
    sub = [tok for tok in nlp(s) if "subj" in tok.dep_ ]
    obj = [tok for tok in nlp(s) if "obj" in tok.dep_ ] 
    print(f"subjects: {sub}, objects: {obj}")
    return
  

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [114]:
common_form(s1)
common_form(s2)
get_sub_obj(s1)
get_sub_obj(s2)

original sentence: Another major challenge of lung imaging is the respiratory motion that induces artifacts.
lemmatized sentence: Another major challenge of lung imaging is the respiratory motion that induces artifact .
original sentence: While breath-hold acquisitions are commonly performed at end-inspiration, free-breathing  acquisitions are usually performed at tidal volumes.
lemmatized sentence: While breath-hold acquisition are commonly performed at end-inspiration , free-breathing acquisition are usually performed at tidal volume .
subjects: [challenge, that], objects: [imaging, artifacts]
subjects: [acquisitions, acquisitions], objects: [inspiration, volumes]


## **Task2 results**

Count of words that start with “clinic” and that are a noun/noun phrase 3

lemmatized s1: Another major challenge of lung imaging is the respiratory motion that induces artifact .

lemmatized s2: While breath-hold acquisition are commonly performed at end-inspiration , free-breathing acquisition are usually performed at tidal volume .

s1: subjects: [challenge, that], objects: [imaging, artifacts]

s2: subjects: [acquisitions, acquisitions], objects: [inspiration, volumes]

# Task3: Identifying entities

In [112]:
from spacy import displacy
entity_sent = "The UK tradition of eating fish battered and fried in oil was introduced to the country by Spanish and  Portuguese Jewish immigrants, who spent time in the Netherlands before settling in the UK as early as the  16th century."
doc = nlp(entity_sent)
#displacy.serve(doc, style="ent")
for ent in doc.ents:
    print(ent.text, ent.label_)

UK GPE
Spanish NORP
Portuguese Jewish NORP
Netherlands GPE
UK GPE
the  16th century DATE


In [44]:
loc = [(ent.text,ent.label_) for ent in nlp(" ".join(tokens)).ents]
all_entity_type = [item[1] for item in loc] 
count = Counter(all_entity_type)
print(count)
for k,v in count.items():
    print(f"The paper contains {v} times of {k}")



Counter({'CARDINAL': 300, 'ORG': 292, 'PERSON': 194, 'GPE': 80, 'PERCENT': 31, 'QUANTITY': 23, 'DATE': 21, 'PRODUCT': 15, 'NORP': 15, 'ORDINAL': 7, 'MONEY': 5, 'TIME': 2, 'FAC': 2, 'LOC': 1, 'LAW': 1, 'WORK_OF_ART': 1})
The paper contains 292 times of ORG
The paper contains 194 times of PERSON
The paper contains 80 times of GPE
The paper contains 300 times of CARDINAL
The paper contains 15 times of PRODUCT
The paper contains 31 times of PERCENT
The paper contains 23 times of QUANTITY
The paper contains 21 times of DATE
The paper contains 15 times of NORP
The paper contains 2 times of TIME
The paper contains 1 times of LOC
The paper contains 7 times of ORDINAL
The paper contains 5 times of MONEY
The paper contains 2 times of FAC
The paper contains 1 times of LAW
The paper contains 1 times of WORK_OF_ART


In [47]:
! python -m spacy download en_core_web_lg

2022-12-13 08:58:35.282779: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[K     |████████████████████████████████| 587.7 MB 13 kB/s 
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.4.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [53]:
import en_core_web_lg
nlp = en_core_web_lg.load()
target = nlp("study")

most_similar = {token: target.similarity(nlp(token)) for token in tokens}
ordered = {k: v for k, v in sorted(most_similar.items(), key=lambda item: item[1], reverse=True)}


  most_similar = {token: target.similarity(nlp(token)) for token in tokens}


In [62]:
keys = list(ordered.keys())
lemma = [word for word in keys if lemmatizer.lemmatize(word)!=lemmatizer.lemmatize("study") and word[:5]!='study']
print(f"Most similar words to “study” are (descending order): {lemma[0]}, {lemma[1]}, {lemma[2]}")

Most similar words to “study” are (descending order): research, analysis, analyses


## **Task3 results**

UK GPE

Spanish NORP

Portuguese Jewish NORP

Netherlands GPE

UK GPE

the  16th century DATE

========================================================
The paper contains 292 times of ORG

The paper contains 194 times of PERSON

The paper contains 80 times of GPE

The paper contains 300 times of CARDINAL

The paper contains 15 times of PRODUCT

The paper contains 31 times of PERCENT

The paper contains 23 times of QUANTITY

The paper contains 21 times of DATE

The paper contains 15 times of NORP

The paper contains 2 times of TIME

The paper contains 1 times of LOC

The paper contains 7 times of ORDINAL

The paper contains 5 times of MONEY

The paper contains 2 times of FAC

The paper contains 1 times of LAW

The paper contains 1 times of WORK_OF_ART

================================================================================

Most similar words to “study” are (descending order): research, analysis, analyses


# Task4: Summarizing content 

In [78]:
nlp = spacy.load('en_core_web_sm')
summarize_sent1= "They baked the pizza with pineapples."
summarize_sent2= "A respiratory physiotherapist trains a volunteer on how to position himself and behave to  perform the HF-NIV, which requires a monitor, a Phasitron and a nose clip."

def summary(s):
    sent = nlp(s)
    root_token = [sentence.root for sentence in sent.sents][0]
    for child in root_token.children:
      if child.dep_ == 'nsubj':
         subj = child
      if child.dep_ == 'dobj':
         obj = child
    return ' '.join([subj.text, root_token.text, obj.text])
print(summary(summarize_sent1))
print(summary(summarize_sent2))


They baked pizza
physiotherapist trains volunteer


## **Task4 results**

They baked pizza

physiotherapist trains volunteer