# Language models. N-grams


## Types of Language Models
There are primarily two types of Language Models:

1. Statistical Language Models: These models use traditional statistical techniques like N-grams, Hidden Markov Models (HMM) and certain linguistic rules to learn the probability distribution of words
2. Neural Language Models: These use different kinds of Neural Networks to model language


# Statistical Language Model

## Model implementation

In [1]:
#import nltk
#nltk.download('reuters', quiet=True)

In [2]:
%%capture 

import subprocess
import nltk
# Download and unzip reuters
try:
    nltk.data.find('reuters.zip')
except:
    nltk.download('reuters', download_dir='/kaggle/working/', quiet=True, force=True)
    command = "unzip /kaggle/working/corpora/reuters.zip -d /kaggle/working/corpora"
    result = subprocess.run(command.split(), capture_output = True, text = True )
    nltk.data.path.append('/kaggle/working/')


In [3]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

# Create a placeholder for model
model = defaultdict(lambda: defaultdict(lambda: 0))

# Count frequency of co-occurance  
for sentence in reuters.sents():
    lower_sentence = [word.lower() for word in sentence]
    for word1, word2, word3 in trigrams(lower_sentence, pad_right=True, pad_left=True):
        model[(word1, word2)][word3] += 1
        
        
# Let's transform the counts to probabilities
for word1_word2 in model:
    total_count = float(sum(model[word1_word2].values()))
    for word3 in model[word1_word2]:
        model[word1_word2][word3] /= total_count

In [4]:
import random

# starting words
text = ["today", "the"]
sentence_finished = False
 
while not sentence_finished:
    # select a random probability threshold  
    probability_threshold = random.random()
    accumulator = .0

    for word in model[tuple(text[-2:])].keys():
        accumulator += model[tuple(text[-2:])][word]
        
        # select words that are above the probability threshold
        if accumulator >= probability_threshold:
            text.append(word)
            break

    if text[-2:] == [None, None]:
        sentence_finished = True

print (' '.join([t for t in text if t]))

today the emirate ' s u . s .


## Alternative model implementation

In [5]:
import string
from nltk.corpus import stopwords
from nltk import FreqDist


#  remove the n-grams with removable words
def remove_stopwords(ngrams, removal_list):     
    y = []
    for pair in ngrams:
        count = 0
        for word in pair:
            if word in removal_list:
                count = count or 0
            else:
                count = count or 1
        if (count==1):
            y.append(pair)
    return (y)

def pick_word(counter):
    "Chooses a random element."
    return random.choice(list(counter.elements()))


In [6]:
# input the reuters sentences
sents  = reuters.sents()
  
# write the removal characters such as : Stopwords and punctuation
stop_words = set(stopwords.words('english'))
string.punctuation = string.punctuation +'"'+'"'+'-'+'''+'''+'—'
string.punctuation
removal_list = list(stop_words) + list(string.punctuation)+ ['lt','rt']

In [7]:
# generate unigrams bigrams trigrams
unigram=[]
bigram=[]
trigram=[]
tokenized_text=[]
for sentence in sents:
    sentence = list(map(lambda x:x.lower(),sentence))
    for word in sentence:
        if word== '.':
            sentence.remove(word) 
        else:
            unigram.append(word)
    
    tokenized_text.append(sentence)
    bigram.extend(list(nltk.ngrams(sentence, 2,pad_left=True, pad_right=True)))
    trigram.extend(list(nltk.ngrams(sentence, 3, pad_left=True, pad_right=True)))

In [8]:
unigram = remove_stopwords(unigram, removal_list)
bigram = remove_stopwords(bigram,removal_list)
trigram = remove_stopwords(trigram,removal_list)
  

In [9]:
# generate frequency of n-grams 
freq_bi = FreqDist(bigram)
freq_tri = FreqDist(trigram)

In [10]:
d = defaultdict(Counter)
for a, b, c in freq_tri:
    if(a != None and b!= None and c!= None):
        d[(a, b)] += {(a, b, c) : freq_tri[a, b, c]}

In [11]:
# Next word prediction      
s=''

prefix = "today", "the"
print(" ".join(prefix))
s = " ".join(prefix)
for i in range(19):
    suffix = pick_word(d[prefix])[-1]
    s=s+' '+suffix
    print(s)
    prefix = prefix[1], suffix

today the

today the emirate

today the emirate '

today the emirate ' s

today the emirate ' s graphic

today the emirate ' s graphic arts

today the emirate ' s graphic arts group

today the emirate ' s graphic arts group for

today the emirate ' s graphic arts group for about

today the emirate ' s graphic arts group for about 1

today the emirate ' s graphic arts group for about 1 ,

today the emirate ' s graphic arts group for about 1 , 816

today the emirate ' s graphic arts group for about 1 , 816 tonnes

today the emirate ' s graphic arts group for about 1 , 816 tonnes the

today the emirate ' s graphic arts group for about 1 , 816 tonnes the previous

today the emirate ' s graphic arts group for about 1 , 816 tonnes the previous quarter

today the emirate ' s graphic arts group for about 1 , 816 tonnes the previous quarter ,

today the emirate ' s graphic arts group for about 1 , 816 tonnes the previous quarter , the

today the emirate ' s graphic arts group for about 1 , 816 

## Task

Task is to classify medical transcription into types of medical speciality with use of **N-grams**. 

In [12]:
import pandas as pd

train_df = pd.read_csv("train.csv")
train_df.head()

Unnamed: 0,id,medical_specialty,transcription
0,0,Cardiovascular / Pulmonary,"PREOPERATIVE DIAGNOSIS: , Persistent pneumonia..."
1,1,General Medicine,"REASON FOR VISIT: , Mr. ABC is a 30-year-old m..."
2,2,Cardiovascular / Pulmonary,"REASON FOR CONSULTATION: , Mesothelioma.,HISTO..."
3,3,General Medicine,"DISCHARGE DIAGNOSES:,1. Chronic obstructive pu..."
4,4,Cardiovascular / Pulmonary,"CHIEF COMPLAINT:, The patient complains of che..."


## Preprocessing


In [13]:
train_df = train_df[train_df["transcription"].notna()].astype(str)

In [14]:
def preprocess(text: pd.Series, tokenizer, stemmer, stop):
    lower = text.apply(lambda x: x.lower())
    tokenized = lower.apply(lambda x: tokenizer.tokenize(x))
    stemmed = tokenized.apply(lambda x: [stemmer.stem(word) for word in x])
    cleaned_text = stemmed.apply(
        lambda x: " ".join(word for word in x if word not in stop)
    )
    return cleaned_text

In [15]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

tokenizer = RegexpTokenizer(r"\w+")
stemmer = SnowballStemmer("english")
stop = stopwords.words("english")

train_df["transcription"] = preprocess(
    train_df["transcription"], tokenizer, stemmer, stop
)

In [16]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(train_df["medical_specialty"])
train_df["class_id"] = le.transform(train_df["medical_specialty"])
train_df.head()

Unnamed: 0,id,medical_specialty,transcription,class_id
0,0,Cardiovascular / Pulmonary,preoper diagnosi persist pneumonia right upper...,0
1,1,General Medicine,reason visit mr abc 30 year old man return fol...,3
2,2,Cardiovascular / Pulmonary,reason consult mesothelioma histori present il...,0
3,3,General Medicine,discharg diagnos 1 chronic obstruct pulmonari ...,3
4,4,Cardiovascular / Pulmonary,chief complaint patient complain chest pain hi...,0


In [17]:
x_train = list(train_df["transcription"].values)
y_train = list(train_df["class_id"].values)
x_train[0], y_train[0]

('preoper diagnosi persist pneumonia right upper lobe lung possibl mass postop diagnosi persist pneumonia right upper lobe lung possibl mass procedur bronchoscopi brush biopsi descript procedur obtain inform consent patient taken oper room underw general endotrach anesthesia time process follow flexibl bronchoscop insert endotrach tube 2 cc 4 lidocain infus endotrach tube first trachea carina normal appear scope pass left side bronchial system found normal scar mucoid secret scope pass right side brown secret obtain collect trap sent cultur sensit aerob anaerob fungi tb first basal lobe explor found normal right upper lobe select cannul abnorm found except secret aspir bronchi go three segment visual abnorm mass found brush biopsi obtain one segment sent patholog procedur interrupt sever time becaus patient desatur minut ambu bag recov satisfactorili end patient toler procedur well sent recoveri room satisfactori condit',
 0)

## N-gram feature extraction


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

n_grams = {"unigram": (1, 1), "bigram": (2, 2), "trigram": (3, 3), 'unigram_bigram': (1, 2)}

In [19]:
features = {}
for n_gram_name, n_gram in n_grams.items():
   vectorizer = CountVectorizer(ngram_range=n_gram)
   features[n_gram_name] = (vectorizer, vectorizer.fit_transform(x_train))

In [20]:
features

{'unigram': (CountVectorizer(),
  <1963x13181 sparse matrix of type '<class 'numpy.int64'>'
  	with 349658 stored elements in Compressed Sparse Row format>),
 'bigram': (CountVectorizer(ngram_range=(2, 2)),
  <1963x221785 sparse matrix of type '<class 'numpy.int64'>'
  	with 513118 stored elements in Compressed Sparse Row format>),
 'trigram': (CountVectorizer(ngram_range=(3, 3)),
  <1963x358434 sparse matrix of type '<class 'numpy.int64'>'
  	with 547708 stored elements in Compressed Sparse Row format>),
 'unigram_bigram': (CountVectorizer(ngram_range=(1, 2)),
  <1963x234966 sparse matrix of type '<class 'numpy.int64'>'
  	with 862776 stored elements in Compressed Sparse Row format>)}

In [21]:
features['unigram'][0].get_feature_names_out()[-50:]

array(['yo', 'yogurt', 'york', 'young', 'younger', 'youngster',
       'youngswick', 'yr', 'yrs', 'yueh', 'yy', 'yyyi', 'zag', 'zanaflex',
       'zantac', 'zaontz', 'zaroxolyn', 'zegerid', 'zeiss', 'zero',
       'zestril', 'zeta', 'zetia', 'ziac', 'ziagen', 'zig', 'zigzag',
       'zimmer', 'zinc', 'zing', 'zithromax', 'zocor', 'zofran', 'zoloft',
       'zolpidem', 'zone', 'zonegran', 'zoonot', 'zoster', 'zosyn',
       'zuba', 'zumi', 'zygoma', 'zygomat', 'zyloprim', 'zyprexa',
       'zyrtec', 'zyvox', 'µiu', 'µl'], dtype=object)

## Classification

In [22]:
from sklearn.neighbors import KNeighborsClassifier
clf= KNeighborsClassifier(5) 
clf.fit(features['unigram'][1], y_train)

In [23]:
from sklearn.metrics import f1_score

f1_score(y_train, clf.predict(features['unigram'][1]), average='macro')

0.6380662701151707

### Predict

In [24]:
test_df = pd.read_csv('test.csv')
test_df.head()

Unnamed: 0,id,transcription
0,0,"INDICATIONS FOR PROCEDURE:, The patient has pr..."
1,1,"CLINICAL HISTORY: ,This 78-year-old black woma..."
2,2,"PREOPERATIVE DIAGNOSIS: , Penoscrotal abscess...."
3,3,"INDICATIONS:, Ischemic cardiomyopathy, status..."
4,4,"PREOPERATIVE DIAGNOSIS: , Ruptured distal bice..."


In [25]:
test_df["transcription"] = preprocess(
    test_df["transcription"], tokenizer, stemmer, stop
)
test_df.head()

Unnamed: 0,id,transcription
0,0,indic procedur patient present atyp type right...
1,1,clinic histori 78 year old black woman histori...
2,2,preoper diagnosi penoscrot abscess postop diag...
3,3,indic ischem cardiomyopathi status post inferi...
4,4,preoper diagnosi ruptur distal bicep tendon ri...


In [26]:
x_test= list(test_df["transcription"].values)
unigrams = features['unigram'][0].transform(x_test)
unigrams.shape
y_pred = clf.predict(unigrams)

In [27]:
test_df['class_id'] = y_pred
test_df.head()

Unnamed: 0,id,transcription,class_id
0,0,indic procedur patient present atyp type right...,0
1,1,clinic histori 78 year old black woman histori...,4
2,2,preoper diagnosi penoscrot abscess postop diag...,4
3,3,indic ischem cardiomyopathi status post inferi...,0
4,4,preoper diagnosi ruptur distal bicep tendon ri...,4
