# Assignment 1

**Due to**: 11/01/2022 (dd/mm/yyyy)

If you deliver it by 11/12/2021 your assignment will be graded by 11/01/2022.


**Credits**: Andrea Galassi, Federico Ruggeri, Paolo Torroni

**Summary**: Part-of Speech (POS) tagging as Sequence Labelling using Recurrent Neural Architectures

# Execution



In [212]:
import nltk

import re
import pandas as pd
import numpy as np

import sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences

nltk.download('treebank')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [168]:
def pre_process(df,string):
  def text_pre_process(text):
      ret = re.sub("RT @(.)+?:\s|(&#[0-9]+;)|@([\w\-]+)|(#)\S+|(http)s?\S+|&gt;|^\s+|\b\s+|\n", "", text)
      ret = re.sub("\s\s+|[^a-zA-Z\d\s:]" , " ", ret).rstrip().lower()
      return ret
  return df[string].apply(text_pre_process)

def Encoding(df,Tags):
  label_encoder = sklearn.preprocessing.LabelEncoder()
  X_lab = label_encoder.fit_transform(df)
  OneHot_encoder = sklearn.preprocessing.OneHotEncoder()
  X = OneHot_encoder.fit_transform(X_lab.reshape(-1,1)).toarray()
  dfOneHot = pd.DataFrame(X, columns = [i for i in Tags])
  df.reset_index(inplace=True,drop=True)
  df = pd.concat([df,dfOneHot],axis=1)
  return df

def create_embedding_matrix(filepath, word_index, embedding_dim):
  vocab_size = len(word_index)+1
  embedding_matrix = np.zeros((vocab_size,embedding_dim))

  with open(filepath, encoding='utf-8') as f:
    for line in f:
      word, *vector = line.split()
      if word in word_index:
        idx = word_index[word]
        embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]
  return embedding_matrix

## 1. Corpus
NOTA: MOLTE COSE NON HANNO SENSO STO SPERIMENTANDO FUNZIONI PER VEDERE DI CAPIRCI QUALCOSA.
### 1.1 Pre-processing

In [195]:
# Get the files' list
fileids = nltk.corpus.treebank.fileids()

# Get the Penn Treebank corpus and tokenize the text
train_corpus = nltk.corpus.treebank.tagged_sents(fileids[:100])
val_corpus = nltk.corpus.treebank.tagged_sents(fileids[100:150])
test_corpus = nltk.corpus.treebank.tagged_sents(fileids[150:])

# Flatten the lists
train_corpus = [item for sublist in train_corpus for item in sublist if item[1] != '-NONE-']
val_corpus = [item for sublist in val_corpus for item in sublist if item[1] != '-NONE-']
test_corpus = [item for sublist in test_corpus for item in sublist if item[1] != '-NONE-']

In [200]:
train_df = pd.DataFrame(train_corpus, columns = ['word', 'tag'])

train_df['word'] = pre_process(train_df,'word')
train_df['tag'] = pre_process(train_df,'tag')

train_df['word'].replace('', np.nan, inplace=True)
train_df['word'].replace(':', np.nan, inplace=True)
train_df['tag'].replace('', np.nan, inplace=True)
train_df.dropna(inplace=True)

train_df.describe()

Unnamed: 0,word,tag
count,41144,41144
unique,7362,35
top,the,nn
freq,2329,6117


In [201]:
val_df = pd.DataFrame(val_corpus, columns = ['word', 'tag'])

val_df['word'] = pre_process(val_df,'word')
val_df['tag'] = pre_process(val_df,'tag')

val_df['word'].replace('', np.nan, inplace=True)
val_df['word'].replace(':', np.nan, inplace=True)
val_df['tag'].replace('', np.nan, inplace=True)
val_df.dropna(inplace=True)

val_df.describe()

Unnamed: 0,word,tag
count,27262,27262
unique,5388,35
top,the,nn
freq,1670,4299


In [202]:
test_df = pd.DataFrame(test_corpus, columns = ['word', 'tag'])

test_df['word'] = pre_process(test_df,'word')
test_df['tag'] = pre_process(test_df,'tag')

test_df['word'].replace('', np.nan, inplace=True)
test_df['word'].replace(':', np.nan, inplace=True)
test_df['tag'].replace('', np.nan, inplace=True)
test_df.dropna(inplace=True)

test_df.describe()

Unnamed: 0,word,tag
count,13613,13613
unique,3386,32
top,the,nn
freq,765,2305


In [203]:
# tags_train = sorted(list(set([x[1] for x in train_corpus])))
# tags_train = zip(tags_train, list(range(len(tags_train))))
# tags_train_dict = dict(tags_train)

# tags_val = sorted(list(set([x[1] for x in val_corpus])))
# tags_val = zip(tags_val, list(range(len(tags_val))))
# tags_val_dict = dict(tags_val)

# tags_test = sorted(list(set([x[1] for x in test_corpus])))
# tags_test = zip(tags_test, list(range(len(tags_test))))
# tags_test_dict = dict(tags_test)

# print(len(tags_train_dict))
# print(len(tags_test_dict))
# print(len(tags_val_dict))

tags = []
for i in train_df.tag:
  if i not in tags:
    tags.append(i)

tags_test = []
for i in test_df.tag:
  if i not in tags_test:
    tags_test.append(i)

tags_val = []
for i in val_df.tag:
  if i not in tags_val:
    tags_val.append(i)

print(tags_val)
print(tags_test)
print(tags)

['dt', 'nnp', 'nn', 'vbd', 'jj', 'nns', 'in', 'jjr', 'cd', 'wdt', 'vbz', 'rb', 'cc', 'vbg', 'vbn', 'vbp', 'jjs', 'to', 'pos', 'rbr', 'md', 'vb', 'nnps', 'prp', ' lrb', ' rrb', 'wp', 'rbs', 'ex', 'wrb', 'rp', 'fw', 'uh', 'pdt', 'ls']
['nnp', 'vbd', 'prp', 'cd', 'nns', 'cc', 'in', 'jj', 'nn', 'dt', 'vbz', 'pos', 'to', 'md', 'vb', 'vbg', 'rb', 'vbn', 'wp', 'vbp', 'wrb', 'wdt', 'jjr', 'jjs', 'rp', 'nnps', 'rbr', 'ex', 'rbs', ' lrb', ' rrb', 'pdt']
['nnp', 'cd', 'nns', 'jj', 'md', 'vb', 'dt', 'nn', 'in', 'vbz', 'vbg', 'cc', 'vbd', 'vbn', 'rb', 'to', 'prp', 'rbr', 'wdt', 'vbp', 'rp', 'jjs', 'pos', 'ex', 'wp', 'jjr', 'wrb', 'nnps', ' lrb', ' rrb', 'pdt', 'rbs', 'fw', 'uh', 'ls']


In [206]:
label_encoder = LabelEncoder()

train_df['tag'] = label_encoder.fit_transform(train_df['tag'])
test_df['tag']  = label_encoder.fit_transform(test_df['tag'])
val_df['tag']   = label_encoder.fit_transform(val_df['tag'])

print(train_df['tag'].unique())
print(val_df['tag'].unique())
print(test_df['tag'].unique())

[14  3 16  8 12 26  4 13  7 31 28  2 27 29 20 24 19 21 32 30 23 10 18  5
 33  9 34 15  0  1 17 22  6 25 11]
[ 4 14 13 27  8 16  7  9  3 32 31 20  2 28 29 30 10 24 18 21 12 26 15 19
  0  1 33 22  5 34 23  6 25 17 11]
[12 24 17  3 14  2  6  7 11  4 28 16 22 10 23 25 18 26 30 27 31 29  8  9
 21 13 19  5 20  0  1 15]


In [209]:
X_train = train_df.word
y_train = train_df.tag
y_train = Encoding(y_train, tags)

print(y_train.shape)
print(y_train.head())

X_val = val_df.word
y_val = val_df.tag
y_val = Encoding(y_val, tags_val)
print(y_val.shape)
print(y_val.head())

X_test = test_df.word
y_test = test_df.tag
y_test = Encoding(y_test, tags_test)
print(y_test.shape)
print(y_test.head())

y_train.drop('tag',inplace=True,axis=1)
y_test.drop('tag',inplace=True,axis=1)
y_val.drop('tag',inplace=True,axis=1)

(41144, 36)
   tag  nnp   cd  nns   jj   md   vb   dt   nn   in  ...  jjr  wrb  nnps  \
0   14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   0.0   
1   14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   0.0   
2    3  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   0.0   
3   16  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0   0.0   
4    8  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...  0.0  0.0   0.0   

    lrb   rrb  pdt  rbs   fw   uh   ls  
0   0.0   0.0  0.0  0.0  0.0  0.0  0.0  
1   0.0   0.0  0.0  0.0  0.0  0.0  0.0  
2   0.0   0.0  0.0  0.0  0.0  0.0  0.0  
3   0.0   0.0  0.0  0.0  0.0  0.0  0.0  
4   0.0   0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 36 columns]
(27262, 36)
   tag   dt  nnp   nn  vbd   jj  nns   in  jjr   cd  ...   rrb   wp  rbs   ex  \
0    4  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...   0.0  0.0  0.0  0.0   
1   14  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0  0.0  0.0  0.0   
2   13 

In [213]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

num_words = 9000
tokenizer.word_index = {e:i for e,i in tokenizer.word_index.items() if i <= num_words}
tokenizer.word_index[tokenizer.oov_token] = num_words +1

X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)
vocab_size = len(tokenizer.word_index) + 1

In [214]:
max_len = 50
X_train = pad_sequences(X_train,padding='post',maxlen=max_len)
X_val = pad_sequences(X_val,padding='post',maxlen=max_len)
X_test = pad_sequences(X_test,padding='post',maxlen=max_len)

###1.2 GloVe 

In [84]:
import gensim
import gensim.downloader as gloader

emb_model = gloader.load('glove-wiki-gigaword-50')



In [100]:
emb_model.most_similar('iphone')

[('ipad', 0.9405525326728821),
 ('smartphone', 0.9002013206481934),
 ('ipod', 0.8585914969444275),
 ('android', 0.8474240303039551),
 ('smartphones', 0.8404892683029175),
 ('macintosh', 0.8069723844528198),
 ('3gs', 0.7897009253501892),
 ('playstation', 0.7869682312011719),
 ('handsets', 0.7849521636962891),
 ('app', 0.7825644016265869)]

In [215]:
input_dim = X_train.shape[1]
embedding_dim = 300
embedding_matrix = create_embedding_matrix("/content/Glove/glove.6B.300d.txt", tokenizer.word_index, embedding_dim)

FileNotFoundError: ignored

### 1.3 Model

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

# Define the model
model = tf.keras.Sequential()

# Add the Embedding layer
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# Add the Bidirectional LSTM layer
model.add(Bidirectional(LSTM(units=lstm_units)))

# Add the Dense/Fully-Connected layer
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])