<a href="https://colab.research.google.com/github/ahmadhajmosa/Machine-learning-labs/blob/master/ADR_Keras.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



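This notebook phrase-matches a few labelled example entities with spaCy, builds one feature vector per matched token (one-hot POS tag, one-hot dependency label and the spaCy token tensor), and loads those vectors as columns of a bias-free Keras `Dense` layer so that an input vector can be looked up by its highest-scoring output unit.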

# Installations

In [127]:
!pip install -U spacy
!python -m spacy download en_core_web_lg
!python -m spacy download en_core_web_sm

!python -m spacy download en_core_web_trf
!pip install textacy

Collecting en-core-web-lg==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.3.0/en_core_web_lg-3.3.0-py3-none-any.whl (400.7 MB)
[K     |████████████████████████████████| 400.7 MB 5.8 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
Collecting en-core-web-sm==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.3.0/en_core_web_sm-3.3.0-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 4.9 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Collecting en-core-web-trf==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_trf-3.3.0/en_core_web_trf-3.3.0-py3-none-any.whl (460.3 MB)
[K     |████████████████████████████████| 460.3 MB 22 kB/s 
[38;5;2m✔ Download and installation successful[0m
You

# Sample input data

In [144]:
# Free-text sample that the example phrases are searched in.
txt_s = (
    "Hi my name is Ahmad Haj Mosa, I am from Austria. "
    "I earn 2.3 Euro. "
    "Hi I am Khaled, I paid Ahmad 10 Euros. "
    "I work at PwC AT. "
    "My country code is AT."
)

# Labelled example phrases: each entry pairs an exact phrase with its label.
# NOTE(review): 'Money' is mixed-case while the other labels are upper-case;
# confirm this is intended.
pattern = [
    {'label': label, 'pattern': phrase}
    for label, phrase in (
        ('Money', '2.3 Euro'),
        ('PERSON', 'Ahmad Haj Mosa'),
        ('GPE', 'Austria'),
    )
]


# Load packages and models

In [145]:
import spacy
from sklearn.metrics.pairwise import cosine_similarity
from nltk import ngrams
from spacy.tokens import Span
from spacy import displacy
import textacy
import numpy as np
from sklearn.decomposition import PCA
import pickle
from sklearn.preprocessing import OneHotEncoder
#pca = pickle.load(open('eng_pca.pkl', 'rb'))

# Load English tokenizer, tagger, parser and NER
# `nlp` is the transformer-based pipeline.
# NOTE(review): `nlp` is not referenced by the cells visible below; confirm it
# is still needed, since loading it is expensive.
nlp = spacy.load("en_core_web_trf")
# `nlp_sim` is the pipeline used below to parse the sample text, to build the
# phrase matcher and to read the parser's dependency labels.
nlp_sim = spacy.load("en_core_web_lg")

# Initialize the matcher 

In [146]:
from spacy.matcher import PhraseMatcher
# Run the `nlp_sim` pipeline on the sample text; the resulting `doc` is what
# the matcher is applied to and what the features are later extracted from.
doc = nlp_sim(txt_s)
# The matcher shares the vocabulary of the pipeline that produced `doc`.
matcher = PhraseMatcher(nlp_sim.vocab)


In [147]:
# Phrase text -> label lookup, used later to label the matched spans.
patterns_dict = {pt['pattern']: pt['label'] for pt in pattern}

# Register every example phrase under its label. The matcher was created with
# its default (exact token text) matching, which only needs tokenization, so
# `make_doc` is used instead of running the full pipeline on each phrase.
for pt in pattern:
    matcher.add(pt['label'], [nlp_sim.make_doc(pt['pattern'])])

# Each match is a (match_id, start_token, end_token) triple.
matches = matcher(doc)
matches

[(380, 4, 7), (384, 11, 12), (16901899789441075350, 15, 17)]

# Feature Extraction

## Dependency parser OneHotEncoder

In [148]:
# Dependency labels known to the parser of the `nlp_sim` pipeline.
dep_labels = nlp_sim.pipe_labels['parser']

# [label, id] rows used to fit the encoder, plus a label -> id lookup.
# `enumerate` replaces the manual counter, which shadowed the builtin `id`.
dep_to_id = [[label, idx] for idx, label in enumerate(dep_labels)]
dep_to_id_dict = {label: idx for idx, label in enumerate(dep_labels)}

# NOTE(review): the encoder is fitted on two columns (label and id), so every
# token gets two one-hot groups that carry the same information. This is kept
# as-is because the feature width used by the model depends on it.
dep_enc = OneHotEncoder(handle_unknown='ignore')
dep_enc.fit(dep_to_id)

OneHotEncoder(handle_unknown='ignore')

## POS OneHotEncoder

In [149]:
# Coarse-grained POS tags to encode. A tag's position in this list is its
# integer id, so the order must not change.
tag_lst = [
    'ADJ',
    'ADP',
    'ADV',
    'AUX',
    'CONJ',
    'CCONJ',
    'DET',
    'INTJ',
    'NOUN',
    'NUM',
    'PART',
    'PRON',
    'PROPN',
    'PUNCT',
    'SCONJ',
    'SYM',
    'VERB',
    'X',
    'SPACE',
]

# [tag, id] rows used to fit the encoder, plus a tag -> id lookup.
# `enumerate` replaces the manual counter, which shadowed the builtin `id`.
pos_to_id = [[tag, idx] for idx, tag in enumerate(tag_lst)]
pos_to_id_dict = {tag: idx for idx, tag in enumerate(tag_lst)}

pos_enc = OneHotEncoder(handle_unknown='ignore')
pos_enc.fit(pos_to_id)

# Sanity check: the encoded row has two hot entries because both the tag
# column and the id column are one-hot encoded.
pos_enc.transform([['PROPN', pos_to_id_dict['PROPN']]]).toarray()



array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0.]])

## Feature Extraction Function

In [150]:
def features_extraction(matches, doc):
    """Build one feature row and one label per token of every matched span.

    Each matched span is copied into its own ``Doc``. Every token of that copy
    is encoded as the concatenation of its one-hot POS tag, its one-hot
    dependency label and its row of the copy's ``tensor``.

    Relies on the module-level ``patterns_dict``, ``pos_enc`` /
    ``pos_to_id_dict`` and ``dep_enc`` / ``dep_to_id_dict``.

    Args:
        matches: Sequence of ``(match_id, start, end)`` entries, where
            ``start``/``end`` are token indices into ``doc``.
        doc: The spaCy ``Doc`` the matches refer to.

    Returns:
        A ``(ground_t_v, ground_t_y)`` tuple: a list of ``(1, n_features)``
        arrays and a parallel list of labels, one entry per matched token.

    Raises:
        KeyError: If a token's POS tag or dependency label has no entry in
            ``pos_to_id_dict`` / ``dep_to_id_dict``.
    """
    ground_t_v = []
    ground_t_y = []
    for match in matches:
        match_id, start, end = match[0], match[1], match[2]
        span = doc[start:end]

        # Label lookup by span text, as before. If the span text is not a key
        # (e.g. its whitespace differs from the pattern string), fall back to
        # the matcher key instead of raising; this assumes the patterns were
        # registered under their label, as done in this notebook.
        if span.text in patterns_dict:
            lb = patterns_dict[span.text]
        else:
            lb = doc.vocab.strings[match_id]

        # Copy the span once and read the annotations straight from the copy
        # instead of serialising it twice via to_json() / to_dict().
        span_doc = span.as_doc()
        vectors = span_doc.tensor

        for tok_i, tok in enumerate(span_doc):
            pos, dep = tok.pos_, tok.dep_
            pos_vec = pos_enc.transform([[pos, pos_to_id_dict[pos]]]).toarray()
            dep_vec = dep_enc.transform([[dep, dep_to_id_dict[dep]]]).toarray()
            token_vec = vectors[tok_i].reshape(1, -1)
            ground_t_v.append(np.concatenate([pos_vec, dep_vec, token_vec], axis=1))
            ground_t_y.append(lb)

    return ground_t_v, ground_t_y

In [151]:
# Build per-token features and labels for the matched example phrases.
ground_t_v, ground_t_y = features_extraction(matches, doc)
# One label per matched token (a three-token span contributes three labels).
ground_t_y

['PERSON', 'PERSON', 'PERSON', 'GPE', 'Money', 'Money']

In [168]:
# Number of feature rows; there is one per matched token.
len(ground_t_v)

6

In [153]:
# TensorFlow and tf.keras
import tensorflow as tf

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

# Show the TensorFlow version this run uses.
print(tf.__version__)

2.8.0


In [207]:
# Single bias-free Dense layer: the input width is the width of one feature
# row, and there are 10000 output units plus one per ground-truth label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(ground_t_v[0].shape[1],)))
model.add(tf.keras.layers.Dense(units=10000 + len(ground_t_y), use_bias=False))

In [208]:
# The Dense layer outputs raw scores (no activation), hence from_logits=True.
# NOTE(review): the visible cells never call fit(); the weights are set
# manually further down, so this configuration is not exercised there.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [209]:
# `model.layers[0]` is the Dense layer (its kernel shape is displayed below).
layer = model.layers[0]
# Kernel shape: (input width, number of output units).
layer.get_weights()[0].shape

(224, 10006)

In [210]:
# Input shape of the model: (batch, width of one feature row).
model.input_shape

(None, 224)

In [211]:
# Stack the (1, n_features) rows into one (n_tokens, n_features) matrix. The
# width is taken from the model instead of a hard-coded 224, so the cell keeps
# working when the feature layout changes.
np.asarray(ground_t_v).reshape((-1, model.input_shape[1]))

array([[ 0.        ,  0.        ,  0.        , ..., -0.26338011,
        -1.39262593, -0.44705266],
       [ 0.        ,  0.        ,  0.        , ...,  0.28927583,
        -2.07036352, -0.5911411 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.11520132,
         0.13569205,  1.34147763],
       [ 0.        ,  0.        ,  0.        , ..., -1.22081852,
         0.51961964,  0.5569582 ],
       [ 0.        ,  0.        ,  0.        , ...,  0.92227352,
        -1.03889906, -0.76376516],
       [ 0.        ,  0.        ,  0.        , ..., -0.99378419,
         0.80901015,  0.86824536]])

In [212]:
# Kernel layout: random filler columns followed by one column per ground-truth
# feature vector. The layer has no bias, so output unit (n_random + i) is the
# dot product of the input with ground-truth vector i.
# `np` is already imported at the top of the notebook.
n_features, n_units = layer.get_weights()[0].shape
# Derive the filler count from the layer itself instead of repeating the
# magic number used when the model was built.
n_random = n_units - len(ground_t_v)

# Seeded generator so the filler columns are identical on every run.
rng = np.random.default_rng(0)
new_data = rng.random((n_features, n_random))

# Every ground-truth vector is a (1, n_features) row; stack them and transpose
# so that each one becomes a column of the kernel.
gt_columns = np.concatenate(ground_t_v, axis=0).T
weight = np.concatenate([new_data, gt_columns], axis=1)

layer.set_weights([weight])

In [213]:
# Raw scores of every output unit for each ground-truth feature row. The
# width comes from the model rather than a hard-coded 224.
data = model.predict(np.asarray(ground_t_v).reshape((-1, model.input_shape[1])))

In [214]:
# Wrap the scoring model with a softmax so its outputs sum to one per row.
probability_model = tf.keras.Sequential()
probability_model.add(model)
probability_model.add(tf.keras.layers.Softmax())

In [228]:
# Softmax probabilities for each ground-truth feature row. The width comes
# from the wrapped model rather than a hard-coded 224.
predictions = probability_model.predict(
    np.asarray(ground_t_v).reshape((-1, model.input_shape[1]))
)

In [234]:
# Highest-scoring output unit for the feature row at index 5. The kernel puts
# the ground-truth columns after the 10000 random ones, so this row's own
# column is unit 10005.
np.argmax(predictions[5])

10005

# New Section