In [1]:
!pip install spacy
!pip install pandas
!pip install -U pip setuptools wheel
!python -m spacy download en_core_web_sm
!pip install openpyxl
!pip install scikit-learn
!pip install gensim

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')



[notice] A new release of pip is available: 24.0 -> 24.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import spacy
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from scipy.sparse import lil_matrix

### **Preprocessing the text**
The steps will be: 
* Tokenization
* Lemmatization
* Cleaning the Lemmatized tokens

In [3]:
nlp = spacy.load('en_core_web_sm')

*Functions (using spaCy) for Tokenization, Lemmatization & Cleaning*

In [4]:
def tokenize_spacy(text):
    doc = nlp(text)
    return [token.text for token in doc]

def lemmatize_spacy(text):
    doc = nlp(text)
    return [token.lemma_ for token in doc]

def clean_text_spacy(tokens):
    cleaned_tokens = [token.lower() for token in tokens if token.isalpha()]
    return cleaned_tokens

*We will be using a Whatsapp conversation as our corpus*

In [5]:
with open('_chat.txt', 'r', encoding='utf-8') as file:
    content = file.read()

data = []
for row in content.split('\n'):
    data.append(row[row.rfind(':') + 1:])

whatsapp_df = pd.DataFrame({"whatsapp_text":data})

*Applying Tokenization*

In [6]:
whatsapp_df['scraped_tokens'] = whatsapp_df['whatsapp_text'].apply(tokenize_spacy)

*Applying Lemmatization*

In [7]:
whatsapp_df['scraped_lemmatization'] = whatsapp_df['whatsapp_text'].apply(lemmatize_spacy)

*Cleaning the lemmatized tokens*

In [8]:
whatsapp_df['cleaned_tokens'] = whatsapp_df['scraped_lemmatization'].apply(clean_text_spacy)

### **Remove Stop Words**

In [9]:
def remove_stop_words(tokens):
    stop_words = nlp.Defaults.stop_words
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [10]:
whatsapp_df['filtered_tokens'] = whatsapp_df['cleaned_tokens'].apply(remove_stop_words)

In [11]:
whatsapp_df_copy = whatsapp_df.copy()
whatsapp_df_copy['filtered_tokens'] = whatsapp_df_copy['filtered_tokens'].apply(lambda x: ' '.join(x))
whatsapp_df_copy.to_excel('whatsapp_df.xlsx')
whatsapp_df_copy.head()

Unnamed: 0,whatsapp_text,scraped_tokens,scraped_lemmatization,cleaned_tokens,filtered_tokens
0,‎Messages and calls are end-to-end encrypted....,"[ , ‎Messages, and, calls, are, end, -, to, -,...","[ , ‎message, and, call, be, end, -, to, -, en...","[and, call, be, end, to, end, encrypt, no, one...",end end encrypt outside chat whatsapp read listen
1,‎You created group “מדברים באנגלית”,"[ , ‎You, created, group, “, מדברים, באנגלית, ”]","[ , ‎you, create, group, "", מדברים, באנגלית, ""]","[create, group, מדברים, באנגלית]",create group מדברים באנגלית
2,hey noam how are you?,"[ , hey, noam, how, are, you, ?]","[ , hey, noam, how, be, you, ?]","[hey, noam, how, be, you]",hey noam
3,Hey!! I’m good thanks how about you,"[ , Hey, !, !, I, ’m, good, thanks, how, about...","[ , Hey, !, !, I, ’m, good, thank, how, about,...","[hey, i, good, thank, how, about, you]",hey good thank
4,What are you going to do today,"[ , What, are, you, going, to, do, today]","[ , what, be, you, go, to, do, today]","[what, be, you, go, to, do, today]",today


### **Applying Feature Extraction by the following algorithms:**
* BOW
* TF-IDF
* Word embedding by WORD2VEC

**Bow algorithm**

In [12]:
def bow_extraction(texts):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(texts)
    return X.toarray(), vectorizer.get_feature_names_out()

In [13]:
bow_features, bow_feature_names = bow_extraction(whatsapp_df_copy['filtered_tokens'])
print("BoW Features:\n", bow_features)
print("Feature Names:\n", bow_feature_names)

BoW Features:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 1]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Feature Names:
 ['actually' 'add' 'afeka' 'ai' 'amazing' 'analysis' 'anytime' 'article'
 'ask' 'awesome' 'backend' 'bad' 'balance' 'balcony' 'basil' 'beach'
 'beautiful' 'believe' 'big' 'bit' 'board' 'book' 'brightly' 'bump'
 'bunch' 'busy' 'care' 'case' 'catch' 'challenge' 'change' 'character'
 'chat' 'check' 'child' 'clever' 'coffee' 'come' 'conversation' 'cooking'
 'cool' 'couple' 'course' 'cream' 'create' 'curious' 'currently' 'day'
 'debug' 'deck' 'deep' 'definitely' 'detective' 'development' 'difference'
 'dive' 'drag' 'early' 'easy' 'eat' 'edit' 'emma' 'encrypt' 'end'
 'enhance' 'excited' 'exercise' 'exist' 'exit' 'fantastic' 'far'
 'fascinating' 'finally' 'finish' 'forget' 'forward' 'fresh' 'friend'
 'fun' 'game' 'gang' 'garden' 'gardening' 'generate' 'good' 'got' 'gpt'
 'great' 'gripping' 'ground' 'group' 'grow' 'haha' 'hand' 'hangover'
 'harar

**TF-IDF algorithm**

In [14]:
def tfidf_extraction(texts):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(texts)
    return X.toarray(), vectorizer.get_feature_names_out()

In [15]:
tfidf_features, tfidf_feature_names = tfidf_extraction(whatsapp_df_copy['filtered_tokens'])
print("TF-IDF Features:\n", tfidf_features)
print("Feature Names:\n", tfidf_feature_names)

TF-IDF Features:
 [[0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.5 0.5]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]
Feature Names:
 ['actually' 'add' 'afeka' 'ai' 'amazing' 'analysis' 'anytime' 'article'
 'ask' 'awesome' 'backend' 'bad' 'balance' 'balcony' 'basil' 'beach'
 'beautiful' 'believe' 'big' 'bit' 'board' 'book' 'brightly' 'bump'
 'bunch' 'busy' 'care' 'case' 'catch' 'challenge' 'change' 'character'
 'chat' 'check' 'child' 'clever' 'coffee' 'come' 'conversation' 'cooking'
 'cool' 'couple' 'course' 'cream' 'create' 'curious' 'currently' 'day'
 'debug' 'deck' 'deep' 'definitely' 'detective' 'development' 'difference'
 'dive' 'drag' 'early' 'easy' 'eat' 'edit' 'emma' 'encrypt' 'end'
 'enhance' 'excited' 'exercise' 'exist' 'exit' 'fantastic' 'far'
 'fascinating' 'finally' 'finish' 'forget' 'forward' 'fresh' 'friend'
 'fun' 'game' 'gang' 'garden' 'gardening' 'generate' 'good' 'got' 'gpt'
 

**Word Embeddings by Word2Vec**

In [16]:
def word2vec_extraction(tokens_list, vector_size=100, window=5, min_count=1, workers=4):
    model = Word2Vec(sentences=tokens_list, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

In [17]:
tokenized_texts = [text.split() for text in whatsapp_df_copy['filtered_tokens']]

word2vec_model = word2vec_extraction(tokenized_texts)
similar_words = word2vec_model.wv.most_similar('finish')
print(similar_words)

[('relax', 0.22336749732494354), ('enhance', 0.22177574038505554), ('system', 0.19541819393634796), ('forget', 0.19149678945541382), ('miss', 0.19064918160438538), ('character', 0.1868743598461151), ('kind', 0.17510007321834564), ('noah', 0.1748739331960678), ('plus', 0.1747390180826187), ('debug', 0.16881349682807922)]


## **Glove Explanation**

GloVe (Global Vectors for Word Representation) is an unsupervised learning algorithm for obtaining vector representations (embeddings) for words. These embeddings capture semantic relationships between words based on their co-occurrence statistics in a large corpus of text. Unlike Word2Vec, which learns embeddings by predicting context words given a target word (skip-gram model) or predicting a target word given context words (continuous bag of words model), GloVe learns embeddings by factorizing the word co-occurrence matrix.

Explanation of GloVe:
* Co-occurrence Matrix: GloVe starts with a word-word co-occurrence matrix where each element 
𝑋𝑖𝑗 ​represents how often word 𝑗 appears in the context of word 𝑖 within a fixed window size.

* Objective: The goal of GloVe is to learn word embeddings such that the dot product of two word vectors corresponds to the logarithm of their co-occurrence probability.

* Training: GloVe uses stochastic gradient descent to minimize a loss function that measures the discrepancy between the dot product of word vectors and the logarithm of their co-occurrence probabilities.

* Advantages: GloVe embeddings tend to capture global semantic meanings better than some other models, and they often perform well in tasks requiring understanding of word relationships and analogies.

In [18]:
def build_cooccurrence_matrix(corpus, window_size=2):
    vocab = list(set(corpus))
    word_to_id = {word: i for i, word in enumerate(vocab)}
    cooccurrence = lil_matrix((len(vocab), len(vocab)), dtype=np.float64)
    
    for i, word in enumerate(corpus):
        left_context = max(0, i - window_size)
        right_context = min(len(corpus), i + window_size + 1)
        for j in range(left_context, right_context):
            if i != j:
                cooccurrence[word_to_id[word], word_to_id[corpus[j]]] += 1
    
    return cooccurrence.tocsr(), word_to_id

def glove_loss(X, W, U, b, c):
    diff = W.dot(U.T) + b[:, np.newaxis] + c[np.newaxis, :] - np.log(X.toarray() + 1)
    squared_error = np.sum(diff ** 2)
    return squared_error

def train_glove(X, vector_size=50, iterations=50, learning_rate=0.01):
    vocab_size = X.shape[0]
    W = np.random.randn(vocab_size, vector_size) / np.sqrt(vector_size)
    U = np.random.randn(vocab_size, vector_size) / np.sqrt(vector_size)
    b = np.zeros(vocab_size)
    c = np.zeros(vocab_size)
    
    for iteration in range(iterations):
        error = glove_loss(X, W, U, b, c)
        if iteration % 10 == 0:
            print(f"Iteration {iteration}: Loss = {error}")
        
        # Compute gradients
        diff = W.dot(U.T) + b[:, np.newaxis] + c[np.newaxis, :] - np.log(X.toarray() + 1)
        grad_W = 2 * diff.dot(U)
        grad_U = 2 * diff.T.dot(W)
        grad_b = 2 * np.sum(diff, axis=1)
        grad_c = 2 * np.sum(diff, axis=0)
                
        # Update parameters
        W -= learning_rate * grad_W
        U -= learning_rate * grad_U
        b -= learning_rate * grad_b
        c -= learning_rate * grad_c
    
    return (W + U) / 2

In [19]:
corpus = []
for sentence in whatsapp_df_copy['filtered_tokens']:
    for word in sentence.split(' '):
        corpus.append(word)

In [20]:
X, word_to_id = build_cooccurrence_matrix(corpus)
word_vectors = train_glove(X, vector_size=5, iterations=1000, learning_rate=0.001)

Iteration 0: Loss = 11414.258235861776
Iteration 10: Loss = 1773.8906061943587
Iteration 20: Loss = 1047.721799588378
Iteration 30: Loss = 845.3246935657834
Iteration 40: Loss = 763.6390937247872
Iteration 50: Loss = 724.0768571525452
Iteration 60: Loss = 702.6386831398901
Iteration 70: Loss = 690.0384624330482
Iteration 80: Loss = 682.123644769618
Iteration 90: Loss = 676.8473623406321
Iteration 100: Loss = 673.1264222975162
Iteration 110: Loss = 670.3551014076551
Iteration 120: Loss = 668.1791383514158
Iteration 130: Loss = 666.3833689473868
Iteration 140: Loss = 664.8327842692302
Iteration 150: Loss = 663.4402272405408
Iteration 160: Loss = 662.148033081095
Iteration 170: Loss = 660.9172701298826
Iteration 180: Loss = 659.721265793809
Iteration 190: Loss = 658.5416173176307
Iteration 200: Loss = 657.3656770642715
Iteration 210: Loss = 656.1849289902196
Iteration 220: Loss = 654.99391113413
Iteration 230: Loss = 653.789475549991
Iteration 240: Loss = 652.5702576436942
Iteration 250: 

In [21]:
for word, idx in word_to_id.items():
    print(f"{word}: {word_vectors[idx]}")

: [ 0.04789519  0.30382639 -0.09923063 -0.25312513  0.43519085]
exercise: [ 0.02962981 -0.12055649 -0.01059242 -0.01917287 -0.07605236]
night: [ 0.16212151 -0.11735411  0.05233699  0.52436645  0.04550326]
afeka: [-0.03367665  0.05915731  0.0112688  -0.13222823 -0.15245988]
enhance: [ 0.03365651  0.07206294 -0.13423854  0.01586545  0.07526636]
stranger: [ 0.10147379 -0.09215681  0.14796505  0.10972085 -0.00431718]
watch: [ 0.10209609 -0.1962361   0.2201643  -0.05758187  0.14770639]
child: [ 0.01700708  0.09476419 -0.03741049 -0.01666573 -0.11547033]
emma: [-0.14845511  0.12660885 -0.01732322  0.01644653 -0.04992814]
exit: [ 0.05176565 -0.00240063 -0.04177029 -0.05009683 -0.11349467]
pick: [-0.11530769 -0.08024269 -0.10124028 -0.11311989 -0.10233174]
case: [-0.10860768  0.13060947 -0.10704555  0.03540738 -0.10441912]
sam: [-0.25149985 -0.22289227 -0.05159493 -0.06136898  0.02696319]
testing: [ 0.0377227   0.12268377  0.02327187  0.17354591 -0.03540093]
sound: [-0.2366588   0.07302166  0.

### **Explanation of applying GloVe:**
*Loss Over Iterations:*
The loss values decrease over iterations, indicating that the model is learning and optimizing the word vectors effectively.
The loss decreases significantly in the early iterations and continues to decrease at a slower rate as training progresses. This indicates that the model is converging towards an optimal set of word vectors.

*Word Vectors:*
The final word vectors are the main output of the GloVe algorithm. Each word in your dataset is represented by a vector in a multi-dimensional space.
Each vector captures the semantic properties of the corresponding word. Words with similar meanings or usage patterns tend to have similar vector representations.

### **Applying tagging by CYK**

We are asked to apply the CYK tagging to 5 sentences, 1 manually and 4 with code.
The manual cyk tagging is in the attached word file. 

In [22]:
def cyk_parse(sentence, grammar):
    # Step 1: Tokenization
    tokens = sentence.split()
    n = len(tokens)
    table = [[set() for _ in range(n+1)] for _ in range(n+1)]
	
    # Step 2: Initialization
    for i in range(1, n+1):
        for rule in grammar:
            if rule[1] == tokens[i-1]:
                table[i][i].add(rule[0])

    # Step 3: Rule Application
    for length in range(2, n+1):
        for i in range(1, n-length+2):
            j = i + length - 1
            for k in range(i, j):
                for rule in grammar:
                    if len(rule) == 3:
                        for left in table[i][k]:
                            for right in table[k+1][j]:
                                if rule[1] in left and rule[2] in right:
                                    table[i][j].add(rule[0])


    # Step 4: Backtracking
    if 'S' in table[1][n]:
        return True, table
    else:
        return False, table

In [23]:
# sentences that will be parsed
sentence_1 = whatsapp_df_copy['whatsapp_text'][19] # early today was a beautiful day
sentence_2 = whatsapp_df_copy['whatsapp_text'][20] # the sun shone brightly warming the ground
sentence_3 = whatsapp_df_copy['whatsapp_text'][21] # I went to the beach with a friend
sentence_4 = whatsapp_df_copy['whatsapp_text'][22] # the children ate ice-cream

sentences = [sentence_1, sentence_2, sentence_3, sentence_4]

In [24]:
# Define the context-free grammar in CNF
grammar_1 = [
    ('S', 'NP', 'VP'),
    ('NP', 'RB', 'NN'),
    ('NP', 'DT', 'NPA'),
    ('NPA', 'JJ', 'NN'),
    ('RB', 'early'),
    ('NN', 'today'),
    ('VP', 'VBD', 'NP'),
    ('VBD', 'was'),
    ('DT', 'a'),
    ('JJ', 'beautiful'),
    ('NN', 'day')
]

grammar_2 = [
    ('S', 'NP', 'VP'),
    ('NP', 'DT', 'NN'),
    ('DT', 'the'),
    ('NN', 'sun'),
    ('VP', 'VBD', 'S'),
    ('VBD', 'shone'),
    ('S', 'RB', 'VP'),
    ('RB', 'brightly'),
    ('VP', 'VBG', 'NP'),
    ('VBG', 'warming'),
    ('DT', 'the'),
    ('NN', 'ground')
]

grammar_3 = [
    ('S', 'NP', 'VP'),
    ('NP', 'I'),
    ('VP', 'VBD', 'VP_0'),
    ('VP_0', 'PP', 'PP'),
    ('VBD', 'went'),
    ('PP', 'IN', 'NP'),
    ('IN', 'to'),
    ('IN', 'with'),
    ('NP', 'DT', 'NN'),
    ('DT', 'the'),
    ('DT', 'a'),
    ('NN', 'beach'),
    ('NN', 'friend')
]

grammar_4 = [
    ('S', 'NP', 'VP'),
    ('NP', 'DT', 'NN'),
    ('NN', 'children'),
    ('DT', 'the'),
    ('VP', 'VBD', 'NN'),
    ('VBD', 'ate'),
    ('NN', 'ice-cream')
]

grammars = [grammar_1, grammar_2, grammar_3, grammar_4]

In [25]:
# Call the CYK parser for each sentence
for sentence, grammar in zip(sentences, grammars):
    parsed, table = cyk_parse(sentence, grammar)

    # Print the parse table and whether the sentence was parsed or not
    if parsed:
        print("Input sentence: ", sentence)
        print("Parse table: ")
        for row in table:
            print(row)
    else:
        print("Input sentence: ", sentence)
        print("Sentence not parsed.")
    print('\n')

Input sentence:   early today was a beautiful day
Parse table: 
[set(), set(), set(), set(), set(), set(), set()]
[set(), {'RB'}, {'NP'}, set(), set(), set(), {'S'}]
[set(), set(), {'NN'}, set(), set(), set(), set()]
[set(), set(), set(), {'VBD'}, set(), set(), {'VP'}]
[set(), set(), set(), set(), {'DT'}, set(), {'NP'}]
[set(), set(), set(), set(), set(), {'JJ'}, {'NPA'}]
[set(), set(), set(), set(), set(), set(), {'NN'}]


Input sentence:   the sun shone brightly warming the ground
Parse table: 
[set(), set(), set(), set(), set(), set(), set(), set()]
[set(), {'DT'}, {'NP'}, set(), set(), set(), set(), {'S'}]
[set(), set(), {'NN'}, set(), set(), set(), set(), set()]
[set(), set(), set(), {'VBD'}, set(), set(), set(), {'VP'}]
[set(), set(), set(), set(), {'RB'}, set(), set(), {'S'}]
[set(), set(), set(), set(), set(), {'VBG'}, set(), {'VP'}]
[set(), set(), set(), set(), set(), set(), {'DT'}, {'NP'}]
[set(), set(), set(), set(), set(), set(), set(), {'NN'}]


Input sentence:   I went to