# M2 L1-Exercise

In [1]:
import nltk
from nltk.corpus import stopwords  # Fill in the blank
from nltk.stem import PorterStemmer, WordNetLemmatizer  # Fill in the blank
from nltk.tokenize import word_tokenize  # Fill in the blank
import re
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text
text = """
Natural language processing (NLP) is a subfield of linguistics, computer 
science, and artificial intelligence. 
It involves the interactions between computers and humans using the natural 
language. The ultimate objective 
of NLP is to read, decipher, understand, and make sense of the human language in
a valuable way. It started 
in the 1950s, although work can be found from earlier periods. In 1950, Alan 
Turing published an article titled 
"Computing Machinery and Intelligence" which proposed what is now called the 
Turing test as a criterion of 
intelligence, a task that involves the automated interpretation and generation 
of natural language, but at the 
time not articulated as a problem separate from artificial intelligence. The 
premise of symbolic NLP is 
well-summarized by John Searle's Chinese room experiment: Given a collection of 
rules (e.g., a Chinese phrasebook, 
with questions and matching answers), the computer emulates natural language 
understanding (or other NLP tasks) 
by applying those rules to the data it is confronted with. 2023 is the year when
NLP got its major breakthrough.
"""

# Task 1: Tokenization
# Write a function to tokenize the text and return the tokens for first line in the paragraph
def tokenize_text(text):
    sentences = nltk.sent_tokenize(text)
    first_line = sentences[0]
    tokens = word_tokenize(first_line)
    return tokens

# Task 2: Stop Word Removal
# Write a function to remove stop words for the whole paragraph from the tokens and return the filtered tokens
def remove_stop_words(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

# Task 3: Stemming
# Write a function to perform stemming on the filtered tokens and return the stemmed tokens
def perform_stemming(filtered_tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    return stemmed_tokens

# Task 4: Lemmatization
# Write a function to perform lemmatization on the filtered tokens and return the lemmatized tokens
def perform_lemmatization(filtered_tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    return lemmatized_tokens

# Now use the functions to process the text
tokens = tokenize_text(text)
filtered_tokens = remove_stop_words(tokens)
stemmed_tokens = perform_stemming(filtered_tokens)
lemmatized_tokens = perform_lemmatization(filtered_tokens)

print("Tokens:", tokens)
print("Filtered Tokens:", filtered_tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', '.']
Filtered Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial', 'intelligence', '.']
Stemmed Tokens: ['natur', 'languag', 'process', '(', 'nlp', ')', 'subfield', 'linguist', ',', 'comput', 'scienc', ',', 'artifici', 'intellig', '.']
Lemmatized Tokens: ['Natural', 'language', 'processing', '(', 'NLP', ')', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial', 'intelligence', '.']


# M2 L2-Exercise

In [None]:
Question 1: Co-occurrence Matrix
To create a co-occurrence (word-word) matrix for the given sentences, we need to determine the context in which words co-occur. First, let's define the co-occurrence matrix using the context of a sentence. Then, we'll redefine it using a window of three words (one on the left and one on the right).

Sentences:
'Apples are green and red.'
'Red apples are sweet.'
'Green oranges are sour.'
Context: Sentence
The words are: apples, are, green, and, red, sweet, oranges, sour.

In [None]:
Co-occurrence Matrix (Context: Sentence)

In [None]:
         apples   are  green    and    red  sweet  oranges  sour
apples        0     2      1      1      1      1        0     0
are           2     0      2      1      1      1        1     1
green         1     2      0      1      1      0        1     1
and           1     1      1      0      1      0        0     0
red           1     1      1      1      0      1        0     0
sweet         1     1      0      0      1      0        0     0
oranges       0     1      1      0      0      0        0     1
sour          0     1      1      0      0      0        1     0


In [None]:
Co-occurrence Matrix (Context: Window of Three Words)

In [None]:
         apples   are  green    and    red  sweet  oranges  sour
apples        0     1      1      0      1      1        0     0
are           1     0      1      1      1      1        1     0
green         1     1      0      1      0      0        1     1
and           0     1      1      0      1      0        0     0
red           1     1      0      1      0      1        0     0
sweet         1     1      0      0      1      0        0     0
oranges       0     1      1      0      0      0        0     1
sour          0     0      1      0      0      0        1     0


In [None]:
Question 2: TF-IDF Calculation
First, let's manually calculate TF-IDF for the provided corpus.

Corpus:
"the cat sat on the mat"
"the dog sat on the log"
"cats and dogs are great"
Step-by-Step Calculation
1. Term Frequency (TF)
TF for each word in each document:

In [None]:
Document 1: {'the': 2/6, 'cat': 1/6, 'sat': 1/6, 'on': 1/6, 'mat': 1/6}
Document 2: {'the': 2/6, 'dog': 1/6, 'sat': 1/6, 'on': 1/6, 'log': 1/6}
Document 3: {'cats': 1/5, 'and': 1/5, 'dogs': 1/5, 'are': 1/5, 'great': 1/5}

In [None]:
2. Inverse Document Frequency (IDF)
IDF for each word:

In [None]:
IDF = log(N / df) where N is the total number of documents and df is the number of documents containing the word.

the: log(3/2)
cat: log(3/1)
sat: log(3/2)
on: log(3/2)
mat: log(3/1)
dog: log(3/1)
log: log(3/1)
cats: log(3/1)
and: log(3/1)
dogs: log(3/1)
are: log(3/1)
great: log(3/1)

In [None]:
3. TF-IDF Calculation
TF-IDF for each word in each document:

In [None]:
Document 1: {'the': TF*IDF, 'cat': TF*IDF, 'sat': TF*IDF, 'on': TF*IDF, 'mat': TF*IDF}
Document 2: {'the': TF*IDF, 'dog': TF*IDF, 'sat': TF*IDF, 'on': TF*IDF, 'log': TF*IDF}
Document 3: {'cats': TF*IDF, 'and': TF*IDF, 'dogs': TF*IDF, 'are': TF*IDF, 'great': TF*IDF}

In [2]:
import pandas as pd
import numpy as np

### Document
corpus = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs are great"]

## Word set of the corpus
words_set = set()
for doc in corpus:
    words = doc.split(' ')
    words_set = words_set.union(set(words))

print('Number of words in the corpus:', len(words_set))
print('The words in the corpus:\n', words_set)

#### TF
n_docs = len(corpus)  # Number of documents in the corpus
n_words_set = len(words_set)  # Number of unique words in the corpus
df_tf = pd.DataFrame(np.zeros((n_docs, n_words_set)), columns=list(words_set))

# Compute Term Frequency (TF)
for i in range(n_docs):
    words = corpus[i].split(' ')  # Words in the document
    for w in words:
        df_tf.loc[i, w] += 1 / len(words)

print("TF Matrix:")
print(df_tf)

### Compute IDF
print("IDF of: ")
idf = {}
for w in words_set:
    k = 0  # number of documents in the corpus that contain this word
    for i in range(n_docs):
        if w in corpus[i].split(' '):
            k += 1
    idf[w] = np.log10(n_docs / k)
    print(f'{w:>15}: {idf[w]:>10}')

### Compute TF-IDF
df_tfidf = df_tf.copy()
for w in words_set:
    df_tfidf[w] = df_tf[w] * idf[w]

print("TF-IDF Matrix:")
print(df_tfidf)


Number of words in the corpus: 12
The words in the corpus:
 {'log', 'great', 'are', 'sat', 'dog', 'on', 'mat', 'and', 'cats', 'dogs', 'the', 'cat'}
TF Matrix:
        log  great  are       sat       dog        on       mat  and  cats  \
0  0.000000    0.0  0.0  0.166667  0.000000  0.166667  0.166667  0.0   0.0   
1  0.166667    0.0  0.0  0.166667  0.166667  0.166667  0.000000  0.0   0.0   
2  0.000000    0.2  0.2  0.000000  0.000000  0.000000  0.000000  0.2   0.2   

   dogs       the       cat  
0   0.0  0.333333  0.166667  
1   0.0  0.333333  0.000000  
2   0.2  0.000000  0.000000  
IDF of: 
            log: 0.47712125471966244
          great: 0.47712125471966244
            are: 0.47712125471966244
            sat: 0.17609125905568124
            dog: 0.47712125471966244
             on: 0.17609125905568124
            mat: 0.47712125471966244
            and: 0.47712125471966244
           cats: 0.47712125471966244
           dogs: 0.47712125471966244
            the: 0.1760912590