# HW1

In [19]:
# Install necessary libraries
!pip install nltk spacy beautifulsoup4 kaggle
!python -m spacy download en_core_web_sm
!python -m spacy download he_core_news_sm

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer,SnowballStemmer
from nltk.corpus import stopwords
import spacy
from spacy.lang.en import English
from bs4 import BeautifulSoup
import pandas as pd
import requests
from collections import Counter



Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.

[38;5;1m✘ No compatible package found for 'he_core_news_sm' (spaCy v3.7.5)[0m



In [2]:
# Initialize nltk components
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Load spaCy models
nlp_spacy = spacy.load('en_core_web_sm')
# nlp_spacy_he = spacy.load('he_core_news_sm')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### Data Loading & Basic Analysis

In [3]:
spam_data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
spam_data.columns = ['label', 'text', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
spam_data = spam_data[['label', 'text']]

In [4]:
# Print basic statistics on the data
total_messages = len(spam_data)
spam_messages = len(spam_data[spam_data['label'] == 'spam'])
ham_messages = len(spam_data[spam_data['label'] == 'ham'])

word_counts = spam_data['text'].apply(lambda x: len(word_tokenize(x)))
avg_words_per_message = word_counts.mean()

all_words = nltk.FreqDist(word.lower() for message in spam_data['text'] for word in word_tokenize(message))
#most_frequent_words = all_words.most_common(5)
words_only_once = sum(1 for count in all_words.values() if count == 1)


def average_words_per_message(df):
    """Calculate and print the average number of words per message."""
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    average_words = df['word_count'].mean()
    print(f"Average number of words per message: {average_words:.2f}")
    return average_words


def most_frequent_words_and_total_word_count(words, n=5):
    """Calculate and print the n most frequent words."""
    # Count the total number of words
    total_words = len(words)
    # Print the total number of words
    print(f"Total number of words: {total_words}")
    # Count the frequency of each word in the list
    word_counts = Counter(words)
    # Get the n most common words
    most_common_words = word_counts.most_common(n)
     # Print the top n most frequent words
    print(f"Top {n} most frequent words:")
    for word, count in most_common_words:
         print(f"{word}: {count}")
    return most_common_words

def words_appearing_once(df):
    """Calculate and print the number of words that appear only once."""
    all_words = ' '.join(df['text']).split()
    word_counts = Counter(all_words)
    words_once = [word for word, count in word_counts.items() if count == 1]
    num_words_once = len(words_once)
    print(f"Number of words that appear only once: {num_words_once}")
    return num_words_once

all_words = ' '.join(spam_data['text']).split()
most_common_words = most_frequent_words_and_total_word_count(all_words)
average_words = average_words_per_message(spam_data)
words_only_once = words_appearing_once(spam_data)

print(f"Total number of SMS messages: {total_messages}")
print(f"Number of spam messages: {spam_messages}")
print(f"Number of ham messages: {ham_messages}")
print(f"Number of words that only appear once: {words_only_once}")


Total number of words: 86335
Top 5 most frequent words:
to: 2134
you: 1622
I: 1466
a: 1327
the: 1197
Average number of words per message: 15.49
Number of words that appear only once: 9268
Total number of SMS messages: 5572
Number of spam messages: 747
Number of ham messages: 4825
Number of words that only appear once: 9268


### Text Processing

In [5]:
import time
import nltk
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

# Download NLTK punkt tokenizer if not already downloaded
nltk.download('punkt')

# Initialize spaCy English tokenizer outside the function
nlp = English()
tokenizer_spacy = Tokenizer(nlp.vocab)

def tokenize_nltk(text, verbose=False):
    # Measure the start time
    start_time = time.time()

    # Tokenize the text using NLTK
    tokens = nltk.word_tokenize(text)

    # Measure the end time
    end_time = time.time()

    if verbose:
        # Print the execution time
        print("NLTK tokenize time:", end_time - start_time, "seconds")

    return tokens

def tokenize_spacy(text, verbose=False):
    # Measure the start time
    start_time = time.time()

    # Tokenize the text using spaCy
    tokens = [token.text for token in tokenizer_spacy(text)]

    # Measure the end time
    end_time = time.time()

    if verbose:
        # Print the execution time
        print("spaCy tokenize time:", end_time - start_time, "seconds")

    return tokens

# Example usage:
text = spam_data['text'].str.cat(sep=' ')
tokens_nltk = tokenize_nltk(text, verbose=True)
print(tokens_nltk[:200])
most_common_words = most_frequent_words_and_total_word_count(tokens_nltk)

print("--------------------------")
tokens_spacy = tokenize_spacy(text, verbose=True)
print(tokens_spacy[:200])
most_common_words = most_frequent_words_and_total_word_count(tokens_spacy)

# Apply tokenization to DataFrame without verbose output
spam_data['tokens_nltk'] = spam_data['text'].apply(lambda x: tokenize_nltk(x, verbose=False))
spam_data['tokens_spacy'] = spam_data['text'].apply(lambda x: tokenize_spacy(x, verbose=False))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


NLTK tokenize time: 1.1792049407958984 seconds
['Go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'Available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'Cine', 'there', 'got', 'amore', 'wat', '...', 'Ok', 'lar', '...', 'Joking', 'wif', 'u', 'oni', '...', 'Free', 'entry', 'in', '2', 'a', 'wkly', 'comp', 'to', 'win', 'FA', 'Cup', 'final', 'tkts', '21st', 'May', '2005', '.', 'Text', 'FA', 'to', '87121', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 'T', '&', 'C', "'s", 'apply', '08452810075over18', "'s", 'U', 'dun', 'say', 'so', 'early', 'hor', '...', 'U', 'c', 'already', 'then', 'say', '...', 'Nah', 'I', 'do', "n't", 'think', 'he', 'goes', 'to', 'usf', ',', 'he', 'lives', 'around', 'here', 'though', 'FreeMsg', 'Hey', 'there', 'darling', 'it', "'s", 'been', '3', 'week', "'s", 'now', 'and', 'no', 'word', 'back', '!', 'I', "'d", 'like', 'some', 'fun', 'you', 'up', 'for', 'it', 'still', '?', 'Tb', 'ok', '!', 'XxX', 'std', 'chgs', '

We observe that spaCy is significantly more efficient and less time-consuming than NLTK. One of the reasons for this efficiency is the optimized nature of spaCy's tokenization rules, which are designed for speed. In terms of time complexity, both methods operate at O(n).
Additionally, the output from NLTK and spaCy differs slightly.

In [6]:
# Lemmatize the SMS text using nltk and spaCy
lemmatizer = WordNetLemmatizer()

def lemmatize_nltk(tokens):
    return [lemmatizer.lemmatize(token) for token in tokens]

def lemmatize_spacy(text):
    return [token.lemma_ for token in nlp_spacy(text)]

spam_data['lemmas_nltk'] = spam_data['tokens_nltk'].apply(lemmatize_nltk)
spam_data['lemmas_spacy'] = spam_data['text'].apply(lemmatize_spacy)

In [7]:
# Stem the SMS text using nltk and spaCy
stemmer = PorterStemmer()

def stem_nltk(tokens):
    return [stemmer.stem(token) for token in tokens]

def stem_spacy(text):
    return [token.lemma_ for token in nlp_spacy(text)]  # Using lemma as spaCy doesn't have a built-in stemmer

spam_data['stems_nltk'] = spam_data['tokens_nltk'].apply(stem_nltk)
spam_data['stems_spacy'] = spam_data['text'].apply(stem_spacy)


In [8]:
# Comparison of nltk and spaCy implementations

def updated_statistics(tokens):
    all_words = nltk.FreqDist(word.lower() for word_list in tokens for word in word_list)
    most_frequent_words = all_words.most_common(5)
    words_only_once = sum(1 for count in all_words.values() if count == 1)
    return most_frequent_words, words_only_once

print("Updated statistics after tokenization (nltk):", updated_statistics(spam_data['tokens_nltk']))
print("Updated statistics after lemmatization (nltk):", updated_statistics(spam_data['lemmas_nltk']))
print("Updated statistics after stemming (nltk):", updated_statistics(spam_data['stems_nltk']))
print("Updated statistics after tokenization (spaCy):", updated_statistics(spam_data['tokens_spacy']))
print("Updated statistics after lemmatization (spaCy):", updated_statistics(spam_data['lemmas_spacy']))
print("Updated statistics after stemming (spaCy):", updated_statistics(spam_data['stems_spacy']))


Updated statistics after tokenization (nltk): ([('.', 4886), ('i', 2900), ('to', 2241), ('you', 2228), (',', 1871)], 4992)
Updated statistics after lemmatization (nltk): ([('.', 4886), ('i', 2900), ('to', 2241), ('you', 2228), (',', 1871)], 4779)
Updated statistics after stemming (nltk): ([('.', 4886), ('i', 2900), ('to', 2241), ('you', 2228), (',', 1871)], 4179)
Updated statistics after tokenization (spaCy): ([('to', 2226), ('i', 2208), ('you', 1917), ('a', 1419), ('the', 1317)], 7989)
Updated statistics after lemmatization (spaCy): ([('.', 4945), ('i', 3741), ('be', 3260), ('to', 2309), ('you', 2217)], 4583)
Updated statistics after stemming (spaCy): ([('.', 4945), ('i', 3741), ('be', 3260), ('to', 2309), ('you', 2217)], 4583)


### Web Scraping

In [9]:
# Use BeautifulSoup to scrape text data from a public page on one of your social media profiles.
url = 'https://en.wikipedia.org/wiki/English_Springer_Spaniel'
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
print(soup.title)

scraped_text = ' '.join(p.get_text() for p in soup.find_all('p'))
print("Scraped Text:", scraped_text[:500])  # Print the first 500 characters to verify



<title>English Springer Spaniel - Wikipedia</title>
Scraped Text: 
 The English Springer Spaniel is a breed of gun dog in the Spaniel group traditionally used for flushing and retrieving game. They are descended from the Norfolk or Shropshire Spaniels of the mid-19th century; the breed has diverged into separate show and working lines. It is closely related to the Welsh Springer Spaniel and very closely to the English Cocker Spaniel; less than a century ago, springers and cockers would come from the same litter. The smaller "cockers" were used in woodcock hunt


In [10]:
# Perform tokenization, lemmatization, and stemming on the scraped text.
scraped_tokens_nltk = tokenize_nltk(scraped_text)
scraped_tokens_spacy = tokenize_spacy(scraped_text)
scraped_lemmas_nltk = lemmatize_nltk(scraped_tokens_nltk)
scraped_lemmas_spacy = lemmatize_spacy(scraped_text)
scraped_stems_nltk = stem_nltk(scraped_tokens_nltk)
scraped_stems_spacy = stem_spacy(scraped_text)

In [11]:
# Print word statistics on the scraped data before and after text processing.
print("Scraped text statistics before processing:", updated_statistics([scraped_tokens_nltk]))
print("Scraped text statistics after tokenization (nltk):", updated_statistics([scraped_tokens_nltk]))
print("Scraped text statistics after lemmatization (nltk):", updated_statistics([scraped_lemmas_nltk]))
print("Scraped text statistics after stemming (nltk):", updated_statistics([scraped_stems_nltk]))
print("Scraped text statistics after tokenization (spaCy):", updated_statistics([scraped_tokens_spacy]))
print("Scraped text statistics after lemmatization (spaCy):", updated_statistics([scraped_lemmas_spacy]))
print("Scraped text statistics after stemming (spaCy):", updated_statistics([scraped_stems_spacy]))


Scraped text statistics before processing: ([('the', 102), ('.', 67), (',', 63), ('and', 45), ('to', 44)], 390)
Scraped text statistics after tokenization (nltk): ([('the', 102), ('.', 67), (',', 63), ('and', 45), ('to', 44)], 390)
Scraped text statistics after lemmatization (nltk): ([('the', 102), ('.', 67), (',', 63), ('and', 45), ('to', 44)], 368)
Scraped text statistics after stemming (nltk): ([('the', 102), ('.', 67), (',', 63), ('and', 45), ('to', 44)], 331)
Scraped text statistics after tokenization (spaCy): ([('the', 102), ('and', 45), ('to', 44), ('a', 33), ('of', 33)], 438)
Scraped text statistics after lemmatization (spaCy): ([('the', 103), ('be', 62), (',', 56), ('and', 45), ('to', 44)], 358)
Scraped text statistics after stemming (spaCy): ([('the', 103), ('be', 62), (',', 56), ('and', 45), ('to', 44)], 358)


### WhatsApp Analysis

In [33]:
# Import a .txt file of at least 50 WhatsApp messages in Hebrew.
with open('whatsapp.txt', 'r', encoding='utf-8') as file:
    whatsapp_text = file.readlines()
    all_text = ' '.join(whatsapp_text)
print(all_text[:300])

[02/01/2024 13:10] +972 54-567-2517: השיעור התחיל ?כי הזום לא פעיל
 [02/01/2024 13:11] לירן מדמח: לא
 [02/01/2024 13:11] +972 54-567-2517: תודה
 [02/01/2024 14:00] לירן מדמח: תתנו לעמית להיות מנהל בבקשה
 [02/01/2024 14:00] +972 54-426-2231: כן תתנו לי
 [02/01/2024 14:05] לירן מדמח: למה אתה לא פה
 [0


In [41]:


# Tokenize, lemmatize, and stem the WhatsApp data.

def display_word_statistics(tokens, title):
    total_words = len(tokens)
    word_counts = Counter(tokens)
    most_common_words = word_counts.most_common(5)
    print(f"{title} - Total number of words: {total_words}")
    print(f"{title} - Top 5 most frequent words:")
    for word, count in most_common_words:
        print(f"{word}: {count}")
    print()


def tokenize_hebrew(text):
    tokens = word_tokenize(text)
    hebrew_tokens = [token for token in tokens if token.isalnum() and any('\u0590' <= char <= '\u05EA' for char in token)]
    return hebrew_tokens

# דוגמה לשימוש:
hebrew_tokens = tokenize_hebrew(all_text)
print(hebrew_tokens[:100])
display_word_statistics(hebrew_tokens, "hebrew_tokens")


['השיעור', 'התחיל', 'כי', 'הזום', 'לא', 'פעיל', 'לירן', 'מדמח', 'לא', 'תודה', 'לירן', 'מדמח', 'תתנו', 'לעמית', 'להיות', 'מנהל', 'בבקשה', 'כן', 'תתנו', 'לי', 'לירן', 'מדמח', 'למה', 'אתה', 'לא', 'פה', 'לירן', 'מדמח', 'בואו', 'הוא', 'מתחיל', 'לירן', 'מדמח', 'לירן', 'מדמח', 'התחלנו', 'לא', 'רואים', 'אתכם', 'בזום', 'לירן', 'מדמח', 'חוזרים', 'ביקשת', 'לתזכר', 'אותך', 'כאן', 'להעלות', 'למודל', 'סמינר', 'דוגמא', 'מסמך', 'בסיס', 'לסמינר', 'תודה', 'לירן', 'מדמח', 'חזרנו', 'ללמוד', 'מה', 'שלומכם', 'נשאלתי', 'לגבי', 'תוצרים', 'של', 'הקורס', 'להרחיב', 'את', 'הדעת', 'ב', 'ללמוד', 'לקרוא', 'לשחזר', 'את', 'הקוד', 'שמוצג', 'במאמר', 'לערוך', 'דיון', 'בתוצאות', 'השוואה', 'לריצות', 'שמאמר', 'השוואה', 'לאלגוריתמים', 'בין', 'אלגוריתמים', 'וכו', 'איפה', 'אנחנו', 'היום', 'סטס', 'שליח', 'פיליפ', 'פיקוס', 'לא', 'לשכוח', 'מאנשי', 'לירן', 'מדמח']
hebrew_tokens - Total number of words: 815
hebrew_tokens - Top 5 most frequent words:
לירן: 17
מדמח: 16
לא: 13
של: 13
את: 12



In [42]:
def lemmatize_hebrew(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    hebrew_tokens = [token for token in tokens if token.isalnum() and any('\u0590' <= char <= '\u05EA' for char in token)]

    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in hebrew_tokens]
    return lemmatized_tokens


lemmatized_tokens = lemmatize_hebrew(all_text)

print(lemmatized_tokens[:100])
display_word_statistics(lemmatized_tokens, "lemmatized_tokens")



['השיעור', 'התחיל', 'כי', 'הזום', 'לא', 'פעיל', 'לירן', 'מדמח', 'לא', 'תודה', 'לירן', 'מדמח', 'תתנו', 'לעמית', 'להיות', 'מנהל', 'בבקשה', 'כן', 'תתנו', 'לי', 'לירן', 'מדמח', 'למה', 'אתה', 'לא', 'פה', 'לירן', 'מדמח', 'בואו', 'הוא', 'מתחיל', 'לירן', 'מדמח', 'לירן', 'מדמח', 'התחלנו', 'לא', 'רואים', 'אתכם', 'בזום', 'לירן', 'מדמח', 'חוזרים', 'ביקשת', 'לתזכר', 'אותך', 'כאן', 'להעלות', 'למודל', 'סמינר', 'דוגמא', 'מסמך', 'בסיס', 'לסמינר', 'תודה', 'לירן', 'מדמח', 'חזרנו', 'ללמוד', 'מה', 'שלומכם', 'נשאלתי', 'לגבי', 'תוצרים', 'של', 'הקורס', 'להרחיב', 'את', 'הדעת', 'ב', 'ללמוד', 'לקרוא', 'לשחזר', 'את', 'הקוד', 'שמוצג', 'במאמר', 'לערוך', 'דיון', 'בתוצאות', 'השוואה', 'לריצות', 'שמאמר', 'השוואה', 'לאלגוריתמים', 'בין', 'אלגוריתמים', 'וכו', 'איפה', 'אנחנו', 'היום', 'סטס', 'שליח', 'פיליפ', 'פיקוס', 'לא', 'לשכוח', 'מאנשי', 'לירן', 'מדמח']
lemmatized_tokens - Total number of words: 815
lemmatized_tokens - Top 5 most frequent words:
לירן: 17
מדמח: 16
לא: 13
של: 13
את: 12



In [43]:
def stem_hebrew(text):
    stemmer = SnowballStemmer("porter")
    tokens = word_tokenize(text)
    hebrew_tokens = [token for token in tokens if token.isalnum() and any('\u0590' <= char <= '\u05EA' for char in token)]
    stemmed_tokens = [stemmer.stem(token) for token in hebrew_tokens]
    return stemmed_tokens
stemmed_tokens = stem_hebrew(all_text)

print(stemmed_tokens[:100])
display_word_statistics(stemmed_tokens, "stemmed_tokens")

['השיעור', 'התחיל', 'כי', 'הזום', 'לא', 'פעיל', 'לירן', 'מדמח', 'לא', 'תודה', 'לירן', 'מדמח', 'תתנו', 'לעמית', 'להיות', 'מנהל', 'בבקשה', 'כן', 'תתנו', 'לי', 'לירן', 'מדמח', 'למה', 'אתה', 'לא', 'פה', 'לירן', 'מדמח', 'בואו', 'הוא', 'מתחיל', 'לירן', 'מדמח', 'לירן', 'מדמח', 'התחלנו', 'לא', 'רואים', 'אתכם', 'בזום', 'לירן', 'מדמח', 'חוזרים', 'ביקשת', 'לתזכר', 'אותך', 'כאן', 'להעלות', 'למודל', 'סמינר', 'דוגמא', 'מסמך', 'בסיס', 'לסמינר', 'תודה', 'לירן', 'מדמח', 'חזרנו', 'ללמוד', 'מה', 'שלומכם', 'נשאלתי', 'לגבי', 'תוצרים', 'של', 'הקורס', 'להרחיב', 'את', 'הדעת', 'ב', 'ללמוד', 'לקרוא', 'לשחזר', 'את', 'הקוד', 'שמוצג', 'במאמר', 'לערוך', 'דיון', 'בתוצאות', 'השוואה', 'לריצות', 'שמאמר', 'השוואה', 'לאלגוריתמים', 'בין', 'אלגוריתמים', 'וכו', 'איפה', 'אנחנו', 'היום', 'סטס', 'שליח', 'פיליפ', 'פיקוס', 'לא', 'לשכוח', 'מאנשי', 'לירן', 'מדמח']
stemmed_tokens - Total number of words: 815
stemmed_tokens - Top 5 most frequent words:
לירן: 17
מדמח: 16
לא: 13
של: 13
את: 12

