In [20]:
# imports

import urllib.request
import nltk
nltk.download('punkt')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import RegexpTokenizer


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
# Source locations of the three lab corpora (Shakespeare, news, Switchboard).
shakespeare_url, news_url, swbd_url = (
    "http://www.cs.columbia.edu/~sarahita/CL/lab1/shakespeare.txt",
    "http://www.cs.columbia.edu/~sarahita/CL/lab1/news.txt",
    "http://www.cs.columbia.edu/~sarahita/CL/lab1/swbd.txt",
)

def get_data(url):
  """Fetch ``url`` and return the response body decoded as UTF-8 text.

  The response is opened in a ``with`` block so it is closed as soon as the
  body has been read, even if reading or decoding raises.

  Args:
    url: Address to fetch; passed unchanged to ``urllib.request.urlopen``.

  Returns:
    The full response body as a ``str``.

  Raises:
    urllib.error.URLError: If ``urlopen`` cannot complete the request.
    UnicodeDecodeError: If the body is not valid UTF-8.
  """
  with urllib.request.urlopen(url) as response:
    return response.read().decode('utf-8')

# Download each corpus once; later cells work on these in-memory strings.
shakespeare_data = get_data(url=shakespeare_url)
news_data = get_data(url=news_url)
swbd_data = get_data(url=swbd_url)

In [22]:
# Sanity check: show the first 100 characters of the Switchboard transcript.
print(swbd_data[0:100])


A.1: Uh, do you have a pet Randy? 
B.2: Uh, yeah, currently we have a poodle. 
A.3: A poodle, minia


**Word Tokenization**

Tokenize each of the 3 datasets using the NLTK word tokenizer. 
Count the number of tokens for each dataset, as well as the vocabulary size (number of unique tokens).  Record the results in the table below.  (1 pt)

In [23]:
# Tokenize each corpus with NLTK's word tokenizer.
tokenized_shakespeare = word_tokenize(shakespeare_data)
tokenized_news = word_tokenize(news_data)
tokenized_swbd = word_tokenize(swbd_data)

# Corpus size: total number of tokens.
numTokens_shakespeare = len(tokenized_shakespeare)
numTokens_news = len(tokenized_news)
numTokens_swbd = len(tokenized_swbd)

# Vocabulary size: number of distinct tokens (case-sensitive at this stage,
# since no lowercasing has been applied yet).
vocabSize_shakespeare = len(set(tokenized_shakespeare))
vocabSize_news = len(set(tokenized_news))
vocabSize_swbd = len(set(tokenized_swbd))

# Vocabulary-to-token ratio. Bound to names because a notebook cell only
# echoes its final expression; a bare ratio in the middle of the cell is lost.
vocabRatio_shakespeare = vocabSize_shakespeare / numTokens_shakespeare
vocabRatio_news = vocabSize_news / numTokens_news
vocabRatio_swbd = vocabSize_swbd / numTokens_swbd

# Ten most frequent tokens, as (token, count) pairs.
top10_shakespeare = nltk.FreqDist(tokenized_shakespeare).most_common(10)
top10_news = nltk.FreqDist(tokenized_news).most_common(10)
top10_swbd = nltk.FreqDist(tokenized_swbd).most_common(10)

# Final expression: displayed as the cell output so the figures can be recorded.
{
    "shakespeare": {
        "tokens": numTokens_shakespeare,
        "vocab": vocabSize_shakespeare,
        "vocab/tokens": vocabRatio_shakespeare,
        "top10": top10_shakespeare,
    },
    "news": {
        "tokens": numTokens_news,
        "vocab": vocabSize_news,
        "vocab/tokens": vocabRatio_news,
        "top10": top10_news,
    },
    "swbd": {
        "tokens": numTokens_swbd,
        "vocab": vocabSize_swbd,
        "vocab/tokens": vocabRatio_swbd,
        "top10": top10_swbd,
    },
}

**Normalization**


In this next step we will normalize the tokenized text by applying the following steps, in order:

Convert all text to lowercase

Remove punctuation (use string.punctuation for a list of punctuation marks)

Remove stopwords (use the English list from nltk.corpus.stopwords)

In [24]:
pip install textstat

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [25]:
import string 
import textstat
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
# Step 1: lowercase every token.
lower_shakespeare = [w.lower() for w in tokenized_shakespeare]
lower_news = [w.lower() for w in tokenized_news]
lower_swbd = [w.lower() for w in tokenized_swbd]

# Step 2: remove punctuation.
punctuation = set(string.punctuation)

def _is_punctuation(token):
  """Return True when ``token`` is non-empty and made up only of punctuation.

  NLTK's word tokenizer emits multi-character punctuation tokens such as
  "``", "''", "--" and "...". A plain ``token in punctuation`` test only
  matches single characters and would let those through, so every character
  of the token is checked instead. The typographic apostrophe is not part of
  ``string.punctuation`` and is therefore tested explicitly.
  """
  return bool(token) and all(ch in punctuation or ch == '’' for ch in token)

noPunctuation_shakespeare = [word for word in lower_shakespeare if not _is_punctuation(word)]
noPunctuation_news = [word for word in lower_news if not _is_punctuation(word)]
noPunctuation_swbd = [word for word in lower_swbd if not _is_punctuation(word)]

# Step 3: remove English stopwords (set membership keeps the filter linear).
stopWords = set(stopwords.words('english'))

nostopWords_shakespeare = [word for word in noPunctuation_shakespeare if word not in stopWords]
nostopWords_news = [word for word in noPunctuation_news if word not in stopWords]
nostopWords_swbd = [word for word in noPunctuation_swbd if word not in stopWords]

# The fully normalized token lists.
normalized_shakespeare = nostopWords_shakespeare
normalized_news = nostopWords_news
normalized_swbd = nostopWords_swbd

# Token counts after normalization.
numTokens_shakespeare = len(normalized_shakespeare)
numTokens_news = len(normalized_news)
numTokens_swbd = len(normalized_swbd)

# Vocabulary sizes after normalization.
vocabSize_shakespeare = len(set(normalized_shakespeare))
vocabSize_news = len(set(normalized_news))
vocabSize_swbd = len(set(normalized_swbd))

# Vocabulary-to-token ratio. Bound to names because a notebook cell only
# echoes its final expression; a bare ratio in the middle of the cell is lost.
vocabRatio_shakespeare = vocabSize_shakespeare / numTokens_shakespeare
vocabRatio_news = vocabSize_news / numTokens_news
vocabRatio_swbd = vocabSize_swbd / numTokens_swbd

# Ten most frequent normalized tokens, as (token, count) pairs.
top10_shakespeare = nltk.FreqDist(normalized_shakespeare).most_common(10)
top10_news = nltk.FreqDist(normalized_news).most_common(10)
top10_swbd = nltk.FreqDist(normalized_swbd).most_common(10)

# Final expression: displayed as the cell output so the figures can be recorded.
{
    "shakespeare": {
        "tokens": numTokens_shakespeare,
        "vocab": vocabSize_shakespeare,
        "vocab/tokens": vocabRatio_shakespeare,
        "top10": top10_shakespeare,
    },
    "news": {
        "tokens": numTokens_news,
        "vocab": vocabSize_news,
        "vocab/tokens": vocabRatio_news,
        "top10": top10_news,
    },
    "swbd": {
        "tokens": numTokens_swbd,
        "vocab": vocabSize_swbd,
        "vocab/tokens": vocabRatio_swbd,
        "top10": top10_swbd,
    },
}

In [27]:
# Restrict each corpus to its first 20,000 characters; everything below works
# on these samples. NOTE: this rebinds the *_data names, so re-running earlier
# cells after this one will see the truncated text.
SAMPLE_CHARS = 20000
shakespeare_data = shakespeare_data[:SAMPLE_CHARS]
news_data = news_data[:SAMPLE_CHARS]
swbd_data = swbd_data[:SAMPLE_CHARS]

# TTR (Type to Token Ratio)
# The samples are strings, so len() and set() applied to them directly would
# count characters. Tokenize first so that types and tokens are words.
sampleTokens_shakespeare = word_tokenize(shakespeare_data)
sampleTokens_news = word_tokenize(news_data)
sampleTokens_swbd = word_tokenize(swbd_data)

numTokens_shakespeare = len(sampleTokens_shakespeare)
numTokens_news = len(sampleTokens_news)
numTokens_swbd = len(sampleTokens_swbd)

numTypes_shakespeare = len(set(sampleTokens_shakespeare))
numTypes_news = len(set(sampleTokens_news))
numTypes_swbd = len(set(sampleTokens_swbd))

ttr_shakespeare = numTypes_shakespeare / numTokens_shakespeare
ttr_news = numTypes_news / numTokens_news
ttr_swbd = numTypes_swbd / numTokens_swbd

# Flesch Reading Ease Score.
# Assesses the ease of readability of a document.
# Maximum score is 121.22; there is no limit on how low the score can be.
flesch_shakespeare = textstat.flesch_reading_ease(shakespeare_data)
flesch_news = textstat.flesch_reading_ease(news_data)
# The Switchboard result is also kept under its original name `flesch_data`.
flesch_swbd = flesch_data = textstat.flesch_reading_ease(swbd_data)

# Readability Consensus.
# Returns the estimated school grade level required to understand the text.
consensus_shakespeare = textstat.text_standard(shakespeare_data)
consensus_news = textstat.text_standard(news_data)
# The Switchboard result is also kept under its original name `consensus_data`.
consensus_swbd = consensus_data = textstat.text_standard(swbd_data)

# McAlpine EFLAW Readability Score.
# Returns a score for the readability of an English text for a foreign learner
# of English, focusing on the number of miniwords and the length of sentences.
mcalpine_eflaw_shakespeare = textstat.mcalpine_eflaw(shakespeare_data)
mcalpine_eflaw_news = textstat.mcalpine_eflaw(news_data)
# The Switchboard result is also kept under its original name `mcalpine_eflaw_data`.
mcalpine_eflaw_swbd = mcalpine_eflaw_data = textstat.mcalpine_eflaw(swbd_data)