# Importing dependencies

In [1]:
#dependency libraries installed as needed
#pip install autocorrect

In [2]:
import pandas as pd
import os
from bs4 import BeautifulSoup as bs
import re
import glob
import nltk
import pprint
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import Text
from nltk import FreqDist
from nltk.corpus import PlaintextCorpusReader #NLTK Reference Book 1.9: Loading your own corpus
from nltk.corpus import wordnet as wn #lemmatize -- using synonyms - identifying similar terms within a corpus
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
#is based on The Porter Stemming Algorithm

# Using python and nltk

## Parsing html.txt files

In [3]:
# Reading article full text in for processing -- article_fulltext is text from: 
    #journal-article-10.1086_380851.txt (Library Quarterly, vol. 74, no. 1, 2004)

with open("article_fulltext.txt", encoding='utf8') as infile: 
    html_doc = bs(infile, "html.parser")
html_doc

<body xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<sec id="sc1">
<title>Introduction</title>
<p>They call to us from newsstands and racks by grocery store checkouts, the bright colors on their covers competing with the candy displays; teen magazines are the literary sweet that satisfies the craving for normalcy. The covers of the latest teen magazines aimed at adolescent girls promise readers dates, beauty, and success, and feature celebrity role models smiling beneath the mastheads. The experience of reading the glossy booklets seems ultimately depressing; compared to the rich superstar singer, the skinniest model, even the glamorous writers, the reader would require an identity facelift to compete with these supposed peers. Yet teen magazines continue to flourish as, in the last five years, traditionally women’s magazines have released “little sister” editions of their titles, includi

## Selecting "p" tags and extracting all text then converting to lower case.

In [4]:
paragraphs=html_doc.find_all('p')
file_text = ''
for para in paragraphs:
    file_text += para.text 

# print (para.text)    
file_text = file_text.lower()
# paragraphs
# file_text

## Removing everything except letters, periods, and spaces.

In [5]:
file_text = re.sub(r'[^A-Za-z. ]', '', file_text)
file_text

'they call to us from newsstands and racks by grocery store checkouts the bright colors on their covers competing with the candy displays teen magazines are the literary sweet that satisfies the craving for normalcy. the covers of the latest teen magazines aimed at adolescent girls promise readers dates beauty and success and feature celebrity role models smiling beneath the mastheads. the experience of reading the glossy booklets seems ultimately depressing compared to the rich superstar singer the skinniest model even the glamorous writers the reader would require an identity facelift to compete with these supposed peers. yet teen magazines continue to flourish as in the last five years traditionally womens magazines have released little sister editions of their titles including the bestselling cosmogirl in  teentargeted magazines seventeen ym and cosmogirl were among abcs top  circulating magazines with average circulations of   and  respectively mpa resources. amid the arguments ag

In [6]:
type(file_text)

str

## TOKENIZE - to run: Concordance, Lemmitizing, Stopwords

In [7]:
# to use NLTK functions such as concordance, lemmitize, stopwords on text files -- 
# you have to run tokenize command on text:

tokens = word_tokenize(file_text)
textNLTK = Text(tokens)

textNLTK.concordance("feminine")

Displaying 16 of 16 matches:
e and reader represents a distinct feminine space while the demands of the tex
 and girls magazines celebrate the feminine inner space emphasized in the girl
ns and girls magazines to create a feminine space . many girls magazines begin
cognize that magazine standards of feminine beauty are unrealistic experience 
eriod or the view by others of any feminine hygiene product beauty blunders cl
wrapped tampons or the purchase of feminine hygiene items . additional harbing
hand present beneath the text . if feminine success is measured by the avoidan
fied these ideas of selfdenial and feminine presentation in their book underst
cern with normalizing her body the feminine adolescent body is constructed in 
laster and colleagues the use of a feminine discourse the manipulation and cre
rtising is displayed in a uniquely feminine space erikson where as ballaster w
lly accepted by womenincluding the feminine virtues of passive goodness person
ation of eriksons study

In [8]:
#lemmitizing - similar to tokenizing -- needs to be done before concordance, stopwords, lemmitizing
wordnet_lemmatizer = WordNetLemmatizer()
stopword = stopwords.words('english')
snowball_stemmer = SnowballStemmer('english')
lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for 
                   word in textNLTK]

#lemmatized_words
# stemmed_words = [snowball_stemmer.stem(word) for word in textNLTK]
# stemmed_words

In [9]:
#tokenizing sentences -- maybe more useful for words for keeping context?

sent_token = nltk.sent_tokenize(file_text)
print (sent_token)

['they call to us from newsstands and racks by grocery store checkouts the bright colors on their covers competing with the candy displays teen magazines are the literary sweet that satisfies the craving for normalcy.', 'the covers of the latest teen magazines aimed at adolescent girls promise readers dates beauty and success and feature celebrity role models smiling beneath the mastheads.', 'the experience of reading the glossy booklets seems ultimately depressing compared to the rich superstar singer the skinniest model even the glamorous writers the reader would require an identity facelift to compete with these supposed peers.', 'yet teen magazines continue to flourish as in the last five years traditionally womens magazines have released little sister editions of their titles including the bestselling cosmogirl in  teentargeted magazines seventeen ym and cosmogirl were among abcs top  circulating magazines with average circulations of   and  respectively mpa resources.', 'amid the

In [10]:
#stopwords - don't need nltk before word_tokenize for this step, but do need for some other commands
word_tokens = word_tokenize(file_text)
text_cleaned = [word for word in word_tokens if word not in stopword]
print (text_cleaned)


['call', 'us', 'newsstands', 'racks', 'grocery', 'store', 'checkouts', 'bright', 'colors', 'covers', 'competing', 'candy', 'displays', 'teen', 'magazines', 'literary', 'sweet', 'satisfies', 'craving', 'normalcy', '.', 'covers', 'latest', 'teen', 'magazines', 'aimed', 'adolescent', 'girls', 'promise', 'readers', 'dates', 'beauty', 'success', 'feature', 'celebrity', 'role', 'models', 'smiling', 'beneath', 'mastheads', '.', 'experience', 'reading', 'glossy', 'booklets', 'seems', 'ultimately', 'depressing', 'compared', 'rich', 'superstar', 'singer', 'skinniest', 'model', 'even', 'glamorous', 'writers', 'reader', 'would', 'require', 'identity', 'facelift', 'compete', 'supposed', 'peers', '.', 'yet', 'teen', 'magazines', 'continue', 'flourish', 'last', 'five', 'years', 'traditionally', 'womens', 'magazines', 'released', 'little', 'sister', 'editions', 'titles', 'including', 'bestselling', 'cosmogirl', 'teentargeted', 'magazines', 'seventeen', 'ym', 'cosmogirl', 'among', 'abcs', 'top', 'circu

In [11]:
#lemmatize -- using synonyms - identifying similar terms within a corpus - uses wordnet: NLTK Reference 2.5.1

wn.synsets('woman')

[Synset('woman.n.01'),
 Synset('woman.n.02'),
 Synset('charwoman.n.01'),
 Synset('womanhood.n.02')]

In [12]:
# wn.synset('woman.n.01').lemma_names() # synonym set for term
# wn.synset('woman.n.01').definition() # definition for term
wn.lemmas('woman') #access all versions of term at once

[Lemma('woman.n.01.woman'),
 Lemma('woman.n.02.woman'),
 Lemma('charwoman.n.01.woman'),
 Lemma('womanhood.n.02.woman')]

In [13]:
# hyponyms search - across possible similar terms -- ones for woman are shocking -- 
# but useful for literacy, writing in similar search sets 
 
term_test1 = wn.synset('woman.n.01')
types_of_test1 = term_test1.hyponyms()
sorted(lemma.name() for synset in types_of_test1 for lemma in synset.lemmas())

['B-girl',
 'Black_woman',
 'Cinderella',
 'Delilah',
 'Wac',
 'Wave',
 'amazon',
 'bachelor_girl',
 'bachelorette',
 'baggage',
 'ball-breaker',
 'ball-buster',
 'bar_girl',
 'bas_bleu',
 'bawd',
 'beauty',
 'bluestocking',
 'bridesmaid',
 'broad',
 'cat',
 'cocotte',
 'coquette',
 'cyprian',
 'dame',
 'deb',
 'debutante',
 'dish',
 'divorcee',
 'dominatrix',
 'donna',
 'enchantress',
 'ex',
 'ex-wife',
 'eyeful',
 'fancy_woman',
 'fancy_woman',
 'femme_fatale',
 'fille',
 'flirt',
 'geisha',
 'geisha_girl',
 'gentlewoman',
 'girl',
 'girl',
 'girl',
 'girlfriend',
 'girlfriend',
 'gold_digger',
 'grass_widow',
 'gravida',
 'harlot',
 'heroine',
 'houri',
 'inamorata',
 'jezebel',
 'jilt',
 'kept_woman',
 'knockout',
 'lady',
 'lady',
 'lady_friend',
 'lady_of_pleasure',
 'looker',
 'lulu',
 "ma'am",
 'madam',
 'maenad',
 'maenad',
 'maid_of_honor',
 'mantrap',
 'married_woman',
 'materfamilias',
 'matriarch',
 'matriarch',
 'matron',
 'mestiza',
 'minx',
 'miss',
 'missy',
 'mistress

In [14]:

# file_textNLTK.similar("SEARCHTERM") 

fdist1 = FreqDist(text_cleaned)
fdist1

FreqDist({'.': 220, 'magazines': 73, 'teen': 57, 'girls': 53, 'magazine': 42, 'may': 39, 'p.': 39, 'readers': 35, 'reader': 34, 'identity': 30, ...})

## Generating ngrams

In [15]:
def ngrams(input, n):
    input = input.split(' ')
    output = []
    for i in range(len(input)-n+1):
        output.append(input[i:i+n])
    return output

In [16]:
# bigram on pre-tokenized text - error on post-tokenized -- NEED TO INVESTIGATE

bigram = ngrams(file_text, 2)
bigram

[['they', 'call'],
 ['call', 'to'],
 ['to', 'us'],
 ['us', 'from'],
 ['from', 'newsstands'],
 ['newsstands', 'and'],
 ['and', 'racks'],
 ['racks', 'by'],
 ['by', 'grocery'],
 ['grocery', 'store'],
 ['store', 'checkouts'],
 ['checkouts', 'the'],
 ['the', 'bright'],
 ['bright', 'colors'],
 ['colors', 'on'],
 ['on', 'their'],
 ['their', 'covers'],
 ['covers', 'competing'],
 ['competing', 'with'],
 ['with', 'the'],
 ['the', 'candy'],
 ['candy', 'displays'],
 ['displays', 'teen'],
 ['teen', 'magazines'],
 ['magazines', 'are'],
 ['are', 'the'],
 ['the', 'literary'],
 ['literary', 'sweet'],
 ['sweet', 'that'],
 ['that', 'satisfies'],
 ['satisfies', 'the'],
 ['the', 'craving'],
 ['craving', 'for'],
 ['for', 'normalcy.'],
 ['normalcy.', 'the'],
 ['the', 'covers'],
 ['covers', 'of'],
 ['of', 'the'],
 ['the', 'latest'],
 ['latest', 'teen'],
 ['teen', 'magazines'],
 ['magazines', 'aimed'],
 ['aimed', 'at'],
 ['at', 'adolescent'],
 ['adolescent', 'girls'],
 ['girls', 'promise'],
 ['promise', 'reade

In [17]:
type(bigram)

list

In [18]:
df=pd.DataFrame(bigram, columns =['T1', 'T2'])

df.head()


Unnamed: 0,T1,T2
0,they,call
1,call,to
2,to,us
3,us,from
4,from,newsstands


In [19]:
df2=df.groupby(["T1", "T2"]).size().reset_index(name='count')
# df2 = df[df.T1 != "."]
df2

Unnamed: 0,T1,T2,count
0,,,1
1,,.,2
2,,a,2
3,,adults,1
4,,analysis,1
5,,and,6
6,,as,1
7,,at,1
8,,book,1
9,,circulating,1


In [20]:
#Save to csv file
df2.to_csv(r'test_380851.csv')


To add filenames to column - 
2 steps:
get all of the filenames into one list - function - for loop - list within - 
need new column - filename
append list as a new column in a dataframe
SLACK STACK OVERFLOW -- 
