In [1]:
# Cleaner version 1 
# Credit - Casey Staples
# Date - 10/3/2023

from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import os
import glob



all_files_text = []

# Grabbing each 'section' of articles for each group ex. all the sports articles in group 1. 
# I could probably automate this for all the articles, but I wanted to fine tune things for each group
sports_files = [file for file in glob.glob(r'C:\Users\Casey\Documents\GitHub\web-text-mining\articles\group_01_articles\*_sports.txt') if os.path.isfile(file)]

# Read each file and append the text to a list
for file in sports_files:
    with open(file, 'r') as f:
        all_files_text.append(f.read())


all_files_text

["Author: James Colgan\nDate: September 23, 2023\nPublication: golf.com/news\n\nLexi Thompson's Solheim Cup shank wasn't the talk of the golf world on Saturday — but her response to it was.\nGetty Images\nCASARES, Spain — Early in my GOLF.com career, I caught a devastating case of the shanks.\nIt was fall 2021 and I was in Southern California for a video series based around a swing lesson from legendary swing coach Dave Phillips. We hired a videographer and flew in our photographer and two fellow writers for the occasion, taking over the hulking Titleist Performance Institute for a few hours. (The other attendees won’t be named for purposes of protecting the innocent.)\nAfter a few minutes of polite conversation, we turned the cameras on and started swinging — which was also the moment I realized I was in for the worst hour of my life. I felt like I swung the club the same way I had for the previous six months of my life, but the result was decidedly not anything I was used to seeing. 

In [2]:
# Git rid of garabage characters
text = [x.replace('\n', ' ') for x in all_files_text]
text = [x.replace('\t', ' ') for x in text]

print('The article currently has {} sentences/paragraphs.'.format(len(text)))

# Set the text to lower case
text = [x.lower() for x in text]
text


The article currently has 16 sentences/paragraphs.


["author: james colgan date: september 23, 2023 publication: golf.com/news  lexi thompson's solheim cup shank wasn't the talk of the golf world on saturday — but her response to it was. getty images casares, spain — early in my golf.com career, i caught a devastating case of the shanks. it was fall 2021 and i was in southern california for a video series based around a swing lesson from legendary swing coach dave phillips. we hired a videographer and flew in our photographer and two fellow writers for the occasion, taking over the hulking titleist performance institute for a few hours. (the other attendees won’t be named for purposes of protecting the innocent.) after a few minutes of polite conversation, we turned the cameras on and started swinging — which was also the moment i realized i was in for the worst hour of my life. i felt like i swung the club the same way i had for the previous six months of my life, but the result was decidedly not anything i was used to seeing. instead 

In [3]:



#print('The article currently has {} sentences/paragraphs.'.format(len(text)))

In [4]:
# tokenize text
text = [word_tokenize(x) for x in text]

# tag the text
tagged_text = [pos_tag(x) for x in text]

tagged_text

[[('author', 'NN'),
  (':', ':'),
  ('james', 'NNS'),
  ('colgan', 'VBP'),
  ('date', 'NN'),
  (':', ':'),
  ('september', 'NN'),
  ('23', 'CD'),
  (',', ','),
  ('2023', 'CD'),
  ('publication', 'NN'),
  (':', ':'),
  ('golf.com/news', 'NNS'),
  ('lexi', 'VBP'),
  ('thompson', 'NN'),
  ("'s", 'POS'),
  ('solheim', 'JJ'),
  ('cup', 'NN'),
  ('shank', 'NN'),
  ('was', 'VBD'),
  ("n't", 'RB'),
  ('the', 'DT'),
  ('talk', 'NN'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('golf', 'NN'),
  ('world', 'NN'),
  ('on', 'IN'),
  ('saturday', 'JJ'),
  ('—', 'NNS'),
  ('but', 'CC'),
  ('her', 'PRP$'),
  ('response', 'NN'),
  ('to', 'TO'),
  ('it', 'PRP'),
  ('was', 'VBD'),
  ('.', '.'),
  ('getty', 'JJ'),
  ('images', 'NNS'),
  ('casares', 'NNS'),
  (',', ','),
  ('spain', 'VBP'),
  ('—', 'JJ'),
  ('early', 'JJ'),
  ('in', 'IN'),
  ('my', 'PRP$'),
  ('golf.com', 'NN'),
  ('career', 'NN'),
  (',', ','),
  ('i', 'VB'),
  ('caught', 'VBD'),
  ('a', 'DT'),
  ('devastating', 'JJ'),
  ('case', 'NN'),
  ('of', 

In [5]:
# Setup
lemmatizer = WordNetLemmatizer()
lemmatized_text = []


# Update the part of speech tags to be compatible with the lemmatizer
# Note: I didn't expilcitly use the univeral tagset, I might change it later.
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    elif treebank_tag.startswith('S'):
        return wordnet.ADJ_SAT
    else:
        return wordnet.NOUN
    
for sentence in tagged_text:
    lemmatized_text.append([lemmatizer.lemmatize(word[0], get_wordnet_pos(word[1])) for word in sentence])


print('The article currently has {} tokens.'.format(len(lemmatized_text)))

lemmatized_text

The article currently has 16 tokens.


[['author',
  ':',
  'james',
  'colgan',
  'date',
  ':',
  'september',
  '23',
  ',',
  '2023',
  'publication',
  ':',
  'golf.com/news',
  'lexi',
  'thompson',
  "'s",
  'solheim',
  'cup',
  'shank',
  'be',
  "n't",
  'the',
  'talk',
  'of',
  'the',
  'golf',
  'world',
  'on',
  'saturday',
  '—',
  'but',
  'her',
  'response',
  'to',
  'it',
  'be',
  '.',
  'getty',
  'image',
  'casares',
  ',',
  'spain',
  '—',
  'early',
  'in',
  'my',
  'golf.com',
  'career',
  ',',
  'i',
  'catch',
  'a',
  'devastating',
  'case',
  'of',
  'the',
  'shank',
  '.',
  'it',
  'be',
  'fall',
  '2021',
  'and',
  'i',
  'be',
  'in',
  'southern',
  'california',
  'for',
  'a',
  'video',
  'series',
  'base',
  'around',
  'a',
  'swing',
  'lesson',
  'from',
  'legendary',
  'swing',
  'coach',
  'dave',
  'phillips',
  '.',
  'we',
  'hire',
  'a',
  'videographer',
  'and',
  'flew',
  'in',
  'our',
  'photographer',
  'and',
  'two',
  'fellow',
  'writer',
  'for',
  'th

In [6]:
# create the stop list array, I've been adding to this as I've been going through the output
stops = list(set(stopwords.words('english'))) + [',', '.' , '-', 'however', 'ever' , 'also', '?' , '#', '@' ,'(', ')', "'s", "n't" , '``', "''", 
 "--"]

# get rid of stop words
text_no_stops = [[word for word in sentence if word not in stops] for sentence in lemmatized_text]

# get rid of punctuation
text_no_stops = [[word for word in sentence if word not in string.punctuation] for sentence in text_no_stops]

# get rid of works that contain numbers
text_no_stops = [[word for word in sentence if not any(char.isdigit() for char in word)] for sentence in text_no_stops]

# get rid of single character words
text_no_stops = [[word for word in sentence if len(word) > 1] for sentence in text_no_stops]

# get rid of contractions
text_no_stops = [[word for word in sentence if "'" not in word] for sentence in text_no_stops]

# get rid of .com
text_no_stops = [[word for word in sentence if ".com" not in word] for sentence in text_no_stops]

# get rid of non-printable characters
text_no_stops = [[word for word in sentence if word.isprintable()] for sentence in text_no_stops]

# get rid of emojis by looking for non-ascii characters
text_no_stops = [[word for word in sentence if word.encode('ascii', 'ignore').decode('ascii') == word] for sentence in text_no_stops]

# get rid of words that contain - / or . 
text_no_stops = [[word for word in sentence if '-' not in word] for sentence in text_no_stops]
text_no_stops = [[word for word in sentence if '/' not in word] for sentence in text_no_stops]
text_no_stops = [[word for word in sentence if '.' not in word] for sentence in text_no_stops]

text_no_stops

[['author',
  'james',
  'colgan',
  'date',
  'september',
  'publication',
  'lexi',
  'thompson',
  'solheim',
  'cup',
  'shank',
  'talk',
  'golf',
  'world',
  'saturday',
  'response',
  'getty',
  'image',
  'casares',
  'spain',
  'early',
  'career',
  'catch',
  'devastating',
  'case',
  'shank',
  'fall',
  'southern',
  'california',
  'video',
  'series',
  'base',
  'around',
  'swing',
  'lesson',
  'legendary',
  'swing',
  'coach',
  'dave',
  'phillips',
  'hire',
  'videographer',
  'flew',
  'photographer',
  'two',
  'fellow',
  'writer',
  'occasion',
  'take',
  'hulking',
  'titleist',
  'performance',
  'institute',
  'hour',
  'attendee',
  'win',
  'name',
  'purpose',
  'protect',
  'innocent',
  'minute',
  'polite',
  'conversation',
  'turn',
  'camera',
  'start',
  'swing',
  'moment',
  'realize',
  'bad',
  'hour',
  'life',
  'felt',
  'like',
  'swing',
  'club',
  'way',
  'previous',
  'six',
  'month',
  'life',
  'result',
  'decidedly',
  'a

In [7]:

# get rid of duplicate words in each sentence
text_no_stops = [list(set(sentence)) for sentence in text_no_stops]

text_no_stops



[['win',
  'gender',
  'purpose',
  'vertical',
  'championship',
  'consecutive',
  'decade',
  'underexposed',
  'green',
  'role',
  'ask',
  'long',
  'question',
  'purse',
  'titleist',
  'write',
  'disappointed',
  'felt',
  'new',
  'understand',
  'hulking',
  'consider',
  'era',
  'episode',
  'stub',
  'play',
  'prior',
  'astute',
  'speechless',
  'pretty',
  'nearly',
  'funny',
  'saw',
  'illustrate',
  'unchecked',
  'nobody',
  'tricky',
  'excuse',
  'performance',
  'field',
  'come',
  'want',
  'afford',
  'though',
  'toe',
  'exist',
  'somewhere',
  'quickly',
  'thing',
  'video',
  'receive',
  'defense',
  'single',
  'limit',
  'flew',
  'inherently',
  'writer',
  'island',
  'yet',
  'former',
  'university',
  'coworkers',
  'adrenaline',
  'accountability',
  'reason',
  'rude',
  'devastating',
  'feeling',
  'already',
  'impactful',
  'catch',
  'immediate',
  'terrible',
  'scrutiny',
  'softball',
  'note',
  'see',
  'human',
  'tournament',
  

In [None]:
# save each cleaned article to a new file
for i in range(len(text_no_stops)):
    with open('01{}_cleaned.txt'.format(i), 'w') as f:
        f.write(' '.join(text_no_stops[i]))