## Installing NLK and Checking Library 

In [1]:
import nltk

In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
dir(nltk)

['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'Cistem',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGrap

In [4]:
from nltk.corpus import stopwords

In [6]:
stopwords.words('english')[:500:25]

['i', 'herself', 'been', 'with', 'here', 'very', 'doesn', 'won']

## Reading Unstructured Text Data

In [15]:
rawData = open('SMSSpamCollection.tsv').read()

In [16]:
rawData[:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [17]:
parsedData = rawData.replace('\t', '\n').split('\n')

In [18]:
parsedData[0:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [19]:
labelList = parsedData[0::2]

In [23]:
textList = parsedData[1::2]

In [21]:
print(labelList[0:5])

['ham', 'spam', 'ham', 'ham', 'ham']


In [24]:
print(textList[0:5])

["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [27]:
print(len(labelList))

5571


In [28]:
print(len(textList))

5570


In [25]:
import pandas as pd

In [30]:
fullCorpus = pd.DataFrame({'label':labelList[:-1], 
                          'body_list':textList
                          })
fullCorpus.head()

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [37]:
fullCorpus = pd.read_csv('SMSSpamCollection.tsv', 
                      sep='\t', header=None)

In [39]:
fullCorpus.columns=['label', 'body_text']
fullCorpus.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [46]:
print('Input data has {} rows and  {} columns'.format(len(fullCorpus), 
      len(fullCorpus.columns)))

Input data has 5568 rows and  2 columns


In [52]:
# How many spam/ham are there?
print("Out of {} rows, {} are spam and {} are ham".format(
    len(fullCorpus), 
    len(fullCorpus[fullCorpus['label']=='spam']), 
    len(fullCorpus[fullCorpus['label']=='ham'])
    )
)

Out of 5568 rows, 746 are spam and 4822 are ham


In [57]:
# How much missing data there is?
print("Number of nulls in the label is {}".format(fullCorpus['label'].isnull().sum()))
      
print("Number of nulls in the body is  {}".format(fullCorpus['body_text'].isnull().sum()))

Number of nulls in the label is 0
Number of nulls in the body is  0


## Regular Expressions to Manipulate Text
`nlp`, `[j-q]`, `[j-q]+`, `[0-9]+`, `[j-q0-09]+` ===> Usage `findall()` and `split()`
- Used to identify whitespace between words/tokens
- Create delimiters or end-of-line escape characters
- Removing punctuation or numbers from the text
- Cleaning HTML tags from text

In [59]:
re_test = 'This is a made up string to test 2 different regex methods'
re_test_messy = 'This     is a made up     string to test 2    different regex methods'
re_test_messy2 = 'This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods'


In [60]:
re.split('\s', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [61]:
re.split('\s', re_test_messy)

['This',
 '',
 '',
 '',
 '',
 'is',
 'a',
 'made',
 'up',
 '',
 '',
 '',
 '',
 'string',
 'to',
 'test',
 '2',
 '',
 '',
 '',
 'different',
 'regex',
 'methods']

In [62]:
re.split('\s+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [63]:
re.split('\s+', re_test_messy2)

['This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods']

In [64]:
re.split('\W+', re_test_messy2)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [65]:
re.findall('\S+', re_test)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [66]:
re.findall('\S+', re_test_messy)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

In [67]:
re.findall('\S+', re_test_messy2)

['This-is-a-made/up.string*to>>>>test----2"""""different-regex-methods']

In [69]:
re.findall('\w+', re_test_messy2)

['This',
 'is',
 'a',
 'made',
 'up',
 'string',
 'to',
 'test',
 '2',
 'different',
 'regex',
 'methods']

## Replacing Specific String

In [70]:
pep8_test = 'I try to follow PEP8 guidelines'
pep7_test = 'I try to follow PEP7 guidelines'
peep8_test = 'I try to follow PEEP8 guidelines'

In [75]:
re.findall('[A-Z]+[0-9]+', peep8_test)

['PEEP8']

In [76]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep8_test)

'I try to follow PEP8 Python Styleguide guidelines'

In [77]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', pep7_test)

'I try to follow PEP8 Python Styleguide guidelines'

In [78]:
re.sub('[A-Z]+[0-9]+', 'PEP8 Python Styleguide', peep8_test)

'I try to follow PEP8 Python Styleguide guidelines'