# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [2]:
# Read in the raw text. The `with` block closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection, and
# the explicit encoding keeps the result independent of the platform default
# (pandas' read_csv, used later in this notebook, also decodes as UTF-8).
with open("SMSSpamCollection.tsv", encoding="utf-8") as rawFile:
    rawData = rawFile.read()

# Preview the first 500 characters: tabs separate each label from its message
# and newlines separate the records.
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [3]:
# Turn every tab into a line break (split on \t, re-join with \n), then split
# on \n so labels and message texts end up in one flat list.
parsedData = '\n'.join(rawData.split('\t')).split('\n')

In [5]:
parsedData[:5]    # first five fields: labels and message texts alternate

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [6]:
# Labels sit at the even positions of parsedData and message texts at the odd
# positions, so a step-2 slice from offset 0 and from offset 1 separates them.
labelList, textList = parsedData[::2], parsedData[1::2]

In [8]:
# Show the first five labels on one line, then the matching five texts on the next.
print(labelList[:5], textList[:5], sep="\n")

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


In [14]:
# Let's create a dataframe called fullCorpus with two columns and the
# corresponding headers 'label' and 'body_list':

import pandas as pd

# The two lists differ in length at this point, so the constructor raises a
# ValueError. Catch it and print the message so the demonstration stays
# visible without halting a "Restart & Run All" of the notebook.
try:
    fullCorpus = pd.DataFrame({
        'label': labelList,        # dict keys become column names, values the column data
        'body_list': textList
    })
except ValueError as err:
    print(f"ValueError: {err}")
else:
    # `display` is provided by IPython inside a notebook kernel.
    display(fullCorpus.head())

ValueError: All arrays must be of the same length

In [15]:
# The ValueError above complains about mismatched lengths, so compare the
# size of each list (labels first, then texts):

print(len(labelList), len(textList), sep="\n")

5571
5570


In [16]:
# Inspect the tail of the label list to find the extra entry:

print(labelList[slice(-5, None)])    # the final element is an empty string, which can be dropped

['ham', 'ham', 'ham', 'ham', '']


In [17]:
# Rebuild the DataFrame that failed earlier, this time leaving off the final
# (blank) label so that both columns have the same length.
fullCorpus = pd.DataFrame(dict(label=labelList[:-1], body_list=textList))

fullCorpus.head()

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [18]:
# Note: the \t characters in the raw data show that the file is tab-delimited,
# so pandas can parse it directly; read_table uses a tab separator by default.
# NOTE(review): pandas applies its default quote handling here, which the
# manual split earlier in the notebook does not -- confirm both give the same rows.

dataset = pd.read_table("SMSSpamCollection.tsv", header=None)
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
