# NLP Basics: Reading in text data & why do we need to clean the text?

### Read in semi-structured text data

In [1]:
# Read the raw SMS Spam Collection text (from the UCI Machine Learning Repository).
# The context manager closes the file handle as soon as the read finishes, and
# the explicit encoding keeps the result independent of the platform's locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against the data source.
with open("SMSSpamCollection.tsv", encoding="utf-8") as rawFile:
    rawData = rawFile.read()

# Preview the raw data (first 500 characters)
rawData[0:500]

"ham\tI've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.\nspam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's\nham\tNah I don't think he goes to usf, he lives around here though\nham\tEven my brother is not like to speak with me. They treat me like aid"

In [2]:
# Flatten the file into one list: turn every tab into a newline, then split
# on newlines so labels and message bodies alternate in a single sequence
parsedData = (
    rawData
    .replace('\t', '\n')
    .split('\n')
)

In [3]:
# Show the first five entries of the flattened list
parsedData[:5]

['ham',
 "I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.",
 'spam',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'ham']

In [4]:
# Labels ('ham' / 'spam') sit at the even positions of the flattened list
labelList = parsedData[::2]

# The message text that follows each label sits at the odd positions
textList = parsedData[1::2]

In [5]:
# Peek at the first five labels and the first five message bodies
print(labelList[:5])
print(textList[:5])

['ham', 'spam', 'ham', 'ham', 'ham']
["I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times.", "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", "Nah I don't think he goes to usf, he lives around here though", 'Even my brother is not like to speak with me. They treat me like aids patent.', 'I HAVE A DATE ON SUNDAY WITH WILL!!']


### Doing further analysis in a DataFrame

In [6]:
# First attempt at building a DataFrame from the two lists. Their lengths
# differ at this point, so pandas rejects them. The error is caught and shown
# instead of being left uncaught, so that "Restart & Run All" can continue to
# the cells that diagnose the mismatch.
import pandas as pd

try:
    fullCorpus = pd.DataFrame({
        'label': labelList,
        'body_list': textList
    })
except ValueError as lengthError:
    # pandas raises ValueError when the column arrays differ in length
    print(f"{type(lengthError).__name__}: {lengthError}")
else:
    print(fullCorpus.head())

ValueError: All arrays must be of the same length

In [7]:
# Compare the sizes of the two lists, one per line (labels first, then texts)
print(len(labelList), len(textList), sep='\n')

5571
5570


In [8]:
# Inspect the last 5 elements of labelList to find where its extra entry comes from
print(labelList[-5:])

['ham', 'ham', 'ham', 'ham', '']


In [9]:
# Second attempt: leave out the final entry of labelList so that both
# columns end up with the same number of rows
fullCorpus = pd.DataFrame(
    dict(label=labelList[:-1], body_list=textList)
)

fullCorpus.head()

Unnamed: 0,label,body_list
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Shortcut for reading the data

In [11]:
# Let pandas parse the tab-separated file directly. header=None because the
# file has no header row, so the columns are simply numbered 0 and 1.
# NOTE(review): read_csv applies its default quote handling, unlike the manual
# split above — confirm both approaches yield the same number of rows.
dataset = pd.read_csv(
    "SMSSpamCollection.tsv",
    delimiter='\t',
    header=None,
)
dataset.head()

Unnamed: 0,0,1
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!
