# NLP on Un Structured Data 

In [1]:
import nltk
from nltk.tokenize import *
from nltk.stem import PorterStemmer
import pandas as pd
import string as st

In [2]:
text ='My name is Muhammd Ahmad Khan'
tokens=word_tokenize(text)
print(tokens)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\Azam/nltk_data'
    - 'c:\\users\\azam\\appdata\\local\\programs\\python\\python39\\nltk_data'
    - 'c:\\users\\azam\\appdata\\local\\programs\\python\\python39\\share\\nltk_data'
    - 'c:\\users\\azam\\appdata\\local\\programs\\python\\python39\\lib\\nltk_data'
    - 'C:\\Users\\Azam\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


# 1. Data Structuring

In [None]:
raw_data = open('SMSSpamCollection').read() #getting raw data from file using Open Method
#Its time for Structuring the data
parsed_data=raw_data.replace('\t','\n').split('\n') #replacing every '\t' with '\n' and splitting this into array on every '\n'

#Splitting Tags in different List and messages in Different List
tags_list=parsed_data[0::2]
msgs_list=parsed_data[1::2]

In [None]:
final_data = pd.DataFrame({
    'label':tags_list[:-1],
    'message':msgs_list
})
final_data.head()

In [None]:
#Data Structuring using Pandas 
dataset = pd.read_csv('SMSSpamCollection', sep='\t', header=None)
dataset.columns=['label','message']
dataset.head()

# 2. Data Exploration

In [None]:
#Shape of the Data
print(f'Input DataSet has {len(dataset)} rows, and {len(dataset.columns)} columns') 

In [None]:
#Finding How many spam and ham there
print(f'ham = {len(dataset[dataset["label"]=="ham"])}')
print(f'spam = {len(dataset[dataset["label"]=="spam"])}')

In [None]:
#Missing Data
print(f"Number of Missing Labels = {dataset['label'].isnull().sum()}")
print(f"Number of Missing Messages = {dataset['message'].isnull().sum()}")

# 3. Data Pre Processing

In [None]:
pd.set_option('display.max_colwidth',100)
dataset.head()

# 1st Step : Removing Punctuation

In [None]:
st.punctuation #These are all Punctuations

In [None]:
#Method to Remove Punctuation from A Word
def remove_punctuation(word):
    word_nopunc = "".join([c for c in word if c not in st.punctuation])
    return word_nopunc

In [None]:
#Applying Pronunciation Cleaning method on Messages Column
dataset['cleaned_message'] = dataset['message'].apply(lambda x : remove_punctuation(x)) 
dataset.head()

# 2nd Step : Tokenization

In [None]:
#using Word_Tokenize() Method to Tokenize all the Messages
dataset['cleaned_message_tokens'] = dataset['cleaned_message'].apply(lambda x: word_tokenize(x))
dataset.head()

# 3rd Step : Removing Stop Words

In [None]:
#These are all Stopping words from NLTK Library
stop_words=nltk.corpus.stopwords.words('english')
stop_words[0:10]

In [None]:
#Function to remove Stopping Words
def remove_stopwords(array_of_tokens):
    final_array = [word for word in array_of_tokens if word not in stop_words]
    return final_array

In [None]:
#Applying Function on [cleaned_message_tokens]
dataset['message_tokens_with_no_stopwords'] = dataset['cleaned_message_tokens'].apply(lambda x : remove_stopwords(x))
dataset.head()

# 4th Step : Stemming

In [None]:
#Stemming Variable
ps = PorterStemmer()

In [None]:
#Function that Perform Stemming
def stemming(message_tokens_with_no_stopwords):
    stemmed_words= [ps.stem(word) for word in message_tokens_with_no_stopwords]
    return stemmed_words

In [None]:
#Applying Function on [message_tokens_with_no_stopwords]
dataset['stemmed_words'] = dataset['message_tokens_with_no_stopwords'].apply(lambda x : stemming(x))
dataset.head()

# 5th Step : Lemmatization

In [None]:
#Using WordNet Lemmatizer
wn = nltk.WordNetLemmatizer()

In [None]:
#Function that Perform Lemmatization
def lemmatization(message_tokens_with_no_stopwords):
    lemmatize_words= [wn.lemmatize(word) for word in message_tokens_with_no_stopwords]
    return lemmatize_words

In [None]:
#Applying Function on [message_tokens_with_no_stopwords]
dataset['lemmatize_words'] = dataset['message_tokens_with_no_stopwords'].apply(lambda x : lemmatization(x))
dataset.head()