# NLP (Natural Language Processing) with Python

### Imports

In [None]:
!pip install nltk

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

## Data

In [None]:
# First look at the data: read the file as plain lines, before any CSV parsing.
# The context manager closes the file handle, and the explicit encoding avoids
# depending on the platform's default text encoding.
with open('/home/jovyan/datafabric/tutorial/spam_utf8.csv', encoding='utf-8') as raw_file:  # please update the path.
    messages = [line.rstrip() for line in raw_file]
print(len(messages))

In [None]:
# Preview the first three raw lines; every entry is followed by two empty lines.
for message_no, message in enumerate(messages[:3]):
    print(message_no, message, end='\n\n\n')

In [None]:
# Parse the file as comma-separated values with explicit column names.
# NOTE(review): with names= given and no header= argument, pandas treats the
# first line of the file as data - confirm the file has no header row.
csv_path = '/home/jovyan/datafabric/tutorial/spam_utf8.csv'  # please update the path.
column_names = ["label", "message", "v3", "v4", "v5"]
messages = pd.read_csv(csv_path, sep=',', names=column_names)
messages.head()

## Exploratory Data Analysis

In [None]:
# Summary of the parsed frame as produced by pandas' describe().
messages.describe()

In [None]:
# The same summary, computed separately for each value of 'label'.
messages.groupby('label').describe()

In [None]:
# Add a 'length' column holding len() of each entry in 'message'.
messages['length'] = messages['message'].apply(len)
messages.head()

In [None]:
# Display the frame, which now includes the 'length' column.
messages

### Data Visualization

In [None]:
# Histogram of the 'length' column with 50 bins.
messages['length'].plot(bins=50, kind='hist');

In [None]:
# Summary statistics of the 'length' column.
messages.length.describe()

In [None]:
# One histogram of 'length' per value of 'label'.
messages.hist(column='length', by='label', bins=50,figsize=(12,4));

## Text Pre-processing

In [None]:
mess = 'Sample message! Notice: it has punctuation.'

# Drop every punctuation character and stitch the remaining characters
# back together into a single string.
nopunc = ''.join(char for char in mess if char not in string.punctuation)

In [None]:
# First ten entries of NLTK's English stop word list.
stopwords.words('english')[0:10] # Show some stop words

In [None]:
# Split the punctuation-free sample on whitespace into candidate words.
nopunc.split()

In [None]:
# Now just remove any stopwords
clean_mess = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [None]:
clean_mess

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded once and then cached."""
    return frozenset(stopwords.words('english'))


def text_process(mess):
    """
    Turn a message into a list of cleaned tokens.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in string.punctuation)
    2. Remove all stopwords (compared case-insensitively against NLTK's
       English stop word list)
    3. Returns a list of the cleaned text

    Args:
        mess: The text to clean.

    Returns:
        A list of the remaining whitespace-separated words, in their
        original order and original casing.
    """
    # Check characters to see if they include punctuation, and join the
    # surviving characters back into one string.
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Fetch the cached stop word set once per call. Evaluating
    # stopwords.words('english') in the comprehension condition would reload
    # the full list for every single word of every message.
    stop_words = _english_stopwords()

    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stop_words]

In [None]:
# Peek at the frame before tokenizing.
messages.head()

In [None]:
# Check to make sure it's working: apply text_process to the first five
# messages only. The result is displayed, not stored.
messages['message'].head(5).apply(text_process)

In [None]:
# Show original dataframe (the check above did not assign anything back to it)
messages.head()

## Vectorization

In [None]:
# Might take awhile...
# Create the bag-of-words vectorizer with text_process as its analyzer, then
# learn the vocabulary from all messages.
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(messages['message'])

# Print total number of vocab words
print(len(bow_transformer.vocabulary_))

In [None]:
# Use the message stored under index label 3 as a worked example.
message4 = messages['message'][3]
print(message4)

In [None]:
# Vectorize the single example; it is wrapped in a list because transform
# is called with a collection of documents.
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

In [None]:
# Look up the vocabulary terms behind two feature column indices.
feature_names = bow_transformer.get_feature_names_out()
for column in (4073, 9570):
    print(feature_names[column])

In [None]:
# Vectorize every message with the fitted bag-of-words transformer.
messages_bow = bow_transformer.transform(messages['message'])

In [None]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
# nnz is the number of stored entries in the sparse matrix.
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

In [None]:
# Percentage of matrix cells that hold a stored (non-zero) entry, i.e. the
# fill rate of the bag-of-words matrix.
sparsity = (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1]))
# Keep a few decimals: for a bag-of-words matrix this percentage is usually
# well below 1, so rounding to an integer would just print 0.
print('sparsity: {}'.format(round(sparsity, 4)))

In [None]:
# Learn the IDF weights from the corpus counts, then re-weight the single
# example vector and show it.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

In [None]:
# Print the learned IDF weight of two vocabulary terms. vocabulary_ maps a
# term to its column index, which is then used to index idf_.
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['u']])
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['university']])

In [None]:
# Convert the full count matrix to TF-IDF weights.
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

## Training a model

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF features of all
# messages, using the 'label' column as the target.
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

In [None]:
# Classify the single worked example and print it next to the label stored
# under index label 3.
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', messages.label[3])

## Model Evaluation

In [None]:
# Predict on the same TF-IDF matrix the model was fitted on.
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

In [None]:
# NOTE: these scores are measured on the training data itself, so they are
# not an estimate of performance on unseen messages.
print (classification_report(messages['label'], all_predictions))

## Train Test Split

In [None]:
# Hold out 20% of the messages (and their labels) for testing. No
# random_state is passed, so the split differs from run to run.
msg_train, msg_test, label_train, label_test = train_test_split(
    messages['message'],
    messages['label'],
    test_size=0.2,
)

print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

## Creating a Data Pipeline

In [None]:
# Chain tokenizing/counting, TF-IDF weighting and the classifier into a
# single estimator that works directly on raw message strings.
pipeline_steps = [
    ('bow', CountVectorizer(analyzer=text_process)),  # raw text -> token counts
    ('tfidf', TfidfTransformer()),  # token counts -> TF-IDF weights
    ('classifier', MultinomialNB()),  # Naive Bayes trained on the TF-IDF vectors
]
pipeline = Pipeline(pipeline_steps)

In [None]:
# Fit every pipeline step on the raw training messages and their labels.
pipeline.fit(msg_train,label_train)

In [None]:
# Predict labels for the held-out raw messages.
predictions = pipeline.predict(msg_test)

In [None]:
# classification_report expects (y_true, y_pred). Passing the predictions
# first swaps precision and recall and reports support counts of the
# predictions instead of the true labels.
print(classification_report(label_test, predictions))