# 3 NLP packages
* NLTK
* spaCy
* gensim

In [2]:
import re                                  # library for regular expression operations
import string                              # for string operations

import nltk                                # Python library for NLP
from nltk.corpus import twitter_samples    # sample Twitter dataset from NLTK
from nltk.corpus import stopwords          # module for stop words that come with NLTK
from nltk.stem import PorterStemmer        # module for stemming
from nltk.tokenize import TweetTokenizer   # module for tokenizing strings

import numpy as np 
import random                              # pseudo-random number generator
import matplotlib.pyplot as plt            # library for visualization

In [13]:
# print the installed NLTK version so the environment used for this notebook is on record
print(nltk.__version__)

3.4.5


In [3]:
 # twitter data: fetch the 'twitter_samples' corpus that twitter_samples.strings() reads below
 # NOTE(review): both lines of this cell carry a stray leading space; the cell ran in the
 # notebook, but presumably would fail if exported to a plain script -- consider dedenting
 nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /home/nejada/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [4]:
# download the stopwords from NLTK
# (needed by process_tweet, which calls stopwords.words('english'))
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/nejada/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
# load the raw tweet texts of both sentiment classes from the NLTK sample corpus
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

## Processing Tweets

In [6]:
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The tweet is stripped of stock tickers, a leading "RT" marker,
    hyperlinks and '#' signs, then tokenized with TweetTokenizer
    (preserve_case=False, strip_handles=True, reduce_len=True). Tokens that
    are English stop words or punctuation are dropped and the remaining
    tokens are stemmed with the Porter stemmer.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # a set gives constant-time membership tests in the token filter below
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only the URL itself (up to the next whitespace)
    # so that words and emoticons following a link on the same line are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so `word not in string.punctuation` is a
    # substring test: single punctuation marks are dropped while emoticons
    # such as ':D' are kept.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]

    return tweets_clean

In [7]:
# pick one positive tweet at random and show it before and after processing
# NOTE(review): no random seed is set in the visible cells, so a different tweet
# is presumably shown on every run
tweet=random.choice(all_positive_tweets)
print(tweet)
# last expression of the cell: the processed token list is displayed as the cell output
process_tweet(tweet)

I haven't seen that many 'menacing' since I finished JoJo :D


['seen', 'mani', 'menac', 'sinc', 'finish', 'jojo', ':D']

***
# Stemming Overview:

Stemming is a word-normalization technique in Natural Language Processing. It reduces the words in a sentence to a common base form (the *stem*), so that lookups have to deal with fewer distinct terms. Words that share a meaning but vary in form depending on the context or sentence are thereby mapped to the same normalized form.

## Stemming Algorithms
* Rule based (Truncating)
    1. Lovins
    2. <u>Porter</u>
    3. Paice/Husk
    4. Dawson
    5. <u>Snowball</u>
    6. Lancaster
* Statistical
    1. N-Gram
    2. HMM
    3. YASS
* Mixed
    1. Krovetz
    2. Xerox
    3. Corpus based
    4. Context sensitive 

## Two basic stemming types

* <i>Dictionary-based</i>: uses lists of related words
* <i>Algorithmic</i>: uses program to determine related words

* Algorithmic stemmers:

    * suffix-s: remove ‘s’ endings assuming plural
         * e.g., cats → cat, lakes → lake, wiis → wii
         * Many false positives: supplies → supplie, ups → up
         * Some false negatives: mice → mice (should be mouse)

## ```NLTK``` Stemming packages:
The following packages are available in ```nltk```:
* ```arlstem```: light Arabic stemmer
* ```cistem```: Stemmer for German
* ```isri```: Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary
* ```lancaster```: A word stemmer based on the Lancaster stemming algorithm
* ```porter```: Porter stemming algorithm
* ```regexp```: A stemmer that uses regular expressions to identify morphological affixes. Any substrings that match the regular expressions will be removed
* ```rslp```: A stemmer for Portuguese
* ```snowball```: This module provides a port of the Snowball stemmers developed by Martin Porter.
* ```wordnet```: <b>Lemmatize</b> using WordNet’s built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet.

## ```GenSim``` Stemming packages:
* ```porter```: Porter stemming 


## ```spaCy``` Stemming packages:
spaCy does not ship a stemming library: its developers prefer lemmatization over stemming. NLTK, by contrast, provides both stemmers and lemmatizers.

## Porter Stemmer Algorithm
* Algorithmic stemmer used in the information retrieval (IR) experiments since the 70s
* Consists of a series of rules designed to strip off the longest possible suffix at each step
* Produces stems not words
* Makes a number of errors and is difficult to modify

In [11]:
# sample words sharing the root "comput"; the stemmer demo cells below iterate over them
tokens = ['compute', 'computer', 'computed', 'computing']

In [12]:
from nltk.stem import PorterStemmer

# stem every sample token with the Porter stemmer and print "original --> stem"
stemmer = PorterStemmer()
for token in tokens:
    print(token, '-->', stemmer.stem(token))

compute --> comput
computer --> comput
computed --> comput
computing --> comput


## Snowball Stemmer Algorithm
Snowball is a small string processing language for creating stemming algorithms for use in Information Retrieval, plus a collection of stemming algorithms implemented using it.
It was originally designed and built by Martin Porter. Its English stemmer is also known as the Porter2 stemming algorithm, because it is an improved version of the Porter stemmer that fixes some of that stemmer's issues.

In [19]:
from nltk.stem.snowball import SnowballStemmer

# stem the same sample tokens with the English Snowball stemmer and print "original --> stem"
snow = SnowballStemmer(language='english')
for token in tokens:
    print(token, '-->', snow.stem(token))

compute --> comput
computer --> comput
computed --> comput
computing --> comput


***

## Building Frequencies 

In [8]:
def build_freqs(tweets, ys):
    """Count how often each processed word occurs under each sentiment label.

    Every tweet is run through process_tweet and each resulting word is
    counted under the label of the tweet it came from.

    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1); a flat array or a plain list works as well
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels into a plain 1-D list so they can be zipped with the
    # tweets. np.ravel always yields a 1-D array, so even a single label
    # (e.g. an array of shape (1, 1)) becomes a one-element list; np.squeeze
    # would collapse it to a bare scalar that zip cannot iterate over.
    yslist = np.ravel(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            # unseen pairs start from 0
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs


In [9]:
# all tweets in one list: the positive tweets first, followed by the negative ones
tweets = all_positive_tweets + all_negative_tweets
# matching label vector in the same order: 1 for every positive tweet, 0 for every negative one
labels = np.concatenate((
    np.ones(len(all_positive_tweets)),
    np.zeros(len(all_negative_tweets)),
))

In [10]:
# create frequency dictionary
# build_freqs runs process_tweet on every tweet and counts each (word, label) pair
freqs = build_freqs(tweets, labels)

In [11]:
# how often the stem 'twice' was counted under label 0.0 (the negative tweets)
# note: direct indexing raises KeyError for a (word, label) pair that was never counted
freqs[('twice', 0.0)]

5

### Train/Test Split 

In [12]:
# split each class into two pieces: the first 4000 tweets for training and
# everything after them for testing (validation set)
train_pos, test_pos = all_positive_tweets[:4000], all_positive_tweets[4000:]
train_neg, test_neg = all_negative_tweets[:4000], all_negative_tweets[4000:]

# positives first, then negatives -- the label vectors below use the same order
train_x = train_pos + train_neg
test_x = test_pos + test_neg

# size the label vectors from the actual splits, to
# avoid assumptions about the length of all_positive_tweets
train_y = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
test_y = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))