In [48]:
import nltk                                           # Python library for NLP
#nltk.download('twitter_samples')
#nltk.download('stopwords')
from nltk.corpus import twitter_samples, stopwords    # sample Twitter dataset from NLTK and module for stopwords
from nltk.stem import PorterStemmer                   # module for stemming
from nltk.tokenize import TweetTokenizer              # module for tokenizing strings

import re                                             # library for regular expression operations
import string                                         # for string operations

In [49]:
# Read the tweet texts from NLTK's positive and negative sample files.
# The files are fetched in the same order as before: positive, then negative.
pos_tweets, neg_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

# The following cells walk the preprocessing steps on one example:
# the first positive tweet.
tweet = pos_tweets[0]

In [50]:
'''
Remove hyperlinks, Twitter marks and styles
'''
# ANSI colour codes: 92 = bright green for the raw tweet, 94 = bright blue
# for everything printed afterwards.
print('\033[92m' + tweet)
print('\033[94m')

# Strip the old-style retweet marker "RT" (plus the whitespace after it),
# but only when it opens the tweet.
tweet2 = re.sub(r'^RT\s+', '', tweet)

# Remove hyperlinks. Only the URL token itself is matched (up to the next
# whitespace). The previous pattern used a greedy `.*`, which also deleted
# every word that followed the link on the same line.
tweet2 = re.sub(r'https?://\S+', '', tweet2)

# Keep hashtag words but drop the '#' sign. This is a literal character
# removal, so no regular expression is needed.
tweet2 = tweet2.replace('#', '')

print(tweet2)

[92m#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
[94m
FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)


In [51]:
'''
Tokenize, remove handles and convert to lowercase
'''
# Blank line, the input tweet in green, then switch to blue for the results.
print('', '\033[92m' + tweet2, '\033[94m', sep='\n')

# Tweet-aware tokenizer configured for normalisation.
tokenizer = TweetTokenizer(
    strip_handles=True,    # drop @user mentions
    reduce_len=True,       # collapse long runs of a repeated character
    preserve_case=False,   # lowercase the text
)

# Split the cleaned tweet into a list of tokens.
tweet_tokens = tokenizer.tokenize(tweet2)

# Blank line, heading, then the token list, each on its own line.
print('', 'Tokenized string:', tweet_tokens, sep='\n')


[92mFollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)
[94m

Tokenized string:
['followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']


In [52]:
'''
Remove stopwords and Punctuations
'''
stopwords_english = stopwords.words('english')

# Show the incoming tokens in green, then switch to blue for the result.
print('\033[92m')
print(tweet_tokens)
print('\033[94m')

# Keep a token only when it is neither an English stopword nor punctuation.
# The punctuation check is a substring test against string.punctuation, so a
# multi-character emoticon such as ':)' is kept.
tweets_clean = [
    token
    for token in tweet_tokens
    if not (token in stopwords_english or token in string.punctuation)
]

print('removed stop words and punctuation:')
print(tweets_clean)


[92m
['followfriday', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
[94m
removed stop words and punctuation:
['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']


In [53]:
'''
Stemming (Porter Stemmer)
'''
# Show the filtered tokens in green, then switch to blue for the result.
print('\033[92m')
print(tweets_clean)
print('\033[94m')

# One stemmer instance is reused for every token.
stemmer = PorterStemmer()

# Reduce each remaining token to its Porter stem, preserving token order.
tweets_stem = [stemmer.stem(token) for token in tweets_clean]

print('stemmed words:')
print(tweets_stem)

[92m
['followfriday', 'top', 'engaged', 'members', 'community', 'week', ':)']
[94m
stemmed words:
['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']
