In [25]:
import functools
import re

import nltk
import numpy as np
from nltk.stem import PorterStemmer

In [67]:
# Fetch the 'twitter_samples' corpus into the local NLTK data directory;
# the call reports "already up-to-date" when the package is present.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Archel\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [68]:
# Fetch the 'stopwords' corpus, which preprocess_tweet reads via
# stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Archel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [44]:
from nltk.corpus import twitter_samples, stopwords
from nltk.tokenize import TweetTokenizer

In [8]:
# Load the raw tweet texts of both sentiment classes from the NLTK sample corpus.
pos_tweets, neg_tweets = (
    twitter_samples.strings(fileid)
    for fileid in ('positive_tweets.json', 'negative_tweets.json')
)

In [10]:
# Number of tweets taken from the front of each class for training; whatever
# remains in each class is held out for testing. Change it in one place only.
TRAIN_SIZE_PER_CLASS = 4000

train_pos = pos_tweets[:TRAIN_SIZE_PER_CLASS]
test_pos = pos_tweets[TRAIN_SIZE_PER_CLASS:]
train_neg = neg_tweets[:TRAIN_SIZE_PER_CLASS]
test_neg = neg_tweets[TRAIN_SIZE_PER_CLASS:]

# Positives come first, then negatives; any label vector must follow this order.
x_train = train_pos + train_neg
x_test = test_pos + test_neg

In [20]:
# 1-D float label vectors: 1.0 for each positive tweet followed by 0.0 for
# each negative tweet, in the same order as x_train / x_test.
y_train = np.concatenate((np.ones(len(train_pos)), np.zeros(len(train_neg))))
y_test = np.concatenate((np.ones(len(test_pos)), np.zeros(len(test_neg))))

In [21]:
# Sanity check: one label per training tweet and one per test tweet.
print(y_train.shape, y_test.shape, sep='\n')

(8000,)
(2000,)


In [48]:
@functools.lru_cache(maxsize=1)
def _tweet_nlp_tools():
    """Build the stemmer, stopword set and tokenizer once and reuse them.

    Constructing these per tweet means re-reading the stopword corpus for
    every call; caching keeps preprocessing of thousands of tweets cheap.

    Returns:
        tuple: ``(stemmer, eng_stopwords, tokenizer)`` where ``eng_stopwords``
        is a frozenset so membership tests are O(1) instead of a list scan.
    """
    stemmer = PorterStemmer()
    eng_stopwords = frozenset(stopwords.words('english'))
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    return stemmer, eng_stopwords, tokenizer


def preprocess_tweet(tweet):
    """Clean a raw tweet and return its list of stemmed tokens.

    Steps: strip URLs, drop '#' signs (the tag word itself is kept), drop a
    leading old-style ``RT`` marker, tokenize with ``TweetTokenizer``
    (lower-casing, handle stripping, length reduction), discard English
    stopwords and Porter-stem what is left. Punctuation and emoticons are
    kept as tokens.

    Args:
        tweet (str): Raw tweet text.

    Returns:
        list[str]: Lower-cased, stemmed tokens in their original order.
    """
    # Remove links. Only text that starts like a URL is matched, so ordinary
    # "word.word" or "ok...thanks" sequences are left for the tokenizer.
    new_tweet = re.sub(r'(?:https?://|www\.)\S+', '', tweet)
    # Remove only the hash sign; the hashtag's word stays as a feature.
    new_tweet = re.sub(r'#', '', new_tweet)
    # Remove the old-style retweet marker at the start of the tweet.
    new_tweet = re.sub(r'^RT[\s]+', '', new_tweet)

    stemmer, eng_stopwords, tokenizer = _tweet_nlp_tools()

    # Tokens arrive lower-cased from the tokenizer, so they can be compared
    # against the (lower-case) stopword set directly.
    return [
        stemmer.stem(word).lower()
        for word in tokenizer.tokenize(new_tweet)
        if word not in eng_stopwords
    ]

In [49]:
# Smoke-test the preprocessing on the first training tweet.
preprocess_tweet(x_train[0])

['followfriday', 'top', 'engag', 'member', 'commun', 'week', ':)']

In [61]:
def build_freqs(features, labels):
    """Count how often each preprocessed token occurs under each label.

    Args:
        features: Iterable of raw tweet strings.
        labels: Iterable of labels aligned with ``features``. A NumPy array
            of any shape (for example ``(m,)`` or ``(m, 1)``) is accepted.

    Returns:
        dict: Maps ``(token, label)`` to the number of occurrences of
        ``token`` across all tweets carrying ``label``.
    """
    # Flatten array labels to plain Python scalars. Iterating an (m, 1) array
    # yields 1-element arrays, which are unhashable and cannot be part of a
    # dict key; scalars also give the keys a stable, readable repr.
    if isinstance(labels, np.ndarray):
        labels = labels.ravel().tolist()

    freq = {}

    for tweet, label in zip(features, labels):
        for word in preprocess_tweet(tweet):
            pair = (word, label)
            freq[pair] = freq.get(pair, 0) + 1

    return freq

In [65]:
# Inspect the (token, label) counts produced from the first ten training tweets.
build_freqs(x_train[:10], y_train[:10])

{('followfriday', 1.0): 1,
 ('top', 1.0): 1,
 ('engag', 1.0): 1,
 ('member', 1.0): 1,
 ('commun', 1.0): 1,
 ('week', 1.0): 1,
 (':)', 1.0): 8,
 ('hey', 1.0): 1,
 ('jame', 1.0): 1,
 ('!', 1.0): 9,
 ('odd', 1.0): 1,
 (':/', 1.0): 1,
 ('pleas', 1.0): 1,
 ('call', 1.0): 2,
 ('contact', 1.0): 1,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 1,
 ('assist', 1.0): 1,
 ('mani', 1.0): 1,
 ('thank', 1.0): 1,
 ('listen', 1.0): 1,
 ('last', 1.0): 1,
 ('night', 1.0): 1,
 ('bleed', 1.0): 1,
 ('amaz', 1.0): 1,
 ('track', 1.0): 1,
 ('.', 1.0): 5,
 ('scotland', 1.0): 1,
 ('?', 1.0): 1,
 ('congrat', 1.0): 1,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 1,
 ('verifi', 1.0): 1,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 1,
 ('blue', 1.0): 1,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 1,
 ('profil', 1.0): 1,
 ('15', 1.0): 1,
 ('day', 1.0): 1,
 ('one', 1.0): 1,
 ('irresist', 1.0): 1,
 ('flipkartfashionfriday', 1.0): 1,
 ('like', 1.0): 1,
 ('keep', 1.0): 1,
 