In [1]:
import numpy as np
import re
import nltk

from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer

In [2]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

test_pos = pos_tweets[4000:]
train_pos = pos_tweets[:4000]
test_neg = neg_tweets[4000:]
train_neg = neg_tweets[:4000]

x_train = train_pos + train_neg
x_test = test_pos + test_neg

y_train = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)))
y_test = np.append(np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1)))

In [3]:
def preprocess_tweet(tweet):
    
    # copy over the tweet input
    new_tweet = tweet
    # remove links and URLS
    new_tweet = re.sub(r'[(http(s)?):\/\/(www\.)?a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', '', tweet)
    # remove hashtags
    new_tweet = re.sub(r'#', '', new_tweet)
    # remove old style retweets
    new_tweet = re.sub(r'^RT[\s]+', '', new_tweet)
    
    stemmer = PorterStemmer()
    eng_stopwords = stopwords.words('english')
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    
    tweet_tokens = tokenizer.tokenize(new_tweet)
    
    tweets_clean = []
    for word in tweet_tokens:
        if word in eng_stopwords:
            continue
        stem = stemmer.stem(word)
        stem = stem.lower()
        
        tweets_clean.append(stem)
    
    return tweets_clean

In [4]:
def build_freqs(features, labels):
    
    freq = {}
    
    for (x, y) in zip(features, labels):
        x = preprocess_tweet(x)
        for word in x:
            pair = (word, y)

            if pair not in freq:
                freq[pair] = 1
            else:
                freq[pair] += 1

    return freq

In [5]:
def sigmoid(z):
    
    return 1 / (1 + np.exp(-z))

In [6]:
sigmoid(1)

0.7310585786300049

In [17]:
def gradient_descent(X, y, theta, alpha, num_iter):
    
    m = len(X)
    
    for i in range(num_iter):
        
        z = X.dot(theta)
        h = sigmoid(z)
        J = (-1/m) * ((y.T.dot(np.log(h))) + ((1-y).T.dot(np.log(1-h))))
        
        theta = theta - (alpha/m) * (X.T.dot(h-y))
    
    
    return J, theta

In [18]:
# Check the function
# Construct a synthetic test case using numpy PRNG functions
np.random.seed(1)
# X input is 10 x 3 with ones for the bias terms
tmp_X = np.append(np.ones((10, 1)), np.random.rand(10, 2) * 2000, axis=1)
# Y Labels are 10 x 1
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Apply gradient descent
tmp_J, tmp_theta = gradient_descent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
print(f"The cost after training is ", tmp_J)
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(tmp_theta)]}")

The cost after training is  [[0.6709497]]
The resulting vector of weights is [4.1e-07, 0.00035658, 7.309e-05]


In [19]:
def extract_features(tweet, freqs):
    
    words = preprocess_tweet(tweet)
    
    x = np.zeros((1, 3))
    x[0,0] = 1 # bias
    
    for word in words:
        
        if (word, 1) in freqs:
            x[0,1] += freqs[(word, 1)]
        elif (word, 0) in freqs:
            x[0,2] += freqs[(word, 0)]
            
    return x

In [20]:
# Check your function

# test 1
# test on training data
freqs = build_freqs(x_train, y_train)
tmp1 = extract_features(x_train[0], freqs)
print(tmp1)

[[1.000e+00 3.131e+03 6.100e+01]]


In [22]:
# collect the features 'x' and stack them into a matrix 'X'
X = np.zeros((len(x_train), 3))
for i in range(len(x_train)):
    X[i, :]= extract_features(x_train[i], freqs)

# training labels corresponding to X
Y = y_train

# Apply gradient descent
J, theta = gradient_descent(X, Y, np.zeros((3, 1)), 1e-9, 1)
print(J)
print(theta)

[0.69314718]
[[ 5.00000000e-10  5.00000000e-10  5.00000000e-10 ... -5.00000000e-10
  -5.00000000e-10 -5.00000000e-10]
 [ 1.17391831e-06  1.17391831e-06  1.17391831e-06 ... -1.17391831e-06
  -1.17391831e-06 -1.17391831e-06]
 [ 1.33062950e-06  1.33062950e-06  1.33062950e-06 ... -1.33062950e-06
  -1.33062950e-06 -1.33062950e-06]]
