In [4]:
import re

corpus = [
    ("I am happy because I am learning NLP", 1),
    ("I am happy", 1),
    ('"I am sad, I am not learning NLP"', 0),
    ("I am sad", 0)
]

# Tokenization and extracting unique words
vocabulary = set()
for text, _ in corpus:  # Changed to unpack only the text, ignoring the label
    # Tokenize text using regular expression
    words = re.findall(r'\b\w+\b', text.lower())  # Access text and then apply lower()
    vocabulary.update(words)

# Convert set to list for easier indexing
vocabulary = list(vocabulary)

print("Vocabulary V:", vocabulary)


Vocabulary V: ['because', 'learning', 'nlp', 'happy', 'sad', 'i', 'not', 'am']


In [5]:
positive_corpus = []
negative_corpus = []

for text, label in corpus:
    if label == 1:
        positive_corpus.append(text)
    else:
        negative_corpus.append(text)

print("Positive Tweet Corpus:")
for tweet in positive_corpus:
    print(tweet)

print("\nNegative Tweet Corpus:")
for tweet in negative_corpus:
    print(tweet)

Positive Tweet Corpus:
I am happy because I am learning NLP
I am happy

Negative Tweet Corpus:
"I am sad, I am not learning NLP"
I am sad


In [6]:
# Vocabulary from previous step


positive_frequency = {word: 0 for word in vocabulary}
negative_frequency = {word: 0 for word in vocabulary}

# Counting positive word frequencies
for tweet in positive_corpus:
    words = re.findall(r'\b\w+\b', tweet.lower())
    for word in words:
        if word in positive_frequency:
            positive_frequency[word] += 1

# Counting negative word frequencies
for tweet in negative_corpus:
    words = re.findall(r'\b\w+\b', tweet.lower())
    for word in words:
        if word in negative_frequency:
            negative_frequency[word] += 1

print("Positive Frequency:")
print(positive_frequency)
print("\nNegative Frequency:")
print(negative_frequency)


Positive Frequency:
{'because': 1, 'learning': 1, 'nlp': 1, 'happy': 2, 'sad': 0, 'i': 3, 'not': 0, 'am': 3}

Negative Frequency:
{'because': 0, 'learning': 1, 'nlp': 1, 'happy': 0, 'sad': 2, 'i': 3, 'not': 1, 'am': 3}


In [7]:
def encode_tweet(tweet, positive_freq, negative_freq):
    words = re.findall(r'\b\w+\b', tweet.lower())
    positive_sum = sum(positive_freq.get(word, 0) for word in words)
    negative_sum = sum(negative_freq.get(word, 0) for word in words)
    return [1, positive_sum, negative_sum]

# Example usage:
tweet = "I am happy because I am learning NLP"
encoded_vector = encode_tweet(tweet, positive_frequency, negative_frequency)
print("Encoded Vector:", encoded_vector)


Encoded Vector: [1, 17, 14]


In [8]:
import numpy as np

def build_freqs(tweets, labels):
    freqs = {}
    for tweet, label in zip(tweets, labels):
        words = re.findall(r'\b\w+\b', tweet.lower())
        for word in words:
            pair = (word, label)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs

def process_tweet(tweet):
    return re.findall(r'\b\w+\b', tweet.lower())

def extract_features(tweet, freqs):
    positive_sum = 0
    negative_sum = 0
    for word in tweet:
        positive_sum += freqs.get((word, 1), 0)
        negative_sum += freqs.get((word, 0), 0)
    return [1, positive_sum, negative_sum]

# Example data
tweets = [
    "I am happy because I am learning NLP",
    "I am sad, I am not learning NLP",
    "I am happy"
]

labels = [1, 0, 1]

# Build frequencies dictionary
freqs = build_freqs(tweets, labels)

# Initialize matrix X
m = len(tweets)
X = np.zeros((m, 3))

# Extract features for each tweet
for i in range(m):
    p_tweet = process_tweet(tweets[i])
    X[i, :] = extract_features(p_tweet, freqs)

print("Matrix X:")
print(X)


Matrix X:
[[ 1. 17. 10.]
 [ 1. 14. 12.]
 [ 1.  8.  4.]]


In [9]:
import numpy as np

def sigmoid(z):
    """Sigmoid function."""
    return 1 / (1 + np.exp(-z))

def cost_function(theta, X, y):
    """Compute the cost function."""
    m = len(y)
    h = sigmoid(np.dot(X, theta))
    J = -1/m * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))
    return J

def gradient_descent(theta, X, y, alpha, num_iterations):
    """Perform gradient descent to optimize theta."""
    m = len(y)
    J_history = []

    for _ in range(num_iterations):
        h = sigmoid(np.dot(X, theta))
        gradient = np.dot(X.T, (h - y)) / m
        theta -= alpha * gradient
        J_history.append(cost_function(theta, X, y))

    return theta, J_history

# Example usage:
# Initialize theta with zeros
theta = np.zeros(X.shape[1])

# Set hyperparameters
alpha = 0.01
num_iterations = 1000

# Define labels
y = np.array(labels)

# Perform gradient descent
theta_optimized, J_history = gradient_descent(theta, X, y, alpha, num_iterations)

# Print optimized theta and cost history
print("Optimized theta:", theta_optimized)
print("Final cost:", J_history[-1])

# You can use the optimized theta to make predictions on new data.
# For example, to predict the label for a new tweet:
new_tweet = "I am happy because I am learning NLP"
encoded_vector = encode_tweet(new_tweet, positive_frequency, negative_frequency)
prediction = sigmoid(np.dot(encoded_vector, theta_optimized))
print("Prediction:", prediction)


Optimized theta: [ 0.20345546  1.13460441 -1.58399181]
Final cost: 0.043147541252030866
Prediction: 0.0639148397755089


In [11]:
# Example usage:
# Initialize theta with zeros
theta = np.zeros(X.shape[1])

# Set hyperparameters
alpha = 0.01
num_iterations = 1000

# Define labels
y = np.array(labels)

# Perform gradient descent
theta_optimized, J_history = gradient_descent(theta, X, y, alpha, num_iterations)

new_tweet = "I am happy because I am learning NLP"
encoded_vector = encode_tweet(new_tweet, positive_frequency, negative_frequency)
hypothesis = sigmoid(np.dot(encoded_vector, theta_optimized))
print("Prediction:", hypothesis)

# Print hypothesis, parameters, and features as arrays
print("Hypothesis (sigmoid(np.dot(X, theta_optimized))):", sigmoid(np.dot(X, theta_optimized)))
print("Parameters (theta_optimized):", theta_optimized)
print("Features (X):", X)


Prediction: 0.0639148397755089
Hypothesis (sigmoid(np.dot(X, theta_optimized))): [0.97471201 0.05117445 0.94999451]
Parameters (theta_optimized): [ 0.20345546  1.13460441 -1.58399181]
Features (X): [[ 1. 17. 10.]
 [ 1. 14. 12.]
 [ 1.  8.  4.]]


In [14]:
# Determine sentiment based on probability threshold
threshold = 0.5
if prediction >= threshold:
    print("positive sentiment (label 1)")
else:
    print("negative sentiment (label 0)")

negative sentiment (label 0)
