# S05 - Text Classification: Logistic Regression & Naive Bayes
## Exercises

### Exercise 1 (Easy)
Convert texts to Bag-of-Words representation using sklearn.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["I love this movie", "This movie is terrible", "Great film!", "Waste of time"]

# Create BoW representation
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(texts)
#  We get feature names, this are the unique words in our text
feature_names = vectorizer.get_feature_names_out()

# We convert BoW matrix to array and print it
# In the BoW matrix, each row corresponds to a document and each column corresponds to a unique word. 
# The values in the matrix represent the count of each word in the respective document.
bow_array = bow_matrix.toarray()
print("Feature Names:", feature_names)
print("BoW Matrix:\n", bow_array)



Feature Names: ['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
BoW Matrix:
 [[0 0 0 1 1 0 0 1 0 0]
 [0 0 1 0 1 0 1 1 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 1]]


### Exercise 2 (Easy)
Convert the same texts to TF-IDF representation.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF representation
# We have to do the same as before but with TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
# We get feature names, this are the unique words in our text
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# We convert TF-IDF matrix to array and print it
# In the TF-IDF matrix, each row corresponds to a document and each column corresponds to a unique word.
# The values in the matrix represent the TF-IDF score of each word in the respective document.
tfidf_array = tfidf_matrix.toarray()
print("TF-IDF Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_array)



TF-IDF Feature Names: ['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
TF-IDF Matrix:
 [[0.         0.         0.         0.66767854 0.52640543 0.
  0.         0.52640543 0.         0.        ]
 [0.         0.         0.55528266 0.         0.43779123 0.
  0.55528266 0.43779123 0.         0.        ]
 [0.70710678 0.70710678 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.57735027]]


### Exercise 3 (Medium)
Train a Naive Bayes classifier for sentiment analysis.

In [8]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

texts = ["I love this movie", "Great film", "Excellent acting", "Best movie ever",
         "Terrible movie", "Waste of time", "Awful acting", "Worst film"]
labels = [1, 1, 1, 1, 0, 0, 0, 0]  # 1=positive, 0=negative

# Train Naive Bayes classifier
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(texts)
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)
model = MultinomialNB()
model.fit(X_train, y_train)

# We predict the labels for the test set and print them
y_pred = model.predict(X_test)
print("Predicted labels:", y_pred)

# We print the classification report
print("Classification Report:\n", classification_report(y_test, y_pred))


Predicted labels: [0 0]
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Exercise 4 (Medium)
Train a Logistic Regression classifier and compare with Naive Bayes.

In [9]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression and compare accuracy with Naive Bayes
# We have to do the same as before but with LogisticRegression in this case
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
logistic_pred = logistic_model.predict(X_test)
print("Logistic Regression Predicted labels:", logistic_pred)
print("Logistic Regression Classification Report:\n", classification_report(y_test, logistic_pred))


Logistic Regression Predicted labels: [0 0]
Logistic Regression Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


### Exercise 5 (Hard - Research)
Implement Naive Bayes from scratch (without sklearn) for text classification.

*Hint: Use log probabilities to avoid underflow. Research: P(c|d) ∝ P(c) × Π P(w|c)*

In [11]:
# Implement Naive Bayes from scratch (without sklearn) for text classification.
# Hint: Use log probabilities to avoid underflow. Research: P(c|d) ∝ P(c) × Π P(w|c)*

import math
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = {}
        self.word_probs = defaultdict(dict)
    
    def fit(self, texts, labels):
        # We calculate class probabilities
        label_counts = Counter(labels)
        total_count = len(labels)
        self.class_probs = {label: count / total_count for label, count in label_counts.items()}
        
        # We calculate word probabilities for each class
        word_counts = defaultdict(Counter)
        for text, label in zip(texts, labels):
            words = text.split()
            word_counts[label].update(words)
        
        for label, counts in word_counts.items():
            total_words = sum(counts.values())
            self.word_probs[label] = {word: (count + 1) / (total_words + len(counts)) for word, count in counts.items()}

    
    def predict(self, text):
        words = text.split()
        class_scores = {}
        for label in self.class_probs:
            # We start with the log of the class probability
            class_scores[label] = math.log(self.class_probs[label])
            for word in words:
                # We add the log of the word probability, using Laplace smoothing
                class_scores[label] += math.log(self.word_probs[label].get(word, 1 / (sum(self.word_probs[label].values()) + len(self.word_probs[label]))))
        # We return the class with the highest score
        return max(class_scores, key=class_scores.get)

# We test our Naive Bayes implementation
nb_classifier = NaiveBayesClassifier()
nb_classifier.fit(texts, labels)
test_text = "I love this film"
predicted_label = nb_classifier.predict(test_text)
print(f"Predicted label for '{test_text}': {predicted_label}")

# We evaluate our Naive Bayes implementation on the test set
predicted_labels = [nb_classifier.predict(text) for text in texts]
print("Classification Report:\n", classification_report(labels, predicted_labels))


Predicted label for 'I love this film': 0
Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.75      0.67         4
           1       0.67      0.50      0.57         4

    accuracy                           0.62         8
   macro avg       0.63      0.62      0.62         8
weighted avg       0.63      0.62      0.62         8

