# S05 - Text Classification: Logistic Regression & Naive Bayes
## Exercises

### Exercise 1 (Easy)
Convert texts to Bag-of-Words representation using sklearn.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

texts = ["I love this movie", "This movie is terrible", "Great film!", "Waste of time"]

# Create BoW representation
# fit_transform() learns the vocabulary from `texts` and builds the
# document-term count matrix in a single step.
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform(texts)
# Get the feature names: these are the unique words found in the texts.
# With CountVectorizer's default settings, words are lowercased and
# one-letter tokens are ignored, which is why "I" is not among the features.
feature_names = vectorizer.get_feature_names_out()

# Convert the BoW matrix to a dense array and print it.
# In the BoW matrix, each row corresponds to a document and each column
# corresponds to a unique word (same order as `feature_names`).
# The values in the matrix are the counts of each word in the respective document.
bow_array = bow_matrix.toarray()
print("Feature Names:", feature_names)
print("BoW Matrix:\n", bow_array)



Feature Names: ['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
BoW Matrix:
 [[0 0 0 1 1 0 0 1 0 0]
 [0 0 1 0 1 0 1 1 0 0]
 [1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 1 1]]


### Exercise 2 (Easy)
Convert the same texts to TF-IDF representation.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF representation
# Same steps as the Bag-of-Words cell, but using TfidfVectorizer.
# `texts` is the list defined in the Exercise 1 cell, so run that cell first.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(texts)
# Get the feature names: these are the unique words found in the texts.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a dense array and print it.
# In the TF-IDF matrix, each row corresponds to a document and each column
# corresponds to a unique word (same order as `tfidf_feature_names`).
# The values are the TF-IDF scores of each word in the respective document.
# With the default settings each row is L2-normalised, so e.g. a document
# with two equally weighted words scores 0.7071 for each of them.
tfidf_array = tfidf_matrix.toarray()
print("TF-IDF Feature Names:", tfidf_feature_names)
print("TF-IDF Matrix:\n", tfidf_array)



TF-IDF Feature Names: ['film' 'great' 'is' 'love' 'movie' 'of' 'terrible' 'this' 'time' 'waste']
TF-IDF Matrix:
 [[0.         0.         0.         0.66767854 0.52640543 0.
  0.         0.52640543 0.         0.        ]
 [0.         0.         0.55528266 0.         0.43779123 0.
  0.55528266 0.43779123 0.         0.        ]
 [0.70710678 0.70710678 0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.57735027
  0.         0.         0.57735027 0.57735027]]


### Exercise 3 (Medium)
Train a Naive Bayes classifier for sentiment analysis.

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

texts = ["I love this movie", "Great film", "Excellent acting", "Best movie ever",
         "Terrible movie", "Waste of time", "Awful acting", "Worst film"]
labels = [1, 1, 1, 1, 0, 0, 0, 0]  # 1=positive, 0=negative

# Train Naive Bayes classifier
# Hold out 25% of the documents for evaluation. `stratify=labels` keeps one
# positive and one negative example in the test set, and `random_state`
# makes the split reproducible.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels, test_size=0.25, random_state=42, stratify=labels
)

# Learn the vocabulary from the training texts only, so that no information
# from the test set leaks into the features. Test words that were never seen
# during training are simply ignored by transform().
# (CountVectorizer is imported in the Exercise 1 cell.)
nb_vectorizer = CountVectorizer()
X_train = nb_vectorizer.fit_transform(train_texts)
X_test = nb_vectorizer.transform(test_texts)

# MultinomialNB models word counts per class, which matches the BoW features.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, train_labels)

# Evaluate on the held-out documents.
nb_predictions = nb_classifier.predict(X_test)
nb_accuracy = nb_classifier.score(X_test, test_labels)
print("Test texts:", test_texts)
print("True labels:", test_labels)
print("Predicted labels:", list(nb_predictions))
print(f"Naive Bayes accuracy: {nb_accuracy:.2f}")


### Exercise 4 (Medium)
Train a Logistic Regression classifier and compare with Naive Bayes.

In [None]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression and compare accuracy with Naive Bayes
# This cell is self-contained apart from `texts` / `labels` (defined in the
# Exercise 3 cell) and the sklearn imports of the earlier cells.
# Both models are trained and evaluated on exactly the same split and the
# same features, so their accuracies are directly comparable.
cmp_train_texts, cmp_test_texts, cmp_train_labels, cmp_test_labels = train_test_split(
    texts, labels, test_size=0.25, random_state=42, stratify=labels
)

# Fit the vocabulary on the training texts only to avoid test-set leakage.
cmp_vectorizer = CountVectorizer()
X_cmp_train = cmp_vectorizer.fit_transform(cmp_train_texts)
X_cmp_test = cmp_vectorizer.transform(cmp_test_texts)

models = {
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(),
}

accuracies = {}
for model_name, model in models.items():
    model.fit(X_cmp_train, cmp_train_labels)
    accuracies[model_name] = model.score(X_cmp_test, cmp_test_labels)
    print(f"{model_name} accuracy: {accuracies[model_name]:.2f}")

# NOTE: with only two held-out documents the accuracies can only be
# 0.0, 0.5 or 1.0, so this comparison is illustrative rather than conclusive.


### Exercise 5 (Hard - Research)
Implement Naive Bayes from scratch (without sklearn) for text classification.

*Hint: Use log probabilities to avoid underflow. Research: P(c|d) ∝ P(c) × Π P(w|c)*

In [None]:
import math
from collections import defaultdict, Counter

class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier written without sklearn.

    The model scores a document ``d`` for each class ``c`` with
    ``log P(c) + sum(log P(w|c) for w in d)`` and predicts the class with the
    highest score. Working in log space avoids floating-point underflow when
    many small probabilities are multiplied.

    Word likelihoods use add-``alpha`` (Laplace) smoothing so that a word seen
    in one class but not in another never gets probability zero.

    Attributes:
        alpha: Smoothing constant added to every word count.
        class_probs: Maps each label to its prior probability P(c).
        word_probs: Maps each label to a dict ``{word: P(word|c)}`` covering
            the whole training vocabulary.
        vocabulary: Set of all words seen during ``fit``.
    """

    def __init__(self, alpha=1.0):
        """Create an unfitted classifier.

        Args:
            alpha: Additive smoothing constant; must be positive.

        Raises:
            ValueError: If ``alpha`` is not positive.
        """
        if alpha <= 0:
            raise ValueError("alpha must be positive")
        self.alpha = alpha
        self.class_probs = {}
        self.word_probs = defaultdict(dict)
        self.vocabulary = set()

    @staticmethod
    def _tokenize(text):
        """Lowercase ``text`` and split it into alphanumeric word tokens."""
        # Replace every non-alphanumeric character by a space so punctuation
        # ("film!") does not create tokens distinct from the bare word.
        cleaned = "".join(ch if ch.isalnum() else " " for ch in text.lower())
        return cleaned.split()

    def fit(self, texts, labels):
        """Estimate class priors and smoothed word likelihoods.

        Any state from a previous call to ``fit`` is discarded.

        Args:
            texts: Iterable of document strings.
            labels: Iterable of hashable class labels, one per document.

        Returns:
            The fitted classifier (``self``), to allow call chaining.

        Raises:
            ValueError: If ``texts`` is empty or its length differs from
                that of ``labels``.
        """
        texts = list(texts)
        labels = list(labels)
        if len(texts) != len(labels):
            raise ValueError("texts and labels must have the same length")
        if not texts:
            raise ValueError("cannot fit on an empty training set")

        # Per-class word frequencies; a class whose documents contain no
        # tokens still gets an (empty) Counter.
        word_counts = defaultdict(Counter)
        for text, label in zip(texts, labels):
            word_counts[label].update(self._tokenize(text))

        # Counter keeps labels in order of first appearance, which makes the
        # tie-breaking in predict() deterministic.
        doc_counts = Counter(labels)
        vocabulary = set().union(*word_counts.values())
        vocab_size = len(vocabulary)

        self.class_probs = {}
        self.word_probs = defaultdict(dict)
        self.vocabulary = vocabulary

        for label, n_docs in doc_counts.items():
            self.class_probs[label] = n_docs / len(labels)
            counts = word_counts[label]
            # Laplace-smoothed denominator: total tokens in the class plus
            # alpha for every vocabulary entry.
            denominator = sum(counts.values()) + self.alpha * vocab_size
            self.word_probs[label] = {
                word: (counts[word] + self.alpha) / denominator
                for word in vocabulary
            }
        return self

    def predict(self, text):
        """Return the most probable label for ``text``.

        Words that never occurred in the training data are ignored. When
        several classes obtain the same score, the one that appeared first
        in the training labels is returned.

        Args:
            text: Document string to classify.

        Returns:
            The predicted class label.

        Raises:
            RuntimeError: If the classifier has not been fitted.
        """
        if not self.class_probs:
            raise RuntimeError("fit() must be called before predict()")

        tokens = [tok for tok in self._tokenize(text) if tok in self.vocabulary]

        best_label = None
        best_score = -math.inf
        for label, prior in self.class_probs.items():
            likelihoods = self.word_probs[label]
            # Sum of logs instead of product of probabilities (no underflow).
            score = math.log(prior) + sum(math.log(likelihoods[tok]) for tok in tokens)
            if score > best_score:
                best_label, best_score = label, score
        return best_label
