## Text Classification

### Program 1: Basic Text Classification using scikit-learn

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Hand-labelled sentiment corpus: each sentence is paired with its class so the
# two parallel lists below cannot drift out of alignment.
labelled_corpus = [
    ("I love this movie", "positive"),
    ("This film was excellent", "positive"),
    ("I hate this movie", "negative"),
    ("This film was terrible", "negative"),
    ("Amazing acting and story", "positive"),
    ("Worst movie ever", "negative"),
]
texts = [text for text, sentiment in labelled_corpus]
labels = [sentiment for text, sentiment in labelled_corpus]

# Two-step pipeline: CountVectorizer output is fed to a multinomial Naive Bayes
# classifier, so raw strings can be passed straight to fit/predict.
model = Pipeline(steps=[
    ("vectorizer", CountVectorizer()),
    ("classifier", MultinomialNB()),
])
model.fit(texts, labels)

# Sentences that are not part of the training corpus.
text_sentences = ["The movie was amazing", "I hated the film"]
predictions = model.predict(text_sentences)

# One output line per sentence, paired with its predicted label.
for sentence, label in zip(text_sentences, predictions):
    print(f"Text: {sentence} => Predicted class: {label}")

Text: The movie was amazing => Predicted class: positive
Text: I hated the film => Predicted class: negative


### Program 2: Probabilistic Text Classification using scikit-learn

In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# Only these three newsgroups are requested from the 20 Newsgroups dataset.
categories = ["rec.sport.baseball", "sci.space", "talk.politics.misc"]

# Load the dataset's "train" and "test" subsets, in that order.
# NOTE(review): posts are loaded with headers, footers and quotes intact; the
# scikit-learn docs warn this tends to inflate accuracy. Consider passing
# remove=("headers", "footers", "quotes") for a more realistic evaluation.
train_data, test_data = (
    fetch_20newsgroups(subset=split, categories=categories)
    for split in ("train", "test")
)

In [None]:
# NOTE: `model` and `predictions` rebind the names used in Program 1, so after
# this cell runs they refer to the newsgroup classifier and its test predictions.

# TF-IDF vectorizer (stop_words="english") feeding multinomial Naive Bayes.
model = Pipeline(steps=[
    ("vectorizer", TfidfVectorizer(stop_words="english")),
    ("classifier", MultinomialNB()),
])
model.fit(train_data.data, train_data.target)

# Evaluate on the test subset.
predictions = model.predict(test_data.data)
accuracy = accuracy_score(test_data.target, predictions)

print("Accuracy:", accuracy)
print("Classification Report:\n")
print(
    classification_report(
        test_data.target,
        predictions,
        target_names=train_data.target_names,
    )
)

# Classify two hand-written sentences. Each prediction is used below as an
# index into train_data.target_names to recover the category name.
sample_texts = [
    "The spacecraft was launched into orbit",
    "The baseball team won the championship",
]
sample_predictions = model.predict(sample_texts)

print("Custom predictions:")
for text, label in zip(sample_texts, sample_predictions):
    print(f"Text: {text}")
    print(f"Predicted category: {train_data.target_names[label]}\n")

Accuracy: 0.9564032697547684
Classification Report:

                    precision    recall  f1-score   support

rec.sport.baseball       0.97      0.99      0.98       397
         sci.space       0.93      0.98      0.96       394
talk.politics.misc       0.98      0.88      0.93       310

          accuracy                           0.96      1101
         macro avg       0.96      0.95      0.95      1101
      weighted avg       0.96      0.96      0.96      1101

Custom predictions:
Text: The spacecraft was launched into orbit
Predicted category: sci.space

Text: The baseball team won the championship
Predicted category: rec.sport.baseball

