# Text Classification with 20 Newsgroups Dataset

We will use the 20 Newsgroups dataset, which contains approximately 20,000 newsgroup documents partitioned across 20 different newsgroups.
The dataset is available through the `fetch_20newsgroups` loader in the `sklearn.datasets` module of scikit-learn.

## Setup

In [None]:
!pip install pandas scikit-learn nltk matplotlib seaborn

In [None]:
# Import libraries
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Silence FutureWarning messages for the rest of the session
import warnings

warnings.filterwarnings(action="ignore", category=FutureWarning)

## Load the data

In [None]:
# Fetch the full corpus (subset='all'), shuffled with a fixed seed for reproducibility
newsgroups = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

In [None]:
# Download the NLTK stop-word corpus, then load the English stop-word list.
# Order matters: the list is read from the corpus fetched on the previous line.
# `stop_words` is reused later when the TF-IDF vectorizer is configured.
nltk.download('stopwords')
stop_words = stopwords.words('english')

## Text Preprocessing

In [None]:
# Text preprocessing function
_ENGLISH_STOP_WORDS = None  # populated lazily on the first preprocess_text() call


def preprocess_text(text):
    """Normalize raw text into a space-separated string of lowercase tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, the result is split on whitespace,
    NLTK English stop words are dropped, and the remaining tokens are
    re-joined with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned text. An empty string is returned when no token survives.
    """
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # Build the lookup set once and reuse it across calls instead of
        # rebuilding it for every document.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))

    # Lowercase
    text = text.lower()

    # Replace special characters and digits with a space (not the empty
    # string) so that words separated only by punctuation, e.g.
    # "hello,world", are not fused into one token.
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Tokenize on whitespace and remove stop words
    tokens = [word for word in text.split() if word not in _ENGLISH_STOP_WORDS]

    # Join tokens back into a single string
    return ' '.join(tokens)

In [None]:
# Put documents and labels in one DataFrame, then clean each document
newsgroups_data = pd.DataFrame(
    {
        'text': newsgroups.data,
        'target': newsgroups.target,
    }
)
newsgroups_data['text'] = newsgroups_data['text'].map(preprocess_text)

## Split Data into Train and Test Sets

In [None]:
# Hold out 20% of the documents for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data['text'],
    newsgroups_data['target'],
    test_size=0.2,
    random_state=42,
)

## Vectorize the text

In [None]:
# Fit the TF-IDF vectorizer (at most 5000 features) on the training text only,
# then encode the test text with that same fitted vectorizer
vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    max_features=5000,
)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

## Train the model

In [None]:
# Train the model: fit a logistic-regression classifier on the TF-IDF training matrix.
# max_iter=1000 caps the solver's iterations (presumably chosen to give it room
# to converge on this feature space; confirm if training time matters).
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)

## Evaluate the model

In [None]:
# Predict a class label for every document in the held-out test set;
# y_pred is compared against y_test in the evaluation cells below
y_pred = model.predict(X_test_vec)

In [None]:
# Report overall accuracy, followed by the per-newsgroup classification report
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

report = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups.target_names,
)
print(report)

In [None]:
# Draw the confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

## Test with some new text

In [None]:
# Function to predict new text
def predict_new_text(model, vectorizer, text, target_names=None):
    """Predict the category name for a single piece of raw text.

    The text goes through the same ``preprocess_text`` cleaning that was
    applied to the training data, is encoded with ``vectorizer``, and is
    classified by ``model``.

    Args:
        model: A fitted classifier exposing ``predict``.
        vectorizer: A fitted vectorizer exposing ``transform``.
        text: The raw text to classify.
        target_names: Optional sequence mapping a predicted class index to
            its display name. Defaults to ``newsgroups.target_names`` so
            existing three-argument calls behave exactly as before.

    Returns:
        The name of the predicted category.
    """
    if target_names is None:
        # Fall back to the label names of the dataset loaded in this notebook.
        target_names = newsgroups.target_names

    text_processed = preprocess_text(text)
    # transform() expects an iterable of documents, hence the one-element list.
    text_vectorized = vectorizer.transform([text_processed])
    prediction = model.predict(text_vectorized)
    return target_names[prediction[0]]

In [None]:
# Classify one unseen sentence and show the predicted newsgroup
test_text = "NASA discovers new exoplanet in the habitable zone."
predicted_category = predict_new_text(
    model=model,
    vectorizer=vectorizer,
    text=test_text,
)
print(f'Test Text: "{test_text}"\nPredicted Category: "{predicted_category}"')

In [None]:
# Classify several unseen sentences and print each prediction
test_texts = [
    "NASA discovers new exoplanet in the habitable zone.",
    "The car race yesterday was amazing.",
    "Python is a versatile programming language for data science.",
    "Pope Francis is the Pope and head of the Catholic Church.",
]

for text in test_texts:
    predicted_category = predict_new_text(
        model=model,
        vectorizer=vectorizer,
        text=text,
    )
    print(f'Test Text: "{text}"\nPredicted Category: "{predicted_category}"\n')