In [2]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Fetch every post (train + test portions) from the two newsgroups we want to
# tell apart; the fixed seed keeps the shuffled order reproducible.
# NOTE(review): the raw posts still carry their mail headers (From:, Subject:,
# ...). fetch_20newsgroups has a `remove` option for stripping
# headers/footers/quotes -- consider whether the classifier should see them.
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=['rec.sport.baseball', 'sci.space'],
    shuffle=True,
    random_state=42,
)
data, target = newsgroups.data, newsgroups.target

# Two-column frame: the raw post text and its integer class label
# (an index into newsgroups.target_names).
df = pd.DataFrame({'text': data, 'label': target})
df.head()

Unnamed: 0,text,label
0,From: mss@netcom.com (Mark Singer)\nSubject: R...,0
1,From: cuz@chaos.cs.brandeis.edu (Cousin It)\nS...,0
2,From: J019800@LMSC5.IS.LMSC.LOCKHEED.COM\nSubj...,0
3,From: tedward@cs.cornell.edu (Edward [Ted] Fis...,0
4,From: snichols@adobe.com (Sherri Nichols)\nSub...,0


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: drop English stop words and ignore any term that occurs in
# more than 70% of the documents.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Learn the vocabulary / IDF weights and vectorize in one pass.
# Note: this fit sees every document in df, so any later train/test split of X
# shares vocabulary and IDF statistics across both sides.
X = vectorizer.fit_transform(df['text'])

# Integer class labels, aligned row-for-row with X.
y = df['label']

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Split the raw documents (rather than the pre-computed matrix X) so the
# held-out texts stay unseen until evaluation. test_size and random_state are
# the same values used previously, keeping the split reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.3, random_state=42
)

# Re-fit the TF-IDF vocabulary and IDF weights on the training texts only.
# Fitting on the full corpus lets test-set statistics (document frequencies,
# max_df filtering, vocabulary) leak into the features and inflates the score.
# NOTE: after this cell `vectorizer` no longer corresponds to the matrix `X`
# built from the full corpus, so do not pass `X` to `clf`.
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Initialize and train the classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

In [6]:
from sklearn.metrics import accuracy_score, classification_report

# Score the held-out split with the fitted classifier.
y_pred = clf.predict(X_test)

# Overall accuracy plus a per-class precision/recall/F1 table whose rows are
# labelled with the newsgroup names.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=newsgroups.target_names,
)

# One print call, one item per line.
print(f'Accuracy: {accuracy:.4f}', 'Classification Report:', report, sep='\n')

Accuracy: 0.9966
Classification Report:
                    precision    recall  f1-score   support

rec.sport.baseball       0.99      1.00      1.00       286
         sci.space       1.00      0.99      1.00       309

          accuracy                           1.00       595
         macro avg       1.00      1.00      1.00       595
      weighted avg       1.00      1.00      1.00       595



In [7]:
def predict_category(text):
    """
    Return the newsgroup name the trained classifier assigns to `text`.

    The text is vectorized with the notebook-level `vectorizer`, classified
    with the notebook-level `clf`, and the predicted label index is looked up
    in `newsgroups.target_names`.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([text])
    # Exactly one document went in, so exactly one label index comes back.
    (label_index,) = clf.predict(features)
    return newsgroups.target_names[label_index]

# Try the helper on a single sentence.
sample_text = "NASA announced the discovery of new exoplanets."
predicted_category = predict_category(text=sample_text)

In [8]:
# Show the newsgroup name predicted for sample_text.
print(predicted_category)

sci.space
