In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load dataset
categories = ['sci.space', 'rec.sport.hockey', 'comp.graphics', 'talk.politics.misc']
newsgroups = fetch_20newsgroups(subset='all', categories=categories, remove=('headers', 'footers', 'quotes'))

# Convert to DataFrame
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})

# Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)

# Feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\USER/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\USER/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.8755020080321285
Classification Report:
                    precision    recall  f1-score   support

     comp.graphics       0.92      0.88      0.90       209
  rec.sport.hockey       0.93      0.91      0.92       171
         sci.space       0.80      0.88      0.84       202
talk.politics.misc       0.86      0.83      0.85       165

          accuracy                           0.88       747
         macro avg       0.88      0.87      0.88       747
      weighted avg       0.88      0.88      0.88       747



In [3]:
# Load dataset
# categories = ['sci.space', 'rec.sport.hockey', 'comp.graphics', 'talk.politics.misc']
newsgroups = fetch_20newsgroups()

# Convert to DataFrame
df = pd.DataFrame({'text': newsgroups.data, 'label': newsgroups.target})
df.head()


Unnamed: 0,text,label
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14


In [5]:
# Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

df['cleaned_text'] = df['text'].apply(preprocess_text)


In [6]:
df['cleaned_text']

0        lerxstwamumdedu wheres thing subject car nntpp...
1        guykuocarsonuwashingtonedu guy kuo subject si ...
2        twillisececnpurdueedu thomas e willis subject ...
3        jgreenamber joe green subject weitek p organiz...
4        jcmheadcfaharvardedu jonathan mcdowell subject...
                               ...                        
11309    jimzisfeinfactorycom jim zisfein subject migra...
11310    ebodinpearltuftsedu subject screen death mac p...
11311    westesnetcomcom estes subject mounting cpu coo...
11312    stevehcrlgw steven collins subject sphere poin...
11313    gunningccocaltechedu kevin j gunning subject s...
Name: cleaned_text, Length: 11314, dtype: object

In [7]:
# Feature extraction
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['cleaned_text'])
y = df['label']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=newsgroups.target_names)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.8762704374723818
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.89      0.88      0.88        97
           comp.graphics       0.76      0.88      0.82       104
 comp.os.ms-windows.misc       0.78      0.78      0.78       115
comp.sys.ibm.pc.hardware       0.70      0.77      0.74       123
   comp.sys.mac.hardware       0.89      0.79      0.84       126
          comp.windows.x       0.83      0.85      0.84       106
            misc.forsale       0.72      0.80      0.76       109
               rec.autos       0.89      0.89      0.89       139
         rec.motorcycles       0.93      0.91      0.92       122
      rec.sport.baseball       0.94      0.98      0.96       102
        rec.sport.hockey       0.98      0.94      0.96       108
               sci.crypt       0.99      0.95      0.97       125
         sci.electronics       0.83      0.81      0.82       114
                 sci.me