# Step 1: Data Collection

In [20]:
import numpy as np  
import pandas as pd
from sklearn.datasets import fetch_20newsgroups  

In [22]:
# Fetch the 20 Newsgroups corpus with subset='all', asking the loader to
# strip the 'headers', 'footers' and 'quotes' parts of every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)
documents = newsgroups.data
print("Total Documents Loaded:", len(documents))

Total Documents Loaded: 18846


In [23]:
# Category names, plus the numerical category label of each document
categories, labels = newsgroups.target_names, newsgroups.target

In [24]:
# Build a two-column frame: the category name looked up from each numeric
# label, next to the document text.
df = pd.DataFrame(
    {
        'Category': [categories[idx] for idx in labels],
        'Text': documents,
    }
)

In [25]:
# Write the frame to a UTF-8 encoded CSV file without the row index
df.to_csv(
    '20newsgroups_dataset.csv',
    encoding='utf-8',
    index=False,
)
print("Dataset saved as 20newsgroups_dataset.csv")

Dataset saved as 20newsgroups_dataset.csv


In [26]:
# Display the assembled frame (Category and Text columns) as the cell output
df

Unnamed: 0,Category,Text
0,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...
1,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...
2,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...
3,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...
...,...,...
18841,sci.med,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...
18842,sci.electronics,\nNot in isolated ground recepticles (usually ...
18843,comp.sys.ibm.pc.hardware,I just installed a DX2-66 CPU in a clone mothe...
18844,comp.graphics,\nWouldn't this require a hyper-sphere. In 3-...


# Step 2: Data Preparation & Cleaning

In [37]:
# Print the frame's structure: index range, column dtypes, non-null counts, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18846 entries, 0 to 18845
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  18846 non-null  object
 1   Text      18846 non-null  object
dtypes: object(2)
memory usage: 294.6+ KB


In [45]:
# Per-column summary (count, unique, top, freq) for the two object columns
df.describe()

Unnamed: 0,Category,Text
count,18846,18846.0
unique,20,18287.0
top,rec.sport.hockey,
freq,999,380.0


In [43]:
# Count missing values in each column
df.isna().sum()

Category    0
Text        0
dtype: int64

In [55]:
# Drop rows where the 'Text' column is NaN (modifies df in place).
# NOTE(review): the preceding isna() check reports 0 missing values, so this
# appears to be a no-op here. The describe() output suggests the most frequent
# Text value is blank (freq 380); dropna() does not remove empty strings --
# confirm whether those rows should be filtered as well.
df.dropna(subset=['Text'], inplace=True)

In [57]:
# Re-count missing values per column after the drop
df.isna().sum()

Category    0
Text        0
dtype: int64

In [59]:
# Display the frame again after the dropna step
df

Unnamed: 0,Category,Text
0,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...
1,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...
2,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...
3,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...
4,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...
...,...,...
18841,sci.med,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...
18842,sci.electronics,\nNot in isolated ground recepticles (usually ...
18843,comp.sys.ibm.pc.hardware,I just installed a DX2-66 CPU in a clone mothe...
18844,comp.graphics,\nWouldn't this require a hyper-sphere. In 3-...


In [102]:
# Import required libraries
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [67]:
# Define text cleaning function
# Lazily-filled cache so the stopword set and lemmatizer are built only once,
# no matter how many documents clean_text is applied to.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return ``(stop_words, lemmatizer)``, creating them on first use."""
    if not _CLEAN_TEXT_CACHE:
        # Build both before storing either, so a failure (e.g. missing NLTK
        # data) leaves the cache empty and the next call simply retries.
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        _CLEAN_TEXT_CACHE.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['lemmatizer']


def clean_text(text):
    """Normalise one raw document into a space-separated string of lemmas.

    Steps, in order:
      1. remove ``<...>`` spans (HTML-style tags),
      2. replace every character that is not an ASCII letter with a space,
      3. lowercase,
      4. split on whitespace and drop English stopwords,
      5. lemmatize each remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (an empty string when
        no token survives).
    """
    # The previous version called stopwords.words('english') once per token
    # and tested membership against the returned list; a cached frozenset
    # gives the same filtering result with constant-time lookups.
    stop_words, lemmatizer = _clean_text_resources()

    text = re.sub(r'<.*?>', '', text)        # Remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text)   # Remove special characters/digits
    text = text.lower()                      # Convert to lowercase

    # Filter stopwords and lemmatize in a single pass over the tokens.
    words = [lemmatizer.lemmatize(w) for w in text.split() if w not in stop_words]
    return ' '.join(words)

In [71]:
# Run clean_text over every entry of the 'Text' column and store the result
# in a new 'clean_text' column.
df['clean_text'] = df['Text'].map(clean_text)

In [110]:
# Preview the first rows: cleaned text next to its category
df.head()[['clean_text', 'Category']]

Unnamed: 0,clean_text,Category
0,sure bashers pen fan pretty confused lack kind...,rec.sport.hockey
1,brother market high performance video card sup...,comp.sys.ibm.pc.hardware
2,finally said dream mediterranean new area grea...,talk.politics.mideast
3,think scsi card dma transfer disk scsi card dm...,comp.sys.ibm.pc.hardware
4,old jasmine drive cannot use new system unders...,comp.sys.mac.hardware


In [85]:
# Display the frame now that the clean_text column has been added
# (the raw Text column is still present at this point)
df

Unnamed: 0,Category,Text,clean_text
0,rec.sport.hockey,\n\nI am sure some bashers of Pens fans are pr...,sure bashers pen fan pretty confused lack kind...
1,comp.sys.ibm.pc.hardware,My brother is in the market for a high-perform...,brother market high performance video card sup...
2,talk.politics.mideast,\n\n\n\n\tFinally you said what you dream abou...,finally said dream mediterranean new area grea...
3,comp.sys.ibm.pc.hardware,\nThink!\n\nIt's the SCSI card doing the DMA t...,think scsi card dma transfer disk scsi card dm...
4,comp.sys.mac.hardware,1) I have an old Jasmine drive which I cann...,old jasmine drive cannot use new system unders...
...,...,...,...
18841,sci.med,DN> From: nyeda@cnsvax.uwec.edu (David Nye)\nD...,dn nyeda cnsvax uwec edu david nye dn neurolog...
18842,sci.electronics,\nNot in isolated ground recepticles (usually ...,isolated ground recepticles usually unusual co...
18843,comp.sys.ibm.pc.hardware,I just installed a DX2-66 CPU in a clone mothe...,installed dx cpu clone motherboard tried mount...
18844,comp.graphics,\nWouldn't this require a hyper-sphere. In 3-...,require hyper sphere space point specifies sph...


In [88]:
# Remove the raw 'Text' column in place, keeping Category and clean_text
df.drop(columns=['Text'], inplace=True)

In [90]:
# Display the frame after the raw Text column was dropped
df

Unnamed: 0,Category,clean_text
0,rec.sport.hockey,sure bashers pen fan pretty confused lack kind...
1,comp.sys.ibm.pc.hardware,brother market high performance video card sup...
2,talk.politics.mideast,finally said dream mediterranean new area grea...
3,comp.sys.ibm.pc.hardware,think scsi card dma transfer disk scsi card dm...
4,comp.sys.mac.hardware,old jasmine drive cannot use new system unders...
...,...,...
18841,sci.med,dn nyeda cnsvax uwec edu david nye dn neurolog...
18842,sci.electronics,isolated ground recepticles usually unusual co...
18843,comp.sys.ibm.pc.hardware,installed dx cpu clone motherboard tried mount...
18844,comp.graphics,require hyper sphere space point specifies sph...


# Step 3: Vectorization & EDA