In [17]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load dataset with selected categories
categories = ['rec.sport.baseball', 'rec.sport.hockey', 'sci.med', 'sci.space']  # Fewer categories for simplicity
newsgroups_data = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))

# Create a DataFrame for easier handling
data = pd.DataFrame({'text': newsgroups_data.data, 'category': newsgroups_data.target})
data['category_name'] = data['category'].apply(lambda x: newsgroups_data.target_names[x])

print('Show a preview')
display(data.head())

Show a preview


Unnamed: 0,text,category,category_name
0,\nA freeze dried Tootsie Roll (tm). The actua...,3,sci.space
1,\n\n\n Hmmm...what about walks and SB? Baerga ...,0,rec.sport.baseball
2,: >\n: >ATLANTIC DIVISION\n: >\t\n: >\tST JOHN...,1,rec.sport.hockey
3,\n\n INTENSIVE JAPANESE AT THE UNIVERSITY O...,2,sci.med
4,\n\n\n\n\nWhen I was at the Texas Star Party a...,3,sci.space


In [None]:
# Download necessary resources for preprocessing
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/arifmoazy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/arifmoazy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/arifmoazy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
from nltk.corpus import stopwords
print(stopwords.words('english')[:10])  # Test if stopwords are accessible

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [33]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Join tokens back to a single string
    return ' '.join(tokens)

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Show a preview of the cleaned text
display(data[['text', 'cleaned_text']].head())


Unnamed: 0,text,cleaned_text
0,\nA freeze dried Tootsie Roll (tm). The actua...,freeze dried tootsie roll tm actual taste sens...
1,\n\n\n Hmmm...what about walks and SB? Baerga ...,hmmmwhat walk sb baerga got clobbered alomar o...
2,: >\n: >ATLANTIC DIVISION\n: >\t\n: >\tST JOHN...,atlantic division st john maple leaf v moncton...
3,\n\n INTENSIVE JAPANESE AT THE UNIVERSITY O...,intensive japanese university pittsburgh summe...
4,\n\n\n\n\nWhen I was at the Texas Star Party a...,texas star party year ago sky dark venus indee...


In [34]:
# saving the cleaned data in a csv file

data[['cleaned_text', 'category_name']].to_csv('../dataset/cleaned_newsgroups_data.csv', index=False)