### Part 1: Data Preprocessing
1.1 Load the dataset and perform initial exploration to understand its structure.

In [1]:
import pandas as pd

# Load the news-category dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='News_Category_Dataset_v3.csv')

# Preview the first five rows to check how the columns were parsed.
print(df.head(n=5))

   Unnamed: 0                                           headline   category  \
0           0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1           1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2           2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3           3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4           4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog you don't understand wha...         Elyse Wanshel   
3  "Accidentally put grown-up toothpaste on my to...      Caroline Bologna   
4  Amy Cooper accused investment firm Franklin Te...        Nina Golgowski   

         date  headline_length  short_description_length

1.2 Clean the text data, including removing special characters and stopwords, lowercasing, correcting spelling, standardizing numbers, handling contractions, and lemmatization.

In [2]:
from nltk.corpus import stopwords
import nltk
import re
from symspellpy import SymSpell, Verbosity
from nltk.stem import WordNetLemmatizer
import pkg_resources
import inflect
import contractions
# Fetch the NLTK corpora used below for stopword removal and lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')

# A set gives constant-time membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))

# Spell checker backed by the frequency dictionaries bundled with symspellpy.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# NOTE(review): pkg_resources is deprecated in recent setuptools releases;
# consider importlib.resources once the minimum Python version is confirmed.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# The load_* methods return False instead of raising when the file is missing,
# which would silently turn spell correction into a no-op. Fail loudly instead.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
    raise FileNotFoundError(f"SymSpell bigram dictionary not found: {bigram_path}")

lemmatizer = WordNetLemmatizer()
# inflect engine used to spell out digit tokens as words.
p = inflect.engine()

def standardize_numbers(text):
    """Spell out every whitespace-delimited token that consists only of digits.

    Tokens are obtained with ``str.split()`` and re-joined with single spaces,
    so runs of whitespace in the input are collapsed.

    Args:
        text: Input string.

    Returns:
        str: The text with digit-only tokens replaced by ``p.number_to_words``
        output; all other tokens are left unchanged.
    """
    converted = []
    for token in text.split():
        if token.isdigit():
            token = p.number_to_words(token)
        converted.append(token)
    return ' '.join(converted)

def handle_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Args:
        text: Input string.

    Returns:
        The value returned by ``contractions.fix(text)``.
    """
    expanded = contractions.fix(text)
    return expanded

def clean_text(text):
    """Normalize one raw text value into a space-separated string of lemmas.

    Pipeline, in order: lowercase, spell out digit-only tokens, expand
    contractions, spell-correct each token with SymSpell, replace every
    character that is not an ASCII letter, digit or whitespace with a space,
    drop English stopwords, and lemmatize the remaining tokens.

    Args:
        text: Raw cell value. Non-string values are converted with ``str``.

    Returns:
        str: The cleaned text. Missing values (``None``, NaN, ``pd.NA``)
        yield ``''`` rather than being stringified and cleaned as a word.
    """
    # A missing cell would otherwise become the literal token 'nan'/'none'
    # and leak into the vocabulary as if it were real text.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''

    text = str(text).lower()
    text = standardize_numbers(text)
    text = handle_contractions(text)

    # NOTE(review): spell correction runs before punctuation is stripped, so
    # tokens reach the lookup with punctuation still attached; consider
    # swapping the two steps after checking the effect on the output.
    corrected_words = []
    for word in text.split():
        suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)
        # Keep the original token if the lookup returns nothing.
        corrected_words.append(suggestions[0].term if suggestions else word)
    text = ' '.join(corrected_words)

    # Replace special characters with spaces so adjoining words stay separated.
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)

    # Filter stopwords, then lemmatize the tokens that remain.
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(words)

# Add cleaned copies of both text columns; the raw columns are left untouched.
df['cleaned_headline'] = df['headline'].map(clean_text)
df['cleaned_description'] = df['short_description'].map(clean_text)

  import pkg_resources
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/caoyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/caoyun/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


1.3 Perform text tokenization and vectorization using TF-IDF.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# Fit one vectorizer per text field so each keeps its own vocabulary and IDF
# weights. Refitting a single shared instance would discard the headline
# vocabulary, making it impossible to transform new headlines later.
headline_vectorizer = TfidfVectorizer()
tfidf_headline = headline_vectorizer.fit_transform(df['cleaned_headline'])

description_vectorizer = TfidfVectorizer()
tfidf_description = description_vectorizer.fit_transform(df['cleaned_description'])

# Backward-compatible name: `tfidf_vectorizer` refers to the vectorizer fitted
# on the descriptions.
tfidf_vectorizer = description_vectorizer

# Column-wise concatenation: headline features first, then description features.
# The result stays sparse; a dense frame would need one column per vocabulary term.
tfidf = hstack([tfidf_headline, tfidf_description])

1.4 Extract and analyze different features from the text that might be useful for classification, such as word count,
sentence length, n-grams, etc.

In [4]:
#pip install category_encoders

from category_encoders import BinaryEncoder

# change date: parse once, then expose the calendar parts as numeric features
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

# word count: whitespace-delimited tokens per field. Missing text is replaced
# with '' first so it counts as 0 words; str(NaN) would otherwise produce the
# one-token string 'nan'.
df['headline_word_count'] = df['headline'].fillna('').apply(lambda x: len(str(x).split()))
df['description_word_count'] = df['short_description'].fillna('').apply(lambda x: len(str(x).split()))

# encode authors using Binary encoding, then convert to sparse for stacking
encoder = BinaryEncoder(cols=['authors'], return_df=True)
df_encoded = encoder.fit_transform(df['authors'])
df_encoded_sparse = csr_matrix(df_encoded.values)

# keep only the numeric feature columns
selected_columns = ['year', 'month', 'day', 'headline_length', 'short_description_length', 'headline_word_count', 'description_word_count' ]
new_df = df[selected_columns].copy()
# combine column-wise: numeric features, encoded authors, then TF-IDF terms
data = hstack([csr_matrix(new_df), df_encoded_sparse,tfidf])

print(data)

ModuleNotFoundError: No module named 'category_encoders'