### Part 1: Data Preprocessing
1.1 Load the dataset and perform initial exploration to understand its structure.

In [34]:
import pandas as pd

# Location of the raw dataset, relative to the notebook's working directory
csv_path = 'News_Category_Dataset_v3.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to confirm the columns were parsed as expected
print(df.head())

   Unnamed: 0                                           headline   category  \
0           0  Over 4 Million Americans Roll Up Sleeves For O...  U.S. NEWS   
1           1  American Airlines Flyer Charged, Banned For Li...  U.S. NEWS   
2           2  23 Of The Funniest Tweets About Cats And Dogs ...     COMEDY   
3           3  The Funniest Tweets From Parents This Week (Se...  PARENTING   
4           4  Woman Who Called Cops On Black Bird-Watcher Lo...  U.S. NEWS   

                                   short_description               authors  \
0  Health experts said it is too early to predict...  Carla K. Johnson, AP   
1  He was subdued by passengers and crew when he ...        Mary Papenfuss   
2  "Until you have a dog you don't understand wha...         Elyse Wanshel   
3  "Accidentally put grown-up toothpaste on my to...      Caroline Bologna   
4  Amy Cooper accused investment firm Franklin Te...        Nina Golgowski   

         date  headline_length  short_description_length

1.2 Clean the text data, including removing special characters, stopwords, and applying lowercasing.

In [35]:
from nltk.corpus import stopwords
import nltk
import re
# Make sure the NLTK stopword corpus is available locally before reading it
nltk.download('stopwords')

# Hold the English stopwords in a set so membership checks are cheap
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

def clean_text(text):
    """Normalise one piece of raw text before TF-IDF vectorisation.

    Steps: lowercase, replace every character that is not an ASCII letter,
    digit or whitespace with a space, then drop the English stopwords held
    in the module-level ``stop_words`` set.

    Parameters
    ----------
    text : Any
        Raw cell value. Non-string values are converted with ``str``;
        missing scalars (``None``, ``NaN``, ``pd.NA``) are treated as empty.

    Returns
    -------
    str
        The remaining tokens joined by single spaces, or ``''`` when the
        input is missing.
    """
    if not isinstance(text, str):
        # str(NaN) would produce the literal token "nan", which would then
        # end up in the vocabulary; treat missing values as empty text.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    # lowercase
    text = text.lower()
    # remove special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    # remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

# Run the cleaner over both text columns, writing each result to a new column
for raw_column, cleaned_column in (
    ('headline', 'cleaned_headline'),
    ('short_description', 'cleaned_description'),
):
    df[cleaned_column] = df[raw_column].apply(clean_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Yue\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


1.3 Perform text tokenization and vectorization using TF-IDF.

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# Fit one vectorizer per text field. Re-fitting a single vectorizer on the
# descriptions would discard the vocabulary learned from the headlines, so
# unseen headlines could no longer be transformed consistently afterwards.
headline_vectorizer = TfidfVectorizer()
description_vectorizer = TfidfVectorizer()

tfidf_headline = headline_vectorizer.fit_transform(df['cleaned_headline'])
tfidf_description = description_vectorizer.fit_transform(df['cleaned_description'])

# Backward-compatible alias: as before, this name ends up referring to the
# vectorizer that was fitted on the descriptions.
tfidf_vectorizer = description_vectorizer

# Stack the two sparse blocks column-wise: headline features first, then
# description features. The matrices stay sparse instead of being expanded
# into DataFrames.
tfidf = hstack([tfidf_headline, tfidf_description])

1.4 Extract and analyze different features from the text that might be useful for classification, such as word count,
sentence length, n-grams, etc.

In [37]:
# Parse the date column and split it into calendar components
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day


def _count_words(column):
    """Return the number of whitespace-separated tokens per row (0 if missing)."""
    # fillna('') first: str(NaN) == 'nan' would otherwise count as one word
    return column.fillna('').astype(str).str.split().str.len()


# word count
df['headline_word_count'] = _count_words(df['headline'])
df['description_word_count'] = _count_words(df['short_description'])

# Keep only the numeric features: date parts, the length columns already
# present in df, and the word counts computed above
selected_columns = ['year', 'month', 'day', 'headline_length', 'short_description_length', 'headline_word_count', 'description_word_count' ]
new_df = df[selected_columns].copy()

# combine: numeric features occupy the first columns, TF-IDF features follow
data = hstack([csr_matrix(new_df), tfidf])

print(data)

  (0, 0)	2022.0
  (0, 1)	9.0
  (0, 2)	23.0
  (0, 3)	76.0
  (0, 4)	154.0
  (0, 5)	11.0
  (0, 6)	29.0
  (0, 6525)	0.43411106347172085
  (0, 11816)	0.2930364348388803
  (0, 49772)	0.34715627086759526
  (0, 35589)	0.4295348408110416
  (0, 46307)	0.4452883710668971
  (0, 42984)	0.31920292699930125
  (0, 2376)	0.2458235223031094
  (0, 32366)	0.24506760378570813
  (0, 80367)	0.21098028126920687
  (0, 104497)	0.27133950808906415
  (0, 65448)	0.3839295355670428
  (0, 102558)	0.13315756767983997
  (0, 76636)	0.32170408678838597
  (0, 99768)	0.19974144063358404
  (0, 56745)	0.3839295355670428
  (0, 98192)	0.2572541275751765
  (0, 130669)	0.15261260887146255
  (0, 74519)	0.2572541275751765
  :	:
  (209526, 4)	122.0
  (209526, 5)	9.0
  (209526, 6)	19.0
  (209526, 49961)	0.4286941139970831
  (209526, 15815)	0.43451194566248263
  (209526, 23889)	0.46594634185952194
  (209526, 24053)	0.353651167389544
  (209526, 29816)	0.28532827909507463
  (209526, 30338)	0.3184167770560488
  (209526, 42703)	0.320068

<function print>