In [None]:
import pandas as pd

# Load the dataset
# lines=True makes pandas parse the file as one JSON object per line.
dataset_path = '/kaggle/input/news-category-dataset/News_Category_Dataset_v3.json'
df = pd.read_json(dataset_path, lines=True)

# Preview the first few rows to confirm the load worked.
preview = df.head()
print(preview)


In [None]:

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Data Exploration
print("Dataset structure:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()

print("Missing values:")
# Per-column count of null entries.
print(df.isnull().sum())

print("Unique categories:")
print(df['category'].unique())

print("Descriptive statistics:")
print(df.describe())

# Preprocessing
# Keep only rows where both text fields are present, then discard rows that are
# exact duplicates across every column.
df = (
    df
    .dropna(subset=['headline', 'short_description'])
    .drop_duplicates()
)

# Text Analysis and Feature Extraction
# English stopword list, held as a set for fast membership checks.
stop_words = set(stopwords.words('english'))


def _tokenize_without_stopwords(text):
    """Tokenize `text` with word_tokenize and drop tokens whose lowercase form is a stopword."""
    return [token for token in word_tokenize(text) if token.lower() not in stop_words]


# Tokenization + stopword removal, one pass per text column.
for source_column, token_column in (
    ('headline', 'headline_tokens'),
    ('short_description', 'description_tokens'),
):
    df[token_column] = df[source_column].apply(_tokenize_without_stopwords)



In [None]:
# Data Visualization
# Category distribution: one horizontal bar per category, bar length = row count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(y='category', data=df, ax=ax)
ax.set_title('Category Distribution')
ax.set_xlabel('Count')
ax.set_ylabel('Category')
plt.show()

# Word cloud of headline words
# Lowercase every headline and merge them into one space-separated string.
headline_words = ' '.join(df['headline'].str.lower())

cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(headline_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud - Headline Words')
plt.show()




In [None]:
# Word cloud of description words
# Lowercase every short description and merge them into one space-separated string.
description_words = ' '.join(df['short_description'].str.lower())

cloud_options = dict(width=800, height=400, background_color='white')
wordcloud = WordCloud(**cloud_options).generate(description_words)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud - Description Words')
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# Preprocessing
df = df.dropna(subset=['headline', 'short_description'])


def _join_tokens(tokens):
    """Join a token list with single spaces; return the value unchanged if it is already a string.

    The pass-through makes this cell safe to re-run: calling ' '.join on an
    already-joined string would insert a space between every character.
    """
    return tokens if isinstance(tokens, str) else ' '.join(tokens)


# Convert token lists to strings
df['headline_tokens'] = df['headline_tokens'].apply(_join_tokens)
df['description_tokens'] = df['description_tokens'].apply(_join_tokens)

# Feature Engineering
# Combine the stopword-filtered headline and short description into a single text
# feature. This column is what the model is trained on; it is no longer overwritten
# with the raw headline/description, which discarded the token filtering above and
# the key used for de-duplication below.
df['text'] = df['headline_tokens'] + ' ' + df['description_tokens']

# Drop rows whose combined text is identical
df = df.drop_duplicates(subset='text')

# Split the dataset into train and test sets (fixed random_state for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['category'], test_size=0.2, random_state=42
)

# Feature Extraction
# Fit the TF-IDF vocabulary on the training split only, then reuse it for the test split.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)



In [None]:
# Modeling - Logistic Regression
# The library default of max_iter=100 can stop the solver before it has converged
# (scikit-learn then emits a ConvergenceWarning). Allow more iterations; the solver
# still stops early once its tolerance is met.
# NOTE(review): 1000 is a generous cap, not a tuned value; adjust if fitting is too slow.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vectorized, y_train)



In [None]:
# Evaluation
# Predict labels for the test split, then print each metric under its heading.
y_pred = model.predict(X_test_vectorized)

for heading, metric in (
    ('Classification Report:', classification_report),
    ('Confusion Matrix:', confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))



In [None]:
# Interpretation
# Get the most important features (words) for each category
feature_names = vectorizer.get_feature_names_out()
categories = df['category'].unique()

# Map each label the model was fitted on to its row in model.coef_. Built once,
# instead of converting classes_ to a list and scanning it for every category.
class_to_row = {label: row for row, label in enumerate(model.classes_)}

for category in categories:
    category_index = class_to_row.get(category)
    if category_index is None:
        # The category exists in df but is not among the fitted classes (for example
        # it landed only in the test split), so there are no coefficients to rank.
        print(f'No fitted coefficients for category "{category}"; skipping.')
        print()
        continue
    # argsort is ascending: take the last 10 indices and reverse them so the
    # largest coefficient comes first.
    top_features = model.coef_[category_index].argsort()[-10:][::-1]
    top_words = [feature_names[idx] for idx in top_features]
    print(f'Top words for category "{category}":')
    print(top_words)
    print()