In [10]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import string
import nltk

# Load the dataset (considering the only sheet)
file_path = "C:/Users/basha/Downloads/NLP Data Set - Dataset 2.csv"
df = pd.read_csv(file_path, skiprows=1)  # Skip the first row as it contains irrelevant information

# Display the structure of the dataset
print("Dataset Structure:")
print(df.info())

# Handle missing values and duplicates
df = df.dropna()  # Drop rows with missing values
df = df.drop_duplicates()  # Drop duplicate rows

# Explore the distribution of different categories
category_distribution = df['Category'].value_counts()
print("\nCategory Distribution:")
print(category_distribution)

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Text Cleaning and Tokenization
def clean_and_tokenize(text):
    # Remove special characters and symbols
    text = ''.join([char for char in text if char not in string.punctuation])
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    return tokens

# Apply cleaning and tokenization to the 'News Text' column
df['Cleaned Tokens'] = df['News Text'].apply(clean_and_tokenize)

# Display the cleaned and tokenized data
print("\nCleaned and Tokenized Data:")
print(df[['News Text', 'Cleaned Tokens']])

# Convert the tokenized text into numerical vectors using TF-IDF transformation
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['Cleaned Tokens'].apply(lambda x: ' '.join(x)))

# Display the TF-IDF matrix
print("\nTF-IDF Matrix:")
print(X_tfidf)

# Model Selection and Training
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['Category'], test_size=0.2, random_state=42)

# Choose Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Model Evaluation
y_pred = nb_classifier.predict(X_test)

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("\nModel Evaluation:")
print(f"Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)

# Analyze the most important features (words) using TF-IDF values
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the feature importance (TF-IDF values) for each category
category_feature_importance = {}
for category_idx, category in enumerate(nb_classifier.classes_):
    feature_importance = nb_classifier.feature_log_prob_[category_idx]
    category_feature_importance[category] = dict(zip(feature_names, feature_importance))

# Display feature importance for each category
print("\nFeature Importance for Each Category:")
for category, features in category_feature_importance.items():
    top_features = sorted(features.items(), key=lambda x: x[1], reverse=True)[:5]
    print(f"\nCategory: {category}")
    for feature, importance in top_features:
        print(f"{feature}: {importance:.2f}")


Dataset Structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   News Text  10 non-null     object
 1   Category   10 non-null     object
dtypes: object(2)
memory usage: 292.0+ bytes
None

Category Distribution:
Category
Technology       1
Politics         1
Sports           1
Health           1
Business         1
Entertainment    1
Environment      1
Lifestyle        1
Science          1
Travel           1
Name: count, dtype: int64

Cleaned and Tokenized Data:
                                           News Text  \
0                  New technology advancements in AI   
1  Political leaders discuss climate change policies   
2             Exciting game in the sports tournament   
3               Health tips for a balanced lifestyle   
4                Stock market trends and predictions   
5       Hollywood actors share their latest projects   
6   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\basha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\basha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
