In [1]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

# Load the dataset
newsgroups = fetch_20newsgroups(subset='all', shuffle=True, random_state=42, remove=('headers', 'footers', 'quotes'))
df = pd.DataFrame({'text': newsgroups.data, 'target': newsgroups.target})

# Display the first few rows of the dataset
df.head()


Unnamed: 0,text,target
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,\n\n\n\n\tFinally you said what you dream abou...,17
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3
4,1) I have an old Jasmine drive which I cann...,4


In [7]:
df['text'][0]

'sure basher pen fan pretti confus lack kind post recent pen massacr devil actual bit puzzl bit reliev howev go put end nonpittsburgh relief bit prais pen man kill devil wors thought jagr show much better regular season stat also lot fo fun watch playoff bowman let jagr lot fun next coupl game sinc pen go beat pulp jersey anyway disappoint see island lose final regular season game pen rule'

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # Stem the tokens
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

df['text'] = df['text'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zabih\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zabih\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Zabih\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text'])
y = df['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

# Train Decision Tree classifier
dt_classifier = DecisionTreeClassifier()
dt_classifier.fit(X_train, y_train)

# Evaluate classifiers
y_pred_nb = nb_classifier.predict(X_test)
y_pred_dt = dt_classifier.predict(X_test)

print("Naive Bayes Classifier Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Decision Tree Classifier Accuracy:", accuracy_score(y_test, y_pred_dt))


Naive Bayes Classifier Accuracy: 0.6676392572944297
Decision Tree Classifier Accuracy: 0.4679045092838196


In [5]:
def chatbot_response(user_input):
    user_input_processed = preprocess_text(user_input)
    user_input_vectorized = vectorizer.transform([user_input_processed])
    
    response_nb = nb_classifier.predict(user_input_vectorized)
    response_dt = dt_classifier.predict(user_input_vectorized)
    
    response_map = {i: newsgroups.target_names[i] for i in range(len(newsgroups.target_names))}
    
    response = f"Naive Bayes: {response_map[response_nb[0]]}, Decision Tree: {response_map[response_dt[0]]}"
    return response


In [8]:
user_input = 'sure basher pen fan pretti confus lack kind post recent pen massacr devil actual bit puzzl bit reliev howev go put end nonpittsburgh relief bit prais pen man kill devil wors thought jagr show much better regular season stat also lot fo fun watch playoff bowman let jagr lot fun next coupl game sinc pen go beat pulp jersey anyway disappoint see island lose final regular season game pen rule'
chatbot_response(user_input)

'Naive Bayes: rec.sport.hockey, Decision Tree: rec.sport.hockey'

# Save the models and vectors into the pickle files

In [9]:
# Save the models and vectors into the pickle files
import pickle

pickle.dump(nb_classifier, open('nb_classifier.pkl', 'wb'))

pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

# Load the models and vectors from the pickle files

nb_classifier = pickle.load(open('nb_classifier.pkl', 'rb'))

vectorizer = pickle.load(open('vectorizer.pkl', 'rb'))

# Use the models and vectors to make predictions