In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.tokenize import word_tokenize

# Download NLTK resources (run this once)
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajwa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajwa\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [2]:
def preprocess_text(text):
    # Tokenization
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]
    
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    
    # Reassemble the preprocessed text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [None]:
import json

# Read data from JSON file
with open('output.json', 'r') as file:
    data = json.load(file)

# Assuming your JSON file has the same structure as the provided 'data' variable
df = pd.DataFrame(data)

# Preprocess the text data
df['UserInput_Preprocessed'] = df['UserInput'].apply(preprocess_text)
df['BotResponse_Preprocessed'] = df['BotResponse'].apply(preprocess_text)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer()
X_user_input = vectorizer.fit_transform(df['UserInput_Preprocessed'])
X_bot_response = vectorizer.transform(df['BotResponse_Preprocessed'])

# Display the preprocessed data and TF-IDF vectors
print("Preprocessed Data:")
print(df[['UserInput_Preprocessed', 'BotResponse_Preprocessed']])
print("\nTF-IDF Vectors for User Input:")
print(X_user_input.toarray())
print("\nTF-IDF Vectors for Bot Response:")
print(X_bot_response.toarray())
