In [26]:
import json
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources this cell relies on: 'punkt' for word_tokenize and
# 'stopwords' for stopwords.words('english'). When a package is already
# installed, NLTK just reports it as up-to-date (see the cell output).
# NOTE(review): newer NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

# Function to read first 100 entries from a JSON file
def read_data(filename, num_entries=100):
    """Read up to ``num_entries`` records from a JSON Lines file.

    Each non-blank line of ``filename`` is parsed as one standalone JSON
    document. Reading stops as soon as ``num_entries`` records have been
    collected or the file is exhausted, whichever comes first.

    Args:
        filename: Path of the file to read (one JSON document per line).
        num_entries: Maximum number of records to return. Values <= 0
            yield an empty list.

    Returns:
        A list with at most ``num_entries`` parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            # Check the limit before consuming the line so a short file simply
            # ends the loop instead of raising StopIteration.
            if len(records) >= num_entries:
                break
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            records.append(json.loads(line))
    return records

# Function to preprocess the data
def preprocess_data(data):
    """Normalise the text fields of every record in ``data``.

    The 'description' and 'title' values of each record are replaced by the
    result of ``preprocess_text``; a missing key is treated as an empty
    string. Records are modified in place and the same list is returned.
    """
    text_fields = ('description', 'title')
    for record in data:
        for field in text_fields:
            record[field] = preprocess_text(record.get(field, ''))
    return data

def _english_stop_words():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached


def preprocess_text(text):
    """Clean, tokenize, stop-word-filter and stem a piece of text.

    Steps: join list input into one string, lowercase, strip HTML tags,
    strip punctuation, tokenize with ``word_tokenize``, drop English stop
    words, apply the Porter stemmer, and re-join the stems with spaces.

    Args:
        text: A string, a list/tuple of strings, or ``None`` (e.g. a JSON
            ``null`` field). Non-string values are converted with ``str``.

    Returns:
        The preprocessed text as a single space-separated string; an empty
        string when there is nothing left to tokenize.
    """
    if text is None:
        # A JSON null reaches us as None; treat it like a missing field.
        return ''
    if isinstance(text, (list, tuple)):
        # Join the list elements into a single string, skipping null items.
        text = ' '.join(str(part) for part in text if part is not None)
    elif not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Replace tags with a space so words separated only by markup
    # ("one<br>two") are not fused into a single token.
    text = re.sub(r'<[^>]*>', ' ', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    if not text.strip():
        # Nothing to tokenize; tokenizing blank text would yield no tokens.
        return ''

    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()
    return ' '.join(
        stemmer.stem(token) for token in tokens if token not in stop_words
    )

# Function to write data to a JSON file
def write_data(data, filename):
    """Serialise ``data`` as JSON (4-space indent) into ``filename``.

    The target file is opened in write mode, so existing content is replaced.
    """
    with open(filename, 'w') as out_handle:
        json.dump(obj=data, fp=out_handle, indent=4)

# Main function
def main(input_path='data.json', output_path='preprocess.json', num_entries=100):
    """Run the read -> preprocess -> write pipeline.

    Args:
        input_path: File passed to ``read_data``. Defaults to 'data.json'.
        output_path: File passed to ``write_data``. Defaults to
            'preprocess.json'.
        num_entries: Maximum number of entries to read from ``input_path``.

    Calling ``main()`` with no arguments behaves exactly as before: the first
    100 entries of 'data.json' are preprocessed and written to
    'preprocess.json'.
    """
    # Read the leading entries from the input file
    data = read_data(input_path, num_entries=num_entries)

    # Preprocess the data
    preprocessed_data = preprocess_data(data)

    # Write preprocessed data to the output file
    write_data(preprocessed_data, output_path)

# Run the pipeline only when this code is executed directly (as a script or a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /home/muhammad/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/muhammad/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
import json

# Function to count the number of entries in a JSON file
def count_entries(filename):
    """Return ``len()`` of the JSON document stored in ``filename``.

    The whole file is parsed as a single JSON value; for a top-level array
    this is the number of elements.
    """
    with open(filename, 'r') as handle:
        return len(json.load(handle))

# Example usage: count the entries in 'preprocess.json' (the file the
# preprocessing pipeline writes) and print the total.
num_entries = count_entries('preprocess.json')
print("Number of entries:", num_entries)


Number of entries: 100
