In [None]:
#Project 2
#Casey Cruz
import json
import math
#import time

from tabulate import tabulate
#for stratified k fold cross validation
from sklearn.model_selection import StratifiedKFold
#for lemmatization/stopwords
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#fetch the NLTK corpora used below; once they are installed the download
#only reports that the package is up-to-date, so this loop can be commented
#out after the first run
for _corpus in ('wordnet', 'stopwords'):
    nltk.download(_corpus)


#English stop words, kept in a set for fast membership tests in cleanData()
stop_words = set(stopwords.words('english'))

#single WordNet lemmatizer instance shared by every cleanData() call
lemmatizer = WordNetLemmatizer()
    
#function to remove stop words and numbers
def cleanData(text):
    """Drop stop words and digit-only tokens from *text* and lemmatize the rest.

    The text is split on whitespace. A token is discarded when its lowercased
    form is in ``stop_words`` or when ``str.isdigit()`` is true for it. Every
    remaining token is passed, with its case unchanged, through
    ``lemmatizer.lemmatize``.

    Returns the surviving tokens joined into one space-separated string.
    """
    kept = [
        lemmatizer.lemmatize(token)
        for token in text.split()
        if not (token.lower() in stop_words or token.isdigit())
    ]
    return ' '.join(kept)
#' '.join() concatenates the cleaned words into one string, separated by single spaces

#list to store the cleaned records, one dict per line of the input file
json_data = []

#open & read the dataset, which holds one JSON object per line
#the encoding is given explicitly so decoding does not depend on the
#platform's default locale encoding
with open('News_Category_DataSet_v3.json', 'r', encoding='utf-8') as file:
    for line in file:
        #skip blank lines (e.g. a trailing empty line), which json.loads would reject
        if not line.strip():
            continue
        item = json.loads(line)
        #clean both text fields once, up front, so later steps can just split them
        item['headline'] = cleanData(item['headline'])
        item['short_description'] = cleanData(item['short_description'])
        json_data.append(item)

#Laplace (add-one) smoothing
#count how often each word occurs in each category
#a word that never occurs in a category still gets a count of 1 there, so no probability is zero
def laplaceSmoothing(training_data):
    """Estimate Laplace-smoothed P(word | category) from training articles.

    Each item of ``training_data`` is a dict with "category", "headline" and
    "short_description" keys. The two text fields are used exactly as given
    and only split on whitespace: the records are already cleaned once when
    the dataset is loaded, and categoryProb() likewise splits its article
    without cleaning it again, so training and test text go through the same
    preprocessing.

    Returns a dict mapping every word seen in training to a dict that maps
    every category seen in training to

        (occurrences of word in category + 1)
        / (total word tokens in category + number of distinct words)

    An empty dict is returned when the training data contains no words.
    """
    wordCounts = {}      #word -> {category: occurrences of the word in that category}
    categoryCounts = {}  #category -> total number of word tokens seen in that category

    for data in training_data:
        category = data["category"]
        for word in data["headline"].split() + data["short_description"].split():
            perCategory = wordCounts.setdefault(word, {})
            perCategory[category] = perCategory.get(category, 0) + 1
            categoryCounts[category] = categoryCounts.get(category, 0) + 1

    #the smoothing denominator depends only on the category, so compute it once
    vocabularySize = len(wordCounts)
    denominators = {
        category: total + vocabularySize
        for category, total in categoryCounts.items()
    }

    wordProbabilities = {}
    for word, perCategory in wordCounts.items():
        #a word never seen in a category counts as 0 there, i.e. 1 after smoothing
        wordProbabilities[word] = {
            category: (perCategory.get(category, 0) + 1) / denominator
            for category, denominator in denominators.items()
        }

    return wordProbabilities

#use the per-category word probabilities to calculate the probability
#that a title & description belong to each category.
def categoryProb(article, wordProbabilities):
    """Score how likely *article* is to belong to each category.

    ``article`` is a dict with "headline" and "short_description" strings;
    both are split on whitespace. ``wordProbabilities`` maps word ->
    {category: P(word | category)}, as built by laplaceSmoothing(). Words
    missing from ``wordProbabilities`` are ignored.

    The per-word probabilities are combined in log space, because the plain
    product of many small probabilities underflows to 0.0 on longer texts
    and would make every category tie. The scores are then normalized, so
    the returned dict maps each category to a value in [0, 1], the values
    sum to 1, and a higher value means a more likely category.

    Returns an empty dict when ``wordProbabilities`` is empty, and 0.0 for
    every category when each category has at least one zero probability.
    """
    #get individual words from headline/description
    words = article['headline'].split() + article['short_description'].split()

    #nothing was learned, so there are no categories to score
    if not wordProbabilities:
        return {}

    #laplaceSmoothing() gives every word an entry for every category,
    #so the categories can be read off the first word's entry
    categories = wordProbabilities[next(iter(wordProbabilities))]

    #look each known word up once instead of once per category
    knownWordProbs = [wordProbabilities[word] for word in words if word in wordProbabilities]

    #sum of log-probabilities == log of the product of probabilities
    logScores = {}
    for category in categories:
        logScores[category] = sum(
            math.log(probs[category]) if probs[category] > 0 else -math.inf
            for probs in knownWordProbs
        )

    if not logScores:
        return {}

    bestScore = max(logScores.values())
    if bestScore == -math.inf:
        #every category hit a zero probability, so every product is exactly 0
        return {category: 0.0 for category in logScores}

    #shift by the best score before exponentiating so the best category maps
    #to exp(0) == 1 and cannot underflow
    weights = {category: math.exp(score - bestScore) for category, score in logScores.items()}
    total = sum(weights.values())
    return {category: weight / total for category, weight in weights.items()}

#find the highest probability & assign that category to the title & description
#by comparing the per-category probabilities
def assignCategory(testData, wordProb):
    """Predict a category for every article in *testData*.

    Each article is scored with categoryProb(article, wordProb) and the
    category with the highest score is chosen; on a tie max() keeps the
    first category it encounters.

    Returns the predicted categories as a list, in the same order as
    ``testData``.
    """
    #lazy generator: each article is scored right before its prediction is picked
    scoredArticles = (categoryProb(article, wordProb) for article in testData)
    return [max(scores, key=scores.get) for scores in scoredArticles]

#calculate the accuracy of the probability
def calculate_accuracy(predictions, testData):
    """Return the fraction of *predictions* that match the true categories.

    ``predictions[i]`` is compared with ``testData[i]["category"]`` for every
    index of ``predictions``.

    Returns a float between 0 and 1, or 0.0 when ``predictions`` is empty
    (instead of dividing by zero).

    Raises IndexError if ``testData`` is shorter than ``predictions``.
    """
    total = len(predictions)
    if total == 0:
        return 0.0

    correct = sum(
        1
        for i, predicted in enumerate(predictions)
        if predicted == testData[i]["category"]
    )
    return correct / total

#now do cross validation
#number of folds for cross-validation
numFolds = 5

#stratified splitter; shuffling with a fixed random_state keeps the folds repeatable
stratKfold = StratifiedKFold(n_splits=numFolds, shuffle=True, random_state=1)

#accuracy (in percent) of each fold
accuracy_scores = []

#target class (article category) of every record, used to stratify the folds
categoryLabels = [item['category'] for item in json_data]

for trainIndex, testIndex in stratKfold.split(json_data, categoryLabels):
    #split the records of the current fold into training and testing data
    trainingData = [json_data[i] for i in trainIndex]
    testData = [json_data[i] for i in testIndex]

    #train on the fold's training data, then predict its test data
    wordProbabilities = laplaceSmoothing(trainingData)
    predictions = assignCategory(testData, wordProbabilities)

    #store this fold's accuracy as a percentage
    individualAccuracy = calculate_accuracy(predictions, testData) * 100
    accuracy_scores.append(individualAccuracy)

avgAccuracy = sum(accuracy_scores) / numFolds
print(accuracy_scores)
print(f"Average Accuracy (Cross-Validation): {avgAccuracy:.2f}%")

#table comparing actual vs. predicted categories
#testData and predictions still hold the values of the last fold only
table = [
    [article["category"], predicted]
    for article, predicted in zip(testData, predictions)
]

print("\nActual vs. Predicted Categories:")
print(tabulate(table, headers=["Actual Category", "Predicted Category"]))

[nltk_data] Downloading package wordnet to /Users/casey/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/casey/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
