# Predict difficulty of news articles

In [1]:
import pymongo
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Connect to MongoDB

In [2]:
# Open a client with pymongo's default connection settings, then resolve the
# project database and the collection holding the processed articles.
myclient = pymongo.MongoClient()
mydb = myclient.get_database("finalProject")
main = mydb.get_collection("main_processed_article")

# Load the logistic regression model and the TF-IDF vectorizer

In [3]:
# Load the trained logistic regression classifier from disk.
# The `with` block guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("logistic_regression.pickle", "rb") as pickle_in:
    model = pickle.load(pickle_in)

In [4]:
# Load the fitted TF-IDF vectorizer from disk.
# The `with` block guarantees the file handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code -- only load pickle files that
# come from a trusted source.
with open("tfidf_vectorizer.pickle", "rb") as pickle_in:
    tfidf = pickle.load(pickle_in)

# Iterate through the cursor, adding the predictions for each article

In [5]:
# An empty filter matches every document in the collection. The returned
# cursor is consumed by iteration, so re-run this cell before looping again.
cursor = main.find(filter={})

In [6]:
# Walk every article from the cursor and store a predicted `level_binary`
# on those that do not have one yet.
for article in cursor:

    # Filter that identifies this article for the update below
    myquery = {"_id": article["_id"]}

    # First check if the article has already been classified.
    # The document yielded by the cursor already carries its fields, so no
    # second query is needed; .get() tolerates documents where the field was
    # never written.
    article_class = article.get("level_binary")

    # If it has not been classified yet then make a prediction and update the
    # corresponding field. Only a missing/None/empty value counts as
    # "not classified": 0 is a legitimate class and must not be overwritten.
    if article_class in (None, ""):

        # Get the bag of words and make a prediction
        # NOTE(review): transform() treats each element of its argument as a
        # separate document and only the first prediction is stored below --
        # presumably bag_of_words is a one-element list; confirm the schema.
        bag_of_words = article['bag_of_words']
        vectors = tfidf.transform(bag_of_words)
        article_prediction = model.predict(vectors)
        # Cast to a plain int before writing the value to MongoDB
        article_prediction = int(article_prediction[0])

        # See: https://www.w3schools.com/python/python_mongodb_update.asp
        newvalues = { "$set": { "level_binary": article_prediction } }
        main.update_one(myquery, newvalues)
    else:
        print("Article already classified")