In [4]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import pymongo
import nltk
nltk.download('stopwords')
nltk.download('words')
nltk.download('punkt')
from ml_dev import preprocess

def pull_train_data(db):
    """Fetch a class-balanced training set from the ``docs`` collection.

    Every document with ``truth == True`` is fetched, followed by at most the
    same number of documents with ``truth == False``.

    Args:
        db: Database handle exposing a ``docs`` collection that supports
            ``find`` and ``aggregate`` (e.g. a ``pymongo`` database).

    Returns:
        A flat list of documents: the real ones first, then the fake ones.
        Empty when the collection holds no real documents.
    """
    real_news = list(db.docs.find({"truth": True}))
    if not real_news:
        # ``$limit`` rejects 0, and there is nothing to balance against.
        return []

    # Filter first, then cap. With ``$limit`` ahead of ``$match`` only the
    # first N documents of the collection were inspected, so fewer fake
    # documents (possibly none) came back than intended.
    fake_news = db.docs.aggregate([
        {"$match": {"truth": False}},
        {"$limit": len(real_news)},
    ])

    # Concatenate rather than append: appending nested the whole fake list
    # as a single trailing element of the result.
    return real_news + list(fake_news)

def train_model(db):
    """Fit a bag-of-words -> TF-IDF -> XGBoost pipeline on the training data.

    The data comes from ``pull_train_data(db)``. Each document's ``'text'`` is
    passed through ``preprocess.tfidf_preprocess`` to build the features, and
    its ``'truth'`` value is used as the label.

    Args:
        db: Database handle forwarded to ``pull_train_data``.

    Returns:
        The fitted ``Pipeline``.
    """
    def _documents(entries):
        # Entries are either documents (dicts) or containers of documents;
        # yield the documents themselves in their original order.
        for entry in entries:
            if isinstance(entry, dict):
                yield entry
            else:
                for nested in entry:
                    yield nested

    X, y = [], []
    for doc in _documents(pull_train_data(db)):
        X.append(preprocess.tfidf_preprocess(doc['text']))
        y.append(doc['truth'])

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('xgb', XGBClassifier()),
    ]
    pipe = Pipeline(steps)
    pipe.fit(X, y)

    return pipe

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/aviboppana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/aviboppana/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /home/aviboppana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import config

# Credentials go straight to MongoClient: Database.authenticate() was
# deprecated in PyMongo 3.5 and removed in 4.0. authSource is set to the
# target database, which is the database the authenticate() call ran against.
connection = pymongo.MongoClient(
    config.DB_HOST,
    config.DB_PORT,
    username=config.DB_USER,
    password=config.DB_PASS,
    authSource=config.DB_NAME,
)
db = connection[config.DB_NAME]

# Train the classifier on the documents stored in that database.
model = train_model(db)


<h1> News Classifier </h1>

In [None]:
from analyze_stylistics import get_from_pickle
from diffbot import retrieve_from_url
import config
from diffbot import *

def load_model():
    """Return whatever ``get_from_pickle`` loads from ``"xboost.pickle"``."""
    return get_from_pickle("xboost.pickle")

def assess(url, model):
    """Classify the article at ``url`` with ``model``.

    Args:
        url: Address passed to ``retrieve_from_url`` together with the token
            from ``get_token()``.
        model: Object with a ``predict`` method that accepts a one-element
            list and returns an indexable result.

    Returns:
        ``'Not a valid article'`` when nothing truthy was retrieved; otherwise
        ``'Real'`` if the first prediction is truthy, else ``'Fake'``.
    """
    article = retrieve_from_url(url, get_token())
    if not article:
        return 'Not a valid article'

    is_real = model.predict([article])[0]
    return 'Real' if is_real else 'Fake'
   

if __name__ == "__main__":
    # Load the model once: it does not depend on the URL, so reloading it
    # on every prompt was loop-invariant work.
    model = load_model()
    while True:
        try:
            url = input("Enter URL: ")
        except (EOFError, KeyboardInterrupt):
            # End of input or an interrupt ends the session without a traceback.
            break
        result = assess(url, model)
        print(result)

Enter URL: https://abcnews.go.com/Politics/wireStory/bredesen-seeks-votes-women-tennessee-senate-race-57698079?cid=clicksource_77_2_hero_headlines_headlines_hed


  if diff:


Real
Enter URL: https://patriothole.clickhole.com/liberal-hypocrisy-when-obama-was-president-democrats-w-1828462533


  if diff:


Fake
Enter URL: https://www.cnn.com/2018/09/04/politics/bob-woodward-book-donald-trump-fear/index.html


  if diff:


Real
Enter URL: https://www.newyorker.com/humor/borowitz-report/white-man-hopes-to-land-job-without-background-check


  if diff:


Fake


https://abcnews.go.com/Politics/wireStory/bredesen-seeks-votes-women-tennessee-senate-race-57698079?cid=clicksource_77_2_hero_headlines_headlines_hed

https://patriothole.clickhole.com/liberal-hypocrisy-when-obama-was-president-democrats-w-1828462533

https://www.cnn.com/2018/09/04/politics/bob-woodward-book-donald-trump-fear/index.html

https://www.newyorker.com/humor/borowitz-report/white-man-hopes-to-land-job-without-background-check