In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pymongo
import config

In [21]:
# get data
conn = pymongo.MongoClient(config.DB_HOST, config.DB_PORT)
db = conn[config.DB_NAME]
db.authenticate(config.DB_USER, config.DB_PASS)

fake_news = db.docs.aggregate([{"$limit": db.docs.count_documents({"truth": True})}, {"$match": {"truth": False}}])
real_news = db.docs.find({"truth": True})

In [22]:
real_news = list(real_news)
fake_news = list(fake_news)

In [23]:
real_news.append(fake_news)

In [24]:
news = real_news

In [25]:
X, y = [], []
for x in news:
    if isinstance(x, dict):
        X.append((x['title'], x['text']))
        y.append(x['truth'])    
    else:
        for x2 in x:
            X.append((x2['title'], x2['text']))
            y.append(x2['truth'])

In [26]:
for x in X:
    if not x[0]:
        del y[X.index(x)]
        X.remove(x)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [28]:
import preprocess as pre

c = []
for x in X_train:
    c.append((pre.tfidf_preprocess(x[0]), pre.tfidf_preprocess(x[1])))
X_train = c

c = []
for x in X_test:
    c.append((pre.tfidf_preprocess(x[0]), pre.tfidf_preprocess(x[1])))
X_test = c

In [29]:
pipe = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', RandomForestClassifier(random_state=0))])
pipe.fit(list(x[1] for x in X_train), y_train)
predicted = pipe.predict(list(x[1] for x in X_test))

import numpy as np
np.mean(predicted == y_test)

0.8063583815028902

In [31]:
# cross validate
c = []
for x in X:
    import preprocess
    c.append(preprocess.tfidf_preprocess(x[1]))
X = c

import numpy as np

from sklearn.model_selection import StratifiedKFold
# from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier

skf = StratifiedKFold(n_splits=40)

skf.get_n_splits(X, y)

vals = []
for train_index, test_index in skf.split(X, y):
#     print('{} {}'.format(train_index, test_index))
    X_train = []
    y_train = []
    
    X_test = []
    y_test = []
    
    for ind in train_index:
        X_train.append(X[ind])
        y_train.append(y[ind])
    
    for ind in test_index:
        X_test.append(X[ind])
        y_test.append(y[ind])

    
    pipe = Pipeline([
        ('vect', CountVectorizer()), 
        ('tfidf', TfidfTransformer()), 
        ('sgd', ExtraTreesClassifier(n_estimators=100))
    ])
    pipe.fit(X_train, y_train)
    predicted = pipe.predict(X_test)
    
    print('Score: {}'.format(np.mean(predicted == y_test)))
    vals.append(np.mean(predicted == y_test))
    
print(np.mean(vals))

Score: 0.8181818181818182
Score: 0.8181818181818182
Score: 0.7045454545454546
Score: 0.8636363636363636
Score: 0.7954545454545454
Score: 0.7727272727272727
Score: 0.9772727272727273
Score: 0.7727272727272727
Score: 0.9318181818181818
Score: 0.9545454545454546
Score: 0.8409090909090909
Score: 0.7727272727272727
Score: 0.9090909090909091
Score: 1.0
Score: 0.9318181818181818
Score: 0.9318181818181818
Score: 0.9302325581395349
Score: 0.9302325581395349
Score: 0.9302325581395349
Score: 0.9302325581395349
Score: 0.6976744186046512
Score: 0.7906976744186046
Score: 0.6976744186046512
Score: 0.6744186046511628
Score: 0.7674418604651163
Score: 0.7906976744186046
Score: 0.6976744186046512
Score: 0.813953488372093
Score: 0.9069767441860465
Score: 0.8604651162790697
Score: 0.6976744186046512
Score: 0.6904761904761905
Score: 0.8095238095238095
Score: 0.7857142857142857
Score: 0.7857142857142857
Score: 0.7380952380952381
Score: 0.8571428571428571
Score: 0.7142857142857143
Score: 0.7142857142857143
Sc

In [None]:
# xgboost test
import numpy as np
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier

skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)

vals = []
for train_index, test_index in skf.split(X, y):
#     print('{} {}'.format(train_index, test_index))
    X_train = []
    y_train = []
    
    X_test = []
    y_test = []
    
    for ind in train_index:
        X_train.append(X[ind])
        y_train.append(y[ind])
    
    for ind in test_index:
        X_test.append(X[ind])
        y_test.append(y[ind])

    
    pipe = Pipeline([
        ('vect', CountVectorizer()), 
        ('tfidf', TfidfTransformer()), 
        ('sgd', XGBClassifier(n_estimators=100))
    ])
    pipe.fit(X_train, y_train)
    predicted = pipe.predict(X_test)
    
    print('Score: {}'.format(np.mean(predicted == y_test)))
    vals.append(np.mean(predicted == y_test))
    
print(np.mean(vals))