In [10]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import pymongo
import config

In [11]:
# get data
# Credentials are passed to MongoClient because Database.authenticate() was
# removed in PyMongo 4. authSource is set to the working database, which is
# the database the old db.authenticate() call authenticated against.
conn = pymongo.MongoClient(
    config.DB_HOST,
    config.DB_PORT,
    username=config.DB_USER,
    password=config.DB_PASS,
    authSource=config.DB_NAME,
)
db = conn[config.DB_NAME]

# Balance the classes: fetch at most as many fake articles as there are real ones.
# $match must run before $limit; with the stages the other way round the pipeline
# takes the first N documents of the collection and only then filters for fake
# ones, which returns an arbitrary (possibly zero) number of fake articles.
n_real = db.docs.count_documents({"truth": True})
fake_news = db.docs.aggregate([{"$match": {"truth": False}}, {"$limit": n_real}])
real_news = db.docs.find({"truth": True})

In [12]:
# Pull every document out of the two query results into in-memory lists.
real_news = [doc for doc in real_news]
fake_news = [doc for doc in fake_news]

In [13]:
# Merge the fake articles into the list of real ones. extend() keeps the list
# flat (every element is a document dict); append() would add the whole fake
# list as one nested element.
real_news.extend(fake_news)

In [14]:
# `news` is bound to the same list object as `real_news`; no copy is made.
news = real_news

In [15]:
# Flatten `news` first: an element is either a single document dict or an
# iterable of document dicts.
documents = []
for entry in news:
    if isinstance(entry, dict):
        documents.append(entry)
    else:
        documents.extend(entry)

# Features are (title, text) pairs; the label is the document's `truth` flag.
X = [(doc['title'], doc['text']) for doc in documents]
y = [doc['truth'] for doc in documents]

In [16]:
# Drop every sample whose title is missing or empty, keeping X and y aligned.
# Filtered copies are built instead of deleting from X while iterating over it,
# because removing during iteration skips the element that follows each removal.
# Each sample is paired with its own label via zip() rather than looked up with
# X.index(), which returns the position of the first equal tuple.
titled = [(sample, label) for sample, label in zip(X, y) if sample[0]]
X = [sample for sample, _ in titled]
y = [label for _, label in titled]

In [17]:
# Hold out 20% of the samples for testing; random_state is fixed at 0.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
from ml_dev import preprocess as pre

# NOTE(review): the recorded output of this cell is
# "AttributeError: module 'ml_dev.preprocess' has no attribute 'tfidf_preprocess'";
# confirm the current name of the preprocessing function in ml_dev.preprocess.
def _preprocess_pair(pair):
    """Run pre.tfidf_preprocess over both fields of a (title, text) pair."""
    return (pre.tfidf_preprocess(pair[0]), pre.tfidf_preprocess(pair[1]))


# Apply the same preprocessing to the training and the test samples.
X_train = [_preprocess_pair(pair) for pair in X_train]
X_test = [_preprocess_pair(pair) for pair in X_test]

AttributeError: module 'ml_dev.preprocess' has no attribute 'tfidf_preprocess'

In [None]:
import numpy as np

# Only the article body (second field of each sample) is fed to the model.
train_texts = [sample[1] for sample in X_train]
test_texts = [sample[1] for sample in X_test]

# Bag-of-words counts -> TF-IDF weighting -> random forest.
pipe = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(random_state=0)),
])
pipe.fit(train_texts, y_train)
predicted = pipe.predict(test_texts)

# Fraction of test samples whose predicted label equals the true label.
np.mean(predicted == y_test)

In [None]:
# cross validate
# c = []
# for x in X:
#     c.append(preprocess.tfidf_preprocess(x[1]))
# X = c
import numpy as np

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import SGDClassifier
skf = StratifiedKFold(n_splits=10)

# CountVectorizer needs plain strings. X holds (title, text) tuples unless the
# commented-out preprocessing at the top of this cell has replaced it with
# plain text, so take the text field when a sample is still a tuple.
texts = [sample[1] if isinstance(sample, tuple) else sample for sample in X]

vals = []
for train_index, test_index in skf.split(texts, y):
    # Fold-local names: the hold-out X_train/X_test/y_train/y_test produced by
    # train_test_split are left untouched.
    fold_X_train = [texts[i] for i in train_index]
    fold_y_train = [y[i] for i in train_index]
    fold_X_test = [texts[i] for i in test_index]
    fold_y_test = [y[i] for i in test_index]

    # Note: the final step is named 'sgd' but holds an ExtraTreesClassifier.
    # random_state is fixed so the fold scores are repeatable between runs.
    pipe = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('sgd', ExtraTreesClassifier(n_estimators=100, random_state=0))
    ])
    pipe.fit(fold_X_train, fold_y_train)
    predicted = pipe.predict(fold_X_test)

    # Accuracy on this fold, computed once and reused for printing and storing.
    score = np.mean(predicted == fold_y_test)
    print('Score: {}'.format(score))
    vals.append(score)

# Median accuracy across the folds.
print(np.median(vals))