In [121]:
import pymongo
from pymongo import MongoClient

import numpy as np
from scipy import sparse

import getpass
import base64

import xgboost as xgb

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize, LabelEncoder
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, log_loss, f1_score
from sklearn.base import BaseEstimator

# connecting to the database

In [2]:
client = MongoClient('localhost:27017')
db = client.arXivDB
db.users.count()

7

In [48]:
def cleaner(doc, stem=False):
    '''Function to clean the text data and prep for further analysis'''
    doc = doc.lower() # turn text to lowercase

    stops = set(stopwords.words("english"))       # Creating a set of Stopwords
    p_stemmer = PorterStemmer()                   # Creating the stemmer model

    doc = re.sub(r"quantum", '', doc)           # removing the word quantum (duh)
    doc = re.sub(r"physics", '', doc)           # removing the word physics (duh)
    doc = re.sub(r"state", '', doc)           # removing the word state (duh)
    doc = re.sub(r'\$.*?\$', 'latexinlineformula', doc) # replacing latex inline formula
    doc = re.sub(r'\\n', ' ', doc) # removing new line character
    doc = re.sub(r'\\\\\"', '', doc)             # removing german double dotted letters
    doc = re.sub(r"</?\w+[^>]*>", '', doc)      # removing html tags
    doc = re.sub("[^a-zA-Z]", ' ', doc)    # removing anythin other alpha-numerical char's and @ and !

    doc = doc.split()                          # Splits the data into individual words 
    doc = [w for w in doc if not w in stops and len(w) > 3]   # Removes stopwords and short length words
    if stem:
        doc = [p_stemmer.stem(i) for i in doc]     # Stemming (reducing words to their root)
    if not len(doc):                            # dealing with comments that are all emojis, stop words or other languages
        doc = ['emptystring']
    # print('text cleaning done!')
    return ' '.join(doc)

In [51]:
class feature_stacker(BaseEstimator):
    """Stacks several transformer objects to yield concatenated features.
    Similar to pipeline, a list of tuples ``(name, estimator)`` is passed
    to the constructor.
    """
    def __init__(self, transformer_list):
        self.transformer_list = transformer_list

    def get_feature_names(self):
        feature_names = []
        for name, trans in self.transformer_list:
            feature_names.extend(trans.get_feature_names())
        feature_names = [" ".join(w) if isinstance(w, tuple) else w
                            for w in feature_names]
        return np.array(feature_names)

    def fit(self, X, y=None):
        for name, trans in self.transformer_list:
            trans.fit(X, y)
        return self

    def transform(self, X):
        features = []
        for name, trans in self.transformer_list:
            features.append(trans.transform(X))
        issparse = [sparse.issparse(f) for f in features]
        if np.any(issparse):
            features = sparse.hstack(features).tocsr()
        else:
            features = np.hstack(features)
        return features

    def get_params(self, deep=True):
        if not deep:
            return super(feature_stacker, self).get_params(deep=False)
        else:
            out = dict(self.transformer_list)
            for name, trans in self.transformer_list:
                for key, value in trans.get_params(deep=True).items():
                    out['%s__%s' % (name, key)] = value
            return out

In [44]:
vectorizer_word = TfidfVectorizer(lowercase=False,
                                 analyzer=u'word',
                                 ngram_range=(1, 3),
                                 stop_words='english',
                                 binary=False,
                                 norm=u'l2', 
                                 use_idf=True, 
                                 smooth_idf=True, 
                                 sublinear_tf=True,
                                 min_df=3)

In [46]:
vectorizer_char = TfidfVectorizer(lowercase=False,
                                 analyzer=u'char',
                                 ngram_range=(1, 5),
                                 stop_words='english',
                                 binary=False,
                                 norm=u'l2', 
                                 use_idf=True, 
                                 smooth_idf=True, 
                                 sublinear_tf=True)

In [52]:
ft = feature_stacker([("chars", vectorizer_char),
                      ("words", vectorizer_word)])

In [56]:
select = SelectPercentile(score_func=chi2, percentile=1)

## logging in

In [209]:
while True:
    username = input('username: ').lower()
    user = list(db.users.find({'username': username}))
    if not user:
        print('the username doesnt exist, try again')
        user = None
    else:
        pin = base64.b64encode(bytes(str(getpass.getpass('pin: ')), encoding="UTF-8"))
        if not user[0]['pin']==pin:
            print('pin is incorrect, try again')
            user = None
        else:
            break

username: amir
pin: ········


In [210]:
query_results = list(db.likes.find({'user_id':user[0]['_id']}, {'paper_id':1, '_id':0, 'like':1}))
mypaper_ids = [d['paper_id'] for d in query_results]
mylikes = [d['like'] for d in query_results]

In [211]:
documents = [cleaner(' '.join([d['title'], d['summary']])) for d in db.arXivfeeds.find(
                    {'_id': {'$in': mypaper_ids}}, {'_id':0, 'title':1, 'summary':1}
                )]

In [212]:
le = LabelEncoder()
le.fit(mylikes)
Y = le.transform(mylikes) 

In [213]:
ft.fit(documents)
X = ft.transform(documents)
select.fit(X, Y)
X = select.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=.80)

In [214]:
##################
#     XGBoost
##################

dtrain = xgb.DMatrix(X_train, y_train)
dvalid = xgb.DMatrix(X_test, y_test)

params = {
    "objective": "binary:logistic",
    "booster": "gbtree",
    "max_depth": 6,
    "eval_metric": "logloss",
    "eta": 0.1,
    "silent": 1,
    "alpha": 3,
}
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
gbm = xgb.train(params, dtrain, 40, evals=watchlist, verbose_eval=True)

[0]	train-logloss:0.623006	eval-logloss:0.627719
[1]	train-logloss:0.565238	eval-logloss:0.57061
[2]	train-logloss:0.517152	eval-logloss:0.524949
[3]	train-logloss:0.476384	eval-logloss:0.487714
[4]	train-logloss:0.441729	eval-logloss:0.45434
[5]	train-logloss:0.412093	eval-logloss:0.426585
[6]	train-logloss:0.38653	eval-logloss:0.404615
[7]	train-logloss:0.364114	eval-logloss:0.385751
[8]	train-logloss:0.344606	eval-logloss:0.367945
[9]	train-logloss:0.327854	eval-logloss:0.353738
[10]	train-logloss:0.312915	eval-logloss:0.342274
[11]	train-logloss:0.300109	eval-logloss:0.330948
[12]	train-logloss:0.288673	eval-logloss:0.322787
[13]	train-logloss:0.278726	eval-logloss:0.31566
[14]	train-logloss:0.270138	eval-logloss:0.309224
[15]	train-logloss:0.262358	eval-logloss:0.304291
[16]	train-logloss:0.255279	eval-logloss:0.300601
[17]	train-logloss:0.249374	eval-logloss:0.296279
[18]	train-logloss:0.244292	eval-logloss:0.29346
[19]	train-logloss:0.23957	eval-logloss:0.291392
[20]	train-loglo

In [220]:
pred = gbm.predict(xgb.DMatrix(X_test))
print('confusion matrix:')
print(confusion_matrix(y_test, pred>0.2))

confusion matrix:


ValueError: Found input variables with inconsistent numbers of samples: [122, 10]

In [229]:
num_papers_toshow = 10
test_documents = ['___'.join([d['title'], d['summary']]) for d in db.arXivfeeds.aggregate([
            {"$sample": {'size': num_papers_toshow}}
        ])]

In [230]:
X_test = ft.transform([cleaner(d) for d in test_documents])
X_test = select.transform(X_test)
pred = gbm.predict(xgb.DMatrix(X_test))

In [231]:
test_documents = [x for (y,x) in sorted(zip(pred, test_documents))]
pred = np.sort(pred)

In [232]:
for j in range(len(pred)):
    print(pred[j])
    print(test_documents[j].split('___')[0])

0.0603464
Distinguishability of apparatus states in quantum measurement in the
  Stern-Gerlach experiment
0.0603464
Optimal control of time-dependent targets
0.0603464
Peak Doubling in SPDC Coincidence Spectra with a Short-Pulse Pump
0.0603464
The Born rule from a consistency requirement on hidden measurements in
  complex Hilbert space
0.0603464
Tomographic approach to the violation of Bell's inequalities for quantum
  states of two qutrits
0.0622891
Radiative Corrections to Multi-Level Mollow-Type Spectra
0.0686694
Dimensional Crossover in Bragg Scattering from an Optical Lattice
0.104405
Optimal arbitrarily accurate composite pulse sequences
0.159957
Optimal two-qubit gate for generation of random bipartite entanglement
0.232279
Entanglement and nonclassicality in four-mode Gaussian states generated
  via parametric down-conversion and frequency up-conversion
