Rerun LDA
============

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import os
import sys
basedir = '../'
sys.path.append(basedir)

import numpy as np
import pandas as pd
from IPython.display import display
from lda_for_fragments import Ms2Lda

In [2]:
# --- LDA settings: everything you need to specify before calling run_lda ---

# Number of topics; cross-validation pointed to 300 - 400.
n_topics = 300

# Number of samples to draw. About 100 is probably okay for testing;
# for manuscript results use at least 500-1000.
n_samples = 1000

# Burn-in length. With 0, only the last sample is used.
n_burn = 0

# After burn-in, every n_thin-th sample is used for averaging.
# Ignored when n_burn = 0.
n_thin = 1

# Hyper-parameter for the document-topic distributions.
alpha = 50.0 / n_topics

# Hyper-parameter for the topic-word distributions.
beta = 0.1

In [3]:
def run_lda(fragment_filename, neutral_loss_filename, ms1_filename, ms2_filename, outfile):
    """Fit LDA on one dataset, annotate its peaks and save the project.

    The sampler settings are NOT parameters: they are read from the
    notebook-level globals ``n_topics``, ``n_samples``, ``n_burn``,
    ``n_thin``, ``alpha`` and ``beta``, which must be defined before
    this function is called.

    Args:
        fragment_filename: path to the fragments CSV.
        neutral_loss_filename: path to the neutral-losses CSV.
        ms1_filename: path to the MS1 peak CSV.
        ms2_filename: path to the MS2 peak CSV.
        outfile: path the resulting project is saved to.

    Returns:
        None. The fitted and annotated model is only written to ``outfile``.
    """
    # The third positional argument is passed as None, exactly as before.
    # NOTE(review): its meaning is not visible from this notebook -- confirm
    # against Ms2Lda.lcms_data_from_R.
    ms2lda = Ms2Lda.lcms_data_from_R(fragment_filename, neutral_loss_filename, None,
                                     ms1_filename, ms2_filename)
    ms2lda.run_lda_gibbs(n_topics, n_samples, n_burn, n_thin, alpha, beta)

    # The same per-element limits apply to all three annotation passes.
    # Each call receives its own copy so the passes stay independent even if
    # annotate_peaks were to modify the dict it is given.
    max_occurrences = {'N':6, 'S': 2, 'P': 2, 'C13':1, 'F':0, 'Cl':0}

    # Pass 1: annotate the MS1 peaks with a single ppm tolerance.
    ms1_tol = 3
    ms2lda.annotate_peaks(mode='pos', target='ms1', ppm=ms1_tol, max_mass=400,
                          rule_8_max_occurrences=dict(max_occurrences), n_stages=2)

    # Pass 2: annotate the elemental formulae of MS2 fragments.
    # NOTE(review): the list form of ppm is presumably (mass limit, ppm)
    # pairs -- confirm against annotate_peaks.
    fragment_tol = [(70, 10), (200, 5)]
    ms2lda.annotate_peaks(mode='pos', target='ms2_fragment', ppm=fragment_tol, max_mass=200,
                          rule_8_max_occurrences=dict(max_occurrences), n_stages=1)

    # Pass 3: we can also annotate the neutral losses. Note mode='none' here,
    # unlike the two passes above.
    loss_tol = [(40, 20), (200, 10)]
    ms2lda.annotate_peaks(mode='none', target='ms2_loss', ppm=loss_tol, max_mass=200,
                          n_stages=1, rule_8_max_occurrences=dict(max_occurrences))

    # leave the message parameter out if nothing to say
    ms2lda.save_project(outfile, message="Re-running LDA and storing the Z")

In [None]:
# Beer1pos: input CSVs live under basedir; the project file is written to results/.
outfile = 'results/beer1pos_rerun.project'
ms1_filename = basedir + 'input/final/Beer1pos_MS1filter_Method3_ms1.csv'
ms2_filename = basedir + 'input/final/Beer1pos_MS1filter_Method3_ms2.csv'
fragment_filename = basedir + 'input/final/Beer1pos_MS1filter_Method3_fragments.csv'
neutral_loss_filename = basedir + 'input/final/Beer1pos_MS1filter_Method3_losses.csv'

run_lda(fragment_filename=fragment_filename,
        neutral_loss_filename=neutral_loss_filename,
        ms1_filename=ms1_filename,
        ms2_filename=ms2_filename,
        outfile=outfile)

Loading input files
Data shape (1282, 4237)
Fitting model with collapsed Gibbs sampling
CGS LDA initialising
.................................................................................................................................
Using Numba for LDA sampling
Numba not found. Using Numpy for LDA sampling
Sample 1 ..............................................................................................

In [4]:
# Beer2pos: input CSVs live under basedir; the project file is written to results/.
outfile = 'results/beer2pos_rerun.project'
ms1_filename = basedir + 'input/final/Beer2pos_MS1filter_Method3_ms1.csv'
ms2_filename = basedir + 'input/final/Beer2pos_MS1filter_Method3_ms2.csv'
fragment_filename = basedir + 'input/final/Beer2pos_MS1filter_Method3_fragments.csv'
neutral_loss_filename = basedir + 'input/final/Beer2pos_MS1filter_Method3_losses.csv'

run_lda(fragment_filename=fragment_filename,
        neutral_loss_filename=neutral_loss_filename,
        ms1_filename=ms1_filename,
        ms2_filename=ms2_filename,
        outfile=outfile)

Loading input files
Data shape (1567, 4975)
Fitting model...
CGS LDA initialising
.............................................................................................................................................................
Using Numba for LDA sampling
Preparing words
Preparing Z matrix
DONE
Sample 1   Log likelihood = -9293032.808 
Sample 2   Log likelihood = -6963183.503 
Sample 3   Log likelihood = -5270941.393 
Sample 4   Log likelihood = -4550358.901 
Sample 5   Log likelihood = -4258307.411 
Sample 6   Log likelihood = -4105973.140 
Sample 7   Log likelihood = -4012507.512 
Sample 8   Log likelihood = -3944620.313 
Sample 9   Log likelihood = -3896376.724 
Sample 10   Log likelihood = -3854850.358 
Sample 11   Log likelihood = -3823519.029 
Sample 12   Log likelihood = -3800308.676 
Sample 13   Log likelihood = -3778291.734 
Sample 14   Log likelihood = -3759695.574 
Sample 15   Log likelihood = -3740227.455 
Sample 16   Log likelihood = -3724696.731 
Sample 17   



In [None]:
# Beer3pos: input CSVs live under basedir; the project file is written to results/.
outfile = 'results/beer3pos_rerun.project'
ms1_filename = basedir + 'input/final/Beer3pos_MS1filter_Method3_ms1.csv'
ms2_filename = basedir + 'input/final/Beer3pos_MS1filter_Method3_ms2.csv'
fragment_filename = basedir + 'input/final/Beer3pos_MS1filter_Method3_fragments.csv'
neutral_loss_filename = basedir + 'input/final/Beer3pos_MS1filter_Method3_losses.csv'

run_lda(fragment_filename=fragment_filename,
        neutral_loss_filename=neutral_loss_filename,
        ms1_filename=ms1_filename,
        ms2_filename=ms2_filename,
        outfile=outfile)