In [25]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
from sklearn.metrics import precision_score
import os
import gzip
import random

In [26]:
#Returns a string of length bytes long
def dataloader(filepath, length):
    f = gzip.GzipFile(fileobj=open(filepath, 'rb'))
    data = f.read(length)
    return data.decode("utf-8")

In [84]:
#Splits the data into num_chunks
def chunk(in_string,num_chunks):
    chunk_size = len(in_string)//num_chunks
    if len(in_string) % num_chunks: chunk_size += 1
    iterator = iter(in_string)
    for _ in range(num_chunks):
        accumulator = list()
        for _ in range(chunk_size):
            try: accumulator.append(next(iterator))
            except StopIteration: break
        yield ''.join(accumulator)

In [131]:
def FileClassifier(Dataset, ngram_range, max_features, param, num_round):
    
    X = Dataset.iloc[:,0]
    y = Dataset.iloc[:,1]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
    char_vectorizer = TfidfVectorizer(analyzer = 'char',
        ngram_range = ngram_range, max_features = max_features)
    char_vectorizer.fit(X)
    train_chars = char_vectorizer.transform(X_train)
    test_chars = char_vectorizer.transform(X_test)
    
    dtrain = xgb.DMatrix(train_chars, label=y_train)
    dtest = xgb.DMatrix(test_chars, label=y_test)
    
    model = xgb.train(param, dtrain, num_round)
    preds = model.predict(dtest)
    best_preds = np.asarray([np.argmax(line) for line in preds])
    print(precision_score(y_test, best_preds, average='macro'))
    print(np.asarray(y_test).reshape(1,-1))
    
    return(best_preds)

In [113]:
data = dataloader('/Users/bf/Desktop/BNL2020/BioClassifierFiles/GCA_902387845.1_UHGG_MGYG-HGUT-02512_genomic.fna.gz', 40000)
data1 = dataloader('/Users/bf/Desktop/BNL2020/BioClassifierFiles/SRR9259133.fastq.gz', 40000)
data2 = dataloader('/Users/bf/Desktop/BNL2020/BioClassifierFiles/GCA_002097535.1_ASM209753v1_genomic.gff.gz', 40000)
data3 = dataloader('/Users/bf/Desktop/BNL2020/BioClassifierFiles/GCA_003568845.1_ASM356884v1_genomic.gbff.gz', 40000)

In [114]:
#Split the 40000 byte data into 1000 byte pieces
FastDat = list(chunk(data,40)) + list(chunk(data1, 40)) + list(chunk(data2, 40)) + list(chunk(data3, 40))
data = {'FastDat': FastDat,
        'Type': np.concatenate((np.repeat(0,40),np.repeat(1,40), np.repeat(2,40), np.repeat(3,40)))
    }
df = pd.DataFrame(data, columns = ['FastDat', 'Type'])


In [122]:
param = {
    'max_depth': 4,  # the maximum depth of each tree
    'eta': 0.2,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 4}  # the number of classes that exist in this datset
num_round = 40 
ngram_range = (2,4)
max_features = 10000
Dataset = df
max_len = 20

In [132]:
FileClassifier(Dataset, ngram_range, max_features, param, num_round)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1.0
[[2 2 3 1 2 0 2 1 2 3 0 2 0 1 0 0 3 3 2 0 1 0 0 0 0 3 2 1 3 2 3 1]]


array([2, 2, 3, 1, 2, 0, 2, 1, 2, 3, 0, 2, 0, 1, 0, 0, 3, 3, 2, 0, 1, 0,
       0, 0, 0, 3, 2, 1, 3, 2, 3, 1])