In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [58]:
# Load the training data. Column names are supplied explicitly, so the first
# line of the file is read as data rather than as a header.
table = pd.read_table(
    './1663769555_8559356_train.txt',
    names=['binding', 'sequence'],
)

In [107]:
from sklearn.feature_extraction.text import CountVectorizer
def return_binary_vectorized_form(data_frame, column=None):
    """Fit a binary bag-of-words vectorizer and encode the given documents.

    Parameters
    ----------
    data_frame : iterable of str, or pandas.DataFrame
        The raw text documents. When ``column`` is given, ``data_frame`` is
        treated as a table and the documents are read from
        ``data_frame[column]``.
    column : str, optional
        Name of the text column to vectorize. Supply it whenever
        ``data_frame`` is a DataFrame: iterating a DataFrame yields its
        column labels, not its rows, so the labels would be vectorized
        instead of the text.

    Returns
    -------
    tuple
        ``(count_vectorizer, matrix)``: the fitted ``CountVectorizer`` and
        the dense document-term array produced by ``fit_transform``.
    """
    documents = data_frame if column is None else data_frame[column]
    # binary=True records whether a token occurs in a document, not how often.
    count_vectorizer = CountVectorizer(binary=True)
    fitted_and_transformed = count_vectorizer.fit_transform(documents).toarray()
    return count_vectorizer, fitted_and_transformed

In [127]:
from sklearn.feature_extraction.text import CountVectorizer
def return_binary_vectorized_form_with_vocab(data_frame, vocab):
    """Encode documents as binary indicators over a fixed vocabulary.

    Parameters
    ----------
    data_frame : iterable of str
        The raw text documents to encode.
    vocab : mapping or iterable of str
        The vocabulary handed to ``CountVectorizer``; it determines the
        columns of the resulting matrix.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)``: the fitted ``CountVectorizer`` and the
        dense document-term array.
    """
    vectorizer = CountVectorizer(vocabulary=vocab, binary=True)
    encoded = vectorizer.fit_transform(data_frame)
    return vectorizer, encoded.toarray()

In [60]:
from sklearn.feature_selection import SelectKBest, chi2
def k_best_features(feature_set, target_values, fraction, feature_list):
    """Return the names of the best-scoring features under the chi-squared test.

    Parameters
    ----------
    feature_set : array-like of shape (n_samples, n_features)
        Feature matrix passed to ``SelectKBest.fit``.
    target_values : array-like of shape (n_samples,)
        Target labels passed to ``SelectKBest.fit``.
    fraction : float
        Share of the features to keep; ``k`` is
        ``int(len(feature_list) * fraction)``.
    feature_list : array-like of str
        Feature names, one per column of ``feature_set``. Lists and tuples
        are accepted and converted to an array.

    Returns
    -------
    array-like
        The entries of ``feature_list`` selected by the fitted selector.
    """
    # Boolean-mask indexing is not supported by plain Python sequences,
    # so convert those to an ndarray first; other inputs are used as given.
    if isinstance(feature_list, (list, tuple)):
        feature_list = np.asarray(feature_list)
    n_selected = int(len(feature_list) * fraction)
    selector = SelectKBest(score_func=chi2, k=n_selected)
    selector.fit(feature_set, target_values)
    mask = selector.get_support()
    return feature_list[mask]

In [61]:
def collect_with_target_value(data_frame, column, target_value):
    """Return the rows of ``data_frame`` whose ``column`` equals ``target_value``.

    The original index labels of the matching rows are kept.
    """
    matches = data_frame[column] == target_value
    return data_frame.loc[matches]

In [62]:
def resample(data_frame, fraction, random_state=None):
    """Draw a random subset of rows from ``data_frame``.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        The table to sample rows from.
    fraction : float
        Fraction of rows to return, passed to ``DataFrame.sample`` as ``frac``.
    random_state : int, numpy.random.RandomState or numpy.random.Generator, optional
        Seed or generator for the draw. Pass a fixed value to make the
        sample reproducible. When omitted, a freshly seeded ``RandomState``
        is used, so every call yields a different sample.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    if random_state is None:
        # Fresh, OS-seeded generator: deliberately non-reproducible default.
        random_state = np.random.RandomState()
    return data_frame.sample(frac=fraction, random_state=random_state)

In [99]:
# Split the table by the value of 'binding', and keep only a random 15% of
# the rows whose 'binding' is 0.
active_df = collect_with_target_value(data_frame=table, column='binding', target_value=1)
not_actievev_df = resample(
    data_frame=collect_with_target_value(data_frame=table, column='binding', target_value=0),
    fraction=.15,
)

In [101]:
# Stack the two subsets, then shuffle the rows and renumber the index.
# NOTE(review): the shuffle is unseeded, so row order differs on every run.
balanced_df = pd.concat([active_df, not_actievev_df])
balanced_df = balanced_df.sample(frac=1).reset_index(drop=True)
# Pass the 'sequence' column as the documents to vectorize; handing over the
# whole DataFrame would iterate its column labels instead of the text.
balanced_vectorizer, samples_balanced = return_binary_vectorized_form(balanced_df['sequence'])
# Keep the top 25% of vocabulary terms ranked by chi-squared against 'binding'.
# NOTE(review): the selection sees every row of balanced_df, including rows
# that a later train/test split assigns to the test set - confirm whether
# that leakage is acceptable.
k_best_feature = k_best_features(
    samples_balanced,
    balanced_df['binding'],
    .25,
    balanced_vectorizer.get_feature_names_out(),
)

In [118]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    balanced_df['sequence'],
    balanced_df['binding'],
    train_size=.70,
    random_state=0,
)

In [129]:
# Encode the training text over the selected vocabulary, then reuse the same
# fitted vectorizer for the test text so both matrices share their columns.
vectorizer, x_train_features = return_binary_vectorized_form_with_vocab(
    data_frame=x_train,
    vocab=k_best_feature,
)
x_test_features = vectorizer.transform(x_test).toarray()
x_test_features

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [134]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Bagging ensemble of 1000 decision trees; max_samples=95 draws 95 training
# samples for each tree, and random_state fixes the resampling.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=1000,
    max_samples=95,
    random_state=42,
    n_jobs=2,
)
bag_clf.fit(x_train_features, np.array(y_train))

In [135]:
# Predict labels for the held-out test features with the fitted ensemble.
y_predict = bag_clf.predict(x_test_features)

In [136]:
from sklearn.metrics import accuracy_score
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_predict)

0.8571428571428571