In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training file: no header row, so the two columns are
# named explicitly as the class label and the raw feature string.
table = pd.read_table(
    './1663769555_8559356_train.txt',
    names=['binding', 'sequence'],
)

In [72]:
from sklearn.feature_extraction.text import CountVectorizer
def return_binary_vectorized_form(data_frame):
    """Fit a binary bag-of-words vectorizer and return it with the dense matrix.

    Parameters
    ----------
    data_frame : iterable of str
        Documents to vectorize (despite the name, callers pass a single
        text column rather than a whole DataFrame).

    Returns
    -------
    tuple
        ``(vectorizer, matrix)`` where ``vectorizer`` is the fitted
        ``CountVectorizer`` and ``matrix`` is the dense 0/1 document-term
        array produced from ``data_frame``.
    """
    vectorizer = CountVectorizer(binary=True)
    # fit_transform yields a sparse matrix; callers expect a dense ndarray.
    sparse_matrix = vectorizer.fit_transform(data_frame)
    return vectorizer, sparse_matrix.toarray()

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
def return_binary_vectorized_form_with_vocab(data_frame, vocab):
    """Vectorize documents as binary indicators over a fixed vocabulary.

    Parameters
    ----------
    data_frame : iterable of str
        Documents to vectorize (callers pass a single text column).
    vocab : mapping or iterable of str
        Terms that define the output columns; passed straight through to
        ``CountVectorizer(vocabulary=...)``.

    Returns
    -------
    tuple
        ``(vectorizer, matrix)`` where ``vectorizer`` is the
        ``CountVectorizer`` restricted to ``vocab`` and ``matrix`` is the
        dense 0/1 document-term array for ``data_frame``.
    """
    vectorizer = CountVectorizer(binary=True, vocabulary=vocab)
    # fit_transform yields a sparse matrix; callers expect a dense ndarray.
    sparse_matrix = vectorizer.fit_transform(data_frame)
    return vectorizer, sparse_matrix.toarray()

In [74]:
from sklearn.feature_selection import SelectKBest, chi2
def select_k_best_features(feature_set, target_values, fraction, feature_list):
    """Return the names of the top-scoring features under a chi-squared test.

    Parameters
    ----------
    feature_set : array-like of shape (n_samples, n_features)
        Non-negative feature matrix (chi2 requires non-negative values).
    target_values : array-like of shape (n_samples,)
        Class labels aligned with the rows of ``feature_set``.
    fraction : float
        Share of the features to keep; ``k`` is
        ``int(len(feature_list) * fraction)``.
    feature_list : array-like of str
        Feature names, one per column of ``feature_set``. Lists are accepted
        as well as ndarrays.

    Returns
    -------
    numpy.ndarray
        The entries of ``feature_list`` selected by ``SelectKBest``.
    """
    # Boolean-mask indexing below needs an ndarray; a plain list would raise.
    feature_names = np.asarray(feature_list)
    selector = SelectKBest(score_func=chi2, k=int(len(feature_names) * fraction))
    selector.fit(feature_set, target_values)
    return feature_names[selector.get_support()]

In [75]:
def collect_with_target_value(data_frame, column, target_value):
    """Return the rows of ``data_frame`` whose ``column`` equals ``target_value``.

    The original row index is preserved; an empty frame is returned when no
    row matches.
    """
    matches = data_frame[column] == target_value
    return data_frame.loc[matches]

In [76]:
def resample(data_frame, fraction, random_state=None):
    """Draw a random sample (without replacement) of ``data_frame``'s rows.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Frame to sample from.
    fraction : float
        Fraction of rows to return, forwarded as ``frac``.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator forwarded to ``DataFrame.sample``. When ``None``
        (the default) a fresh, OS-seeded ``RandomState`` is used, so each
        call gives a different sample; pass a value for reproducible output.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.
    """
    if random_state is None:
        # A private generator keeps the default path from consuming NumPy's
        # global random state.
        random_state = np.random.RandomState()
    return data_frame.sample(frac=fraction, random_state=random_state)

In [77]:
# Split the training table by class label: rows with binding == 1 and
# rows with binding == 0.
active_df = collect_with_target_value(data_frame=table, column='binding', target_value=1)
not_active_dfs = collect_with_target_value(data_frame=table, column='binding', target_value=0)

In [78]:
# Renumber both class frames 0..n-1 so positional indices (e.g. from KFold)
# line up with the index labels. drop=True discards the old labels instead of
# inserting them as a column, which keeps this cell safe to re-run: without it
# a second run adds a `level_0` column and a third run raises.
not_active_dfs = not_active_dfs.reset_index(drop=True)
active_df = active_df.reset_index(drop=True)

In [91]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import  precision_recall_fscore_support
# Evaluate cost-complexity-pruned decision trees on 14 different training
# sets. Each training set combines one fold of the not-active rows with 70%
# of the active rows; for every candidate ccp_alpha a tree is fitted and
# scored with weighted precision / recall / F-beta (beta=1.2).
kf = KFold(n_splits=14)

# The active-class split is seeded (random_state=0), so it is identical on
# every fold; compute it once instead of once per iteration.
active_sequence_train, active_sequence_test, active_binding_train, active_binding_test = train_test_split(
    active_df['sequence'], active_df['binding'], shuffle=True, train_size=.70, random_state=0)

for train_index, test_index in kf.split(not_active_dfs.binding):
    # NOTE(review): the roles of KFold's folds are swapped here. The small
    # held-out fold (1/14 of the not-active rows) is used for TRAINING and
    # the remaining 13/14 for evaluation -- presumably to under-sample the
    # not-active class; confirm this is intended.
    # KFold yields positional indices, hence .iloc rather than .loc.
    not_active_sequence_train = not_active_dfs.sequence.iloc[test_index]
    not_active_sequence_test = not_active_dfs.sequence.iloc[train_index]
    not_active_binding_train = not_active_dfs.binding.iloc[test_index]
    not_active_binding_test = not_active_dfs.binding.iloc[train_index]

    combined_train_sequence = pd.concat([not_active_sequence_train, active_sequence_train])
    combined_train_binding = pd.concat([not_active_binding_train, active_binding_train])
    combined_test_sequence = pd.concat([not_active_sequence_test, active_sequence_test])
    combined_test_binding = pd.concat([not_active_binding_test, active_binding_test])

    # Learn the vocabulary on the training rows only, keep the top 25% of
    # terms by chi-squared score, then re-vectorize with that vocabulary.
    vectorizer, combined_features = return_binary_vectorized_form(combined_train_sequence)
    k_best_features = select_k_best_features(
        combined_features, combined_train_binding, 0.25, vectorizer.get_feature_names_out())
    final_vectorizer, final_train_data = return_binary_vectorized_form_with_vocab(
        combined_train_sequence, k_best_features)
    final_test_data = final_vectorizer.transform(combined_test_sequence)

    # Use the same seed as the trees fitted below so the pruning path
    # describes the tree that is actually pruned, and the run is repeatable.
    dt_clf = DecisionTreeClassifier(criterion='gini', random_state=0)
    path = dt_clf.cost_complexity_pruning_path(final_train_data, combined_train_binding)
    # The last alpha on the path prunes the tree down to its root; skip it.
    alphas = path.ccp_alphas[:-1]

    clfs = []
    for alpha in alphas:
        d_tree = DecisionTreeClassifier(criterion='gini', random_state=0, ccp_alpha=alpha)
        d_tree.fit(final_train_data, combined_train_binding)
        clfs.append(d_tree)

    # One (precision, recall, fbeta, support) tuple per candidate alpha.
    score = [
        precision_recall_fscore_support(
            combined_test_binding, tree.predict(final_test_data), average='weighted', beta=1.2)
        for tree in clfs
    ]
    print(score)

[(0.9586052589185029, 0.8429394812680115, 0.8782801879100972, None), (0.9586052589185029, 0.8429394812680115, 0.8782801879100972, None), (0.9584825786058355, 0.840057636887608, 0.8762265710279807, None), (0.9602755610789547, 0.8386167146974063, 0.8752093507587534, None), (0.9602755610789547, 0.8386167146974063, 0.8752093507587534, None), (0.9606366718767809, 0.8472622478386167, 0.8813882203116825, None), (0.9735812793299494, 0.968299711815562, 0.969996798594175, None)]
[(0.9583721551717911, 0.7838616714697406, 0.835530395299016, None), (0.9592581458928734, 0.8112391930835735, 0.8555078671119986, None), (0.9647398090282918, 0.9293948126801153, 0.9400283314648911, None), (0.9730091156015549, 0.9668587896253602, 0.9688122735853069, None)]
[(0.9578468492238785, 0.8242074927953891, 0.8648926877256986, None), (0.9597194656605814, 0.8242074927953891, 0.8648693632249133, None), (0.9597194656605814, 0.8242074927953891, 0.8648693632249133, None), (0.9596140771080207, 0.8213256484149856, 0.862793

array([0.        , 0.0184127 , 0.01865889, 0.02539683, 0.03512262,
       0.03778073, 0.05012531, 0.16371882])

In [10]:
# For each of the 20 balanced samples: vectorize its sequences, then keep the
# top 80% of terms by chi-squared score.
# NOTE(review): `balanced_dfs` is not defined in any visible cell -- confirm
# the cell that builds it still exists so the notebook survives Run All.
balanced_and_transformed = [
    return_binary_vectorized_form(balanced_dfs[i]['sequence']) for i in range(20)
]
# The helper is named `select_k_best_features`; `k_best_features` is not a
# function in this notebook (another cell binds that name to an array).
k_best_feature = [
    select_k_best_features(
        balanced_and_transformed[i][1],
        balanced_dfs[i]['binding'],
        .80,
        balanced_and_transformed[i][0].get_feature_names_out(),
    )
    for i in range(20)
]

In [11]:
from sklearn.model_selection import train_test_split
# Split every balanced sample 70/30 with a fixed seed. Each entry is the list
# returned by train_test_split:
# [sequence_train, sequence_test, binding_train, binding_test].
train_test = []
for sample_idx in range(20):
    balanced_sample = balanced_dfs[sample_idx]
    train_test.append(
        train_test_split(
            balanced_sample['sequence'],
            balanced_sample['binding'],
            train_size=.70,
            random_state=0,
        )
    )

In [12]:
# Vectorize each training split with its selected vocabulary, then apply the
# same fitted vectorizer to the matching held-out split.
# train_features[i] is (vectorizer, dense_train_matrix).
train_features = []
for sample_idx in range(20):
    train_sequences = train_test[sample_idx][0]
    train_features.append(
        return_binary_vectorized_form_with_vocab(train_sequences, k_best_feature[sample_idx])
    )

test_transformed = []
for sample_idx in range(20):
    fitted_vectorizer = train_features[sample_idx][0]
    test_transformed.append(fitted_vectorizer.transform(train_test[sample_idx][1]))

In [13]:
# y_predict = bag_clf.predict(x_test_features)
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# One decision tree per balanced sample. train_test[i] is
# [sequence_train, sequence_test, binding_train, binding_test] and
# train_features[i] is (vectorizer, dense_train_matrix).
bag_clf = [DecisionTreeClassifier(criterion='gini') for i in range(20)]
# The pruning path needs the training LABELS (index 2) as y; index 1 is the
# held-out sequences, which is neither labels nor the right length.
alpha_clf = [bag_clf[i].cost_complexity_pruning_path(train_features[i][1], train_test[i][2]) for i in range(20)]
# fit() returns the estimator itself, so fit_clf holds the same objects as bag_clf.
fit_clf = [bag_clf[i].fit(train_features[i][1], train_test[i][2]) for i in range(20)]
predict_test = [bag_clf[i].predict(test_transformed[i]) for i in range(20)]

In [14]:
from sklearn.metrics import accuracy_score
# Held-out accuracy of each of the 20 trees. The results go in a separate
# name so the imported `accuracy_score` function is not overwritten by a list.
accuracy_scores = [accuracy_score(train_test[i][3], predict_test[i]) for i in range(20)]
accuracy_scores

[0.6808510638297872,
 0.6808510638297872,
 0.6808510638297872,
 0.6595744680851063,
 0.7872340425531915,
 0.7872340425531915,
 0.574468085106383,
 0.7446808510638298,
 0.8085106382978723,
 0.6382978723404256,
 0.723404255319149,
 0.8297872340425532,
 0.6808510638297872,
 0.723404255319149,
 0.8085106382978723,
 0.7021276595744681,
 0.723404255319149,
 0.7021276595744681,
 0.7659574468085106,
 0.7446808510638298]

In [437]:
# Stack the per-model held-out predictions into a frame: one row per model.
predictions = pd.DataFrame(data=np.array(predict_test))

In [438]:
# Load the unlabelled test file: no header row, a single column of raw
# feature strings.
table_test = pd.read_table(
    './1663769555_8653905_test.txt',
    names=['sequence'],
)

In [439]:
# Encode the unlabelled sequences once per model, using that model's own
# fitted vectorizer (element 0 of each train_features entry).
unlabled_test_transformed = []
for model_idx in range(20):
    fitted_vectorizer = train_features[model_idx][0]
    unlabled_test_transformed.append(fitted_vectorizer.transform(table_test.sequence))

In [440]:
# Predict the unlabelled rows with each of the 20 trees, pairing every tree
# with the matrix produced by its own vectorizer.
unlabled_predict_test = []
for model_idx in range(20):
    tree = bag_clf[model_idx]
    unlabled_predict_test.append(tree.predict(unlabled_test_transformed[model_idx]))

In [441]:
# Stack the per-model prediction vectors into one 2-D array
# (one row per model).
unlabled_predict_test = np.array(unlabled_predict_test, copy=True)

In [442]:
# Frame view of the stacked predictions: rows are models, columns are the
# unlabelled test rows. The bare name on the last line displays the frame.
unlabled_prediction_df = pd.DataFrame(data=unlabled_predict_test)
unlabled_prediction_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,340,341,342,343,344,345,346,347,348,349
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
7,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [443]:
# Majority vote across models for every unlabelled test row (one column per
# row of the test file). `mode()` returns every most-frequent value, so two
# entries mean the vote is tied; ties are resolved in favour of label 1.
majority = []
for sample_id in unlabled_prediction_df.columns:
    modes = unlabled_prediction_df[sample_id].mode()
    majority.append(1 if len(modes) == 2 else modes[0])

In [444]:
# Spot check: show the voted label for the test row at position 25.
majority[25]

1

In [445]:
# Spot check: show the most frequent prediction(s) across models for column 53.
unlabled_prediction_df[53].mode()

0    1
Name: 53, dtype: int64

In [446]:
import csv
# Write the voted labels to results.csv, one label per row.
# `with` guarantees the file is closed even if writing fails, and newline=''
# is what the csv module requires so that no extra carriage returns are
# inserted on platforms that translate line endings.
with open('./results.csv', 'w', newline='') as outfile:
    csv.writer(outfile).writerows([label] for label in majority)