In [2]:
import import_ipynb
from Univariate_Analysis import *

# 6. Stacking all the Features together

In [4]:
from scipy.sparse import hstack

In [5]:
# ---- One-hot encoding (sparse) -------------------------------------------
# Gene and variation blocks are joined first and the text block is appended
# afterwards, so the column order of every *_x_onehotCoding matrix is
# gene | variation | text. The final matrices are converted to CSR.
train_gene_var_onehotCoding = hstack([train_gene_feature_onehotCoding, train_variation_feature_onehotCoding])
train_x_onehotCoding = hstack([train_gene_var_onehotCoding, train_text_feature_onehotCoding]).tocsr()

test_gene_var_onehotCoding = hstack([test_gene_feature_onehotCoding, test_variation_feature_onehotCoding])
test_x_onehotCoding = hstack([test_gene_var_onehotCoding, test_text_feature_onehotCoding]).tocsr()

cv_gene_var_onehotCoding = hstack([cv_gene_feature_onehotCoding, cv_variation_feature_onehotCoding])
cv_x_onehotCoding = hstack([cv_gene_var_onehotCoding, cv_text_feature_onehotCoding]).tocsr()

# ---- Targets ---------------------------------------------------------------
# The 'Class' column of each split, materialised as a NumPy array.
train_y = np.array(list(x_train['Class']))
test_y = np.array(list(x_test['Class']))
cv_y = np.array(list(x_cv['Class']))

# ---- Response coding (dense) -----------------------------------------------
# Same gene | variation | text column order, built with NumPy's hstack.
train_gene_var_responseCoding = np.hstack([train_gene_feature_responseCoding, train_variation_feature_responseCoding])
train_x_responseCoding = np.hstack([train_gene_var_responseCoding, train_text_feature_responseCoding])

test_gene_var_responseCoding = np.hstack([test_gene_feature_responseCoding, test_variation_feature_responseCoding])
test_x_responseCoding = np.hstack([test_gene_var_responseCoding, test_text_feature_responseCoding])

cv_gene_var_responseCoding = np.hstack([cv_gene_feature_responseCoding, cv_variation_feature_responseCoding])
cv_x_responseCoding = np.hstack([cv_gene_var_responseCoding, cv_text_feature_responseCoding])

### 6.1 One Hot Encoding of Features

In [6]:
# Report (rows, columns) of each one-hot encoded split; the message literals
# are kept exactly as they were so the printed output does not change.
for _message, _split_matrix in (
    ("(number of data points * number of features) in train data = ", train_x_onehotCoding),
    ("(number of data points * number of features) in test data = ", test_x_onehotCoding),
    ("(number of data points * number of features) in cross validation data =", cv_x_onehotCoding),
):
    print(_message, _split_matrix.shape)

(number of data points * number of features) in train data =  (2124, 55019)
(number of data points * number of features) in test data =  (665, 55019)
(number of data points * number of features) in cross validation data = (532, 55019)


### 6.2 Response Encoding of Features

In [7]:
# Report (rows, columns) of each response-coded split; the message literals
# are kept exactly as they were so the printed output does not change.
for _message, _split_matrix in (
    ("(number of data points * number of features) in train data = ", train_x_responseCoding),
    ("(number of data points * number of features) in test data = ", test_x_responseCoding),
    ("(number of data points * number of features) in cross validation data =", cv_x_responseCoding),
):
    print(_message, _split_matrix.shape)

(number of data points * number of features) in train data =  (2124, 27)
(number of data points * number of features) in test data =  (665, 27)
(number of data points * number of features) in cross validation data = (532, 27)


One-hot encoding is far higher-dimensional than response encoding (55,019 vs. 27 features above). In response encoding, each of the three features (gene, variation, text) is 9-dimensional. The algorithms that GENERALLY work well with HIGH-dimensional data are Linear SVM, Logistic Regression, and Naive Bayes; the algorithms that generally work well with low-dimensional data are Decision Tree, KNN, and Random Forest.

# 7. Model Functions

In [8]:
def predict_and_plot_confusion_matrix(train_x, train_y,test_x, test_y, algo):
    """Fit `algo`, calibrate it, and report its performance on `test_x`/`test_y`.

    Prints the log loss of the calibrated probabilities and the fraction of
    mis-classified points, then draws the confusion matrix via
    `plot_confusion_matrix`.

    Parameters
    ----------
    train_x, train_y : training features and labels used to fit both `algo`
        and the sigmoid-calibrated wrapper around it.
    test_x, test_y : evaluation features and labels. `test_y` must expose
        `.shape[0]` (e.g. a NumPy array).
    algo : an unfitted scikit-learn style classifier; it is fitted in place.

    Returns
    -------
    None
    """
    # Kept from the original: leaves `algo` itself fitted after the call.
    algo.fit(train_x, train_y)
    sig_algo = CalibratedClassifierCV(algo, method="sigmoid")
    sig_algo.fit(train_x, train_y)

    pred_y = sig_algo.predict(test_x)
    pred_proba = sig_algo.predict_proba(test_x)

    # Pass the model's class list explicitly: without it log_loss infers the
    # labels from test_y and fails when the evaluation split happens to miss
    # a class that the probability matrix has a column for.
    print("Log loss :",log_loss(test_y, pred_proba, labels=sig_algo.classes_))

    # `!=` instead of subtraction so non-numeric class labels work as well.
    # The printed value is a ratio, so the label says "Fraction".
    misclassified = np.count_nonzero(pred_y != test_y)
    print("Fraction of mis-classified points :", misclassified/test_y.shape[0])
    plot_confusion_matrix(test_y, pred_y)

In [9]:
def report_log_loss(train_x, train_y, test_x, test_y,  algo):
    """Fit `algo`, calibrate it with a sigmoid, and return the log loss on the test split.

    Parameters
    ----------
    train_x, train_y : training features and labels used to fit both `algo`
        and the calibrated wrapper around it.
    test_x, test_y : evaluation features and labels.
    algo : an unfitted scikit-learn style classifier; it is fitted in place.

    Returns
    -------
    The value returned by `log_loss` for the calibrated probabilities.
    """
    # Kept from the original: leaves `algo` itself fitted after the call.
    algo.fit(train_x, train_y)
    sig_algo = CalibratedClassifierCV(algo, method="sigmoid")
    sig_algo.fit(train_x, train_y)
    sig_algo_probs = sig_algo.predict_proba(test_x)
    # `eps=1e-15` was dropped: it matched the historical default and the
    # argument is deprecated/removed in recent scikit-learn releases.
    # `labels` is supplied so the call does not fail when test_y lacks one of
    # the classes the model outputs a probability column for.
    return log_loss(test_y, sig_algo_probs, labels=sig_algo.classes_)

#### The function below is specific to the Naive Bayes algorithm. For the given feature indices, it prints the name of each feature and checks whether that feature is present in the test point's gene, variation, or text.

In [10]:
def get_impfeature_names(indices, text, gene, var, no_features):
    """Print which of the given one-hot feature indices occur in a query point.

    Three CountVectorizers are fitted on `x_train` (Gene, Variation, TEXT) to
    recover feature names. Indices are interpreted in stacked column order
    gene | variation | text.
    NOTE(review): this assumes the vectorizer settings here match the ones
    that produced the one-hot matrices — confirm against Univariate_Analysis.

    Parameters
    ----------
    indices : iterable of int column indices into the stacked one-hot matrix.
    text, gene, var : the query point's text, gene and variation strings.
    no_features : number shown in the summary line (not used otherwise).

    A feature counts as present when the corresponding vectorizer's own
    analyzer produces that token for the query string. CountVectorizer
    vocabularies are lowercased regex tokens, so comparing them to the raw
    strings would miss mixed-case or multi-word values.
    """
    def _vocabulary(vectorizer):
        # get_feature_names() is gone from recent scikit-learn; prefer
        # get_feature_names_out() when it exists, fall back otherwise.
        getter = getattr(vectorizer, "get_feature_names_out", None)
        if getter is None:
            getter = vectorizer.get_feature_names
        return list(getter())

    gene_count_vec = CountVectorizer()
    var_count_vec = CountVectorizer()
    text_count_vec = CountVectorizer(min_df=3)

    gene_count_vec.fit(x_train['Gene'])
    var_count_vec.fit(x_train['Variation'])
    text_count_vec.fit(x_train['TEXT'])

    # Build each name list once instead of on every loop iteration.
    gene_names = _vocabulary(gene_count_vec)
    var_names = _vocabulary(var_count_vec)
    text_names = _vocabulary(text_count_vec)

    fea1_len = len(gene_names)
    fea2_len = len(var_names)

    # Tokenise the query point once, exactly the way the features were built.
    gene_tokens = set(gene_count_vec.build_analyzer()(gene))
    var_tokens = set(var_count_vec.build_analyzer()(var))
    text_tokens = set(text_count_vec.build_analyzer()(text))

    word_present = 0
    for i, v in enumerate(indices):
        if v < fea1_len:
            word = gene_names[v]
            yes_no = word in gene_tokens
            if yes_no:
                word_present += 1
                print(i, "Gene feature [{}] present in test data point [{}]".format(word,yes_no))
        elif v < fea1_len + fea2_len:
            word = var_names[v - fea1_len]
            yes_no = word in var_tokens
            if yes_no:
                word_present += 1
                print(i, "variation feature [{}] present in test data point [{}]".format(word,yes_no))
        else:
            word = text_names[v - (fea1_len + fea2_len)]
            yes_no = word in text_tokens
            if yes_no:
                word_present += 1
                print(i, "Text feature [{}] present in test data point [{}]".format(word,yes_no))

    print("Out of the top ",no_features," features ", word_present, "are present in query point")