In [1]:
import pandas as pd
import numpy as np

#### Read Data

In [2]:
df = pd.read_csv('D:\FirstSemMT\ML\Assignments\leukemia.tab',sep='\t')
df.shape

(75, 5148)

In [3]:
df.head()

Unnamed: 0,gene,AFFX-BioC-5_at,hum_alu_at,AFFX-DapX-M_at,AFFX-LysX-5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMRGE/M10098_5_at,AFFX-HUMRGE/M10098_M_at,...,M93143_at,U29175_at,U48730_at,U58516_at,X06956_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,discrete,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,...,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous
1,class,,,,,,,,,,...,,,,,,,,,,
2,ALL,88,15091,311,21,-13,215,797,14538,9738,...,384,1582,185,511,389,793,329,36,191,-37
3,ALL,283,11038,134,-21,-219,116,433,615,115,...,231,624,169,837,442,782,295,11,76,-14
4,ALL,309,16692,378,67,104,476,1474,5669,3272,...,720,753,315,1199,168,1138,777,41,228,-41


In [4]:
df.drop(df.index[df['gene'] == 'discrete'], inplace = True)
df.drop(df.index[df['gene'] == 'class'], inplace = True)
df.drop(df.index[72], inplace = True)
label = df['gene'] 
df.drop(['gene'], axis = 1, inplace = True)
df = df.values
samples,features = df.shape
print(samples,features)

72 5147


In [5]:
data = df.astype(np.float)
data.shape

(72, 5147)

#### Import train_test_split function and Split dataset into training set and test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2,random_state=100) 

#### Find T score for every feature 

In [7]:
c1 = np.where(y_train=='ALL')[0]
c2 = np.where(y_train=='AML')[0]
m1 = X_train[c1,:].mean(axis=0)
m2 = X_train[c2,:].mean(axis=0)
v1 = X_train[c1,:].var(axis=0)
v2 = X_train[c2,:].var(axis=0)
tscore = np.abs(m1-m2)/np.sqrt(v1/c1.shape[0] + v2/c2.shape[0])

#### Sort indexes in descending order of T score

In [8]:
sort_idxs=(-tscore).argsort()[:tscore.shape[0]]
sort_idxs

array([1726, 3150, 4968, ..., 4943, 1238, 2197], dtype=int64)

#### Select top features (less than 20%)

In [9]:
filt_idxs = sort_idxs[0:int((data.shape[1])*0.02)]

In [10]:
X_new_train = X_train[:,filt_idxs]
X_new_test = X_test[:,filt_idxs]
X_new_train.shape

(57, 102)

In [11]:
!pip install mlxtend --user



In [12]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics

### Forward Feature Selection with KNN Classifier

In [13]:
sfs = SFS(KNeighborsClassifier(n_neighbors=5),
         k_features = 70,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X_new_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:    4.5s finished

[2020-10-19 01:26:47] Features: 1/70 -- score: 0.9642857142857143[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:    0.7s finished

[2020-10-19 01:26:47] Features: 2/70 -- score: 0.9642857142857144[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.7s finished

[2020-10-19 01:26:48] Features: 3/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:    0.7s finished

[2020-10-19 01:26:49] Featu

[Parallel(n_jobs=-1)]: Done  60 out of  67 | elapsed:    0.5s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  67 out of  67 | elapsed:    0.6s finished

[2020-10-19 01:27:14] Features: 36/70 -- score: 1.0[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 out of  66 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  66 out of  66 | elapsed:    0.7s finished

[2020-10-19 01:27:15] Features: 37/70 -- score: 1.0[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 out of  65 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  65 out of  65 | elapsed:    0.9s finished

[2020-10-19 01:27:16] Features: 38/70 -- score: 1.0[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  64 out of  64 | elapsed:    0.5s finished

[2020-10-19 01:27:16] Features: 39/70 -- score: 1.0[Parallel(n_jobs=-1)]: Using backend 

In [14]:
X_train_sfs = sfs.transform(X_new_train)
X_test_sfs = sfs.transform(X_new_test)

In [15]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_sfs, y_train)
y_pred = knn.predict(X_test_sfs)

In [16]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.9333333333333333
f1 score: 0.9282296650717703
confusion matrix: [[9 1]
 [0 5]]


### Forward Feature Selection with SVM Classifier

In [17]:
sfs1 = SFS(svm.SVC(kernel='linear'),
         k_features = 70,
          forward= True,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X_new_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:  4.8min finished

[2020-10-19 01:33:39] Features: 1/70 -- score: 0.9642857142857143[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  40 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:  5.3min finished

[2020-10-19 01:38:55] Features: 2/70 -- score: 0.9642857142857143[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  41 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed: 15.1min finished

[2020-10-19 01:54:02] Features: 3/70 -- score: 0.9642857142857143[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:    0.4s finished

[2020-10-19 01:54:02] Featu

[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:    0.4s finished

[2020-10-19 01:54:23] Features: 33/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  62 out of  69 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  69 out of  69 | elapsed:    0.4s finished

[2020-10-19 01:54:23] Features: 34/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  68 out of  68 | elapsed:    0.3s finished

[2020-10-19 01:54:23] Features: 35/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  67 out of  67 | elapsed:    0.3s finished

[2020-10-19 01:54:24] Features: 36/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  59 out of  66 | elapsed:    0.4s r

[Parallel(n_jobs=-1)]: Done  33 out of  33 | elapsed:    0.3s finished

[2020-10-19 01:54:46] Features: 70/70 -- score: 0.9821428571428572

In [18]:
X_train_sfs1 = sfs1.transform(X_new_train)
X_test_sfs1 = sfs1.transform(X_new_test)

In [19]:
clf = svm.SVC(kernel='linear')
clf.fit(X_train_sfs1, y_train)
y_pred = clf.predict(X_test_sfs1)

In [20]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.9333333333333333
f1 score: 0.9282296650717703
confusion matrix: [[9 1]
 [0 5]]


### Backward Feature Selection with KNN Classifier

In [21]:
sbs = SFS(KNeighborsClassifier(n_neighbors=5),
         k_features = 70,
          forward= False,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X_new_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:    0.7s finished

[2020-10-19 01:55:42] Features: 101/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:    0.8s finished

[2020-10-19 01:55:43] Features: 100/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished

[2020-10-19 01:55:44] Features: 99/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  83 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done  99 

In [22]:
X_train_sbs = sbs.transform(X_new_train)
X_test_sbs = sbs.transform(X_new_test)

In [23]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_sbs, y_train)
y_pred = knn.predict(X_test_sbs)

In [24]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.9333333333333333
f1 score: 0.9206349206349207
confusion matrix: [[10  0]
 [ 1  4]]


### Backward Feature Selection with SVM Classifier

In [25]:
sbs1 = SFS(svm.SVC(kernel='linear'),
         k_features = 70,
          forward= False,
          floating = False,
          verbose= 2,
          scoring= 'accuracy',
          cv = 4,
          n_jobs= -1
         ).fit(X_new_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  86 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 102 out of 102 | elapsed:    0.5s finished

[2020-10-19 01:56:54] Features: 101/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  81 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 101 out of 101 | elapsed:    0.5s finished

[2020-10-19 01:56:54] Features: 100/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  93 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.8s finished

[2020-10-19 01:56:55] Features: 99/70 -- score: 0.9821428571428572[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  99 out of  99 | elapsed:    0.7s finished

[2020-10-19 01:56:56] 

In [26]:
X_train_sbs1 = sbs1.transform(X_new_train)
X_test_sbs1 = sbs1.transform(X_new_test)

In [27]:
clf = svm.SVC(kernel='linear')
clf.fit(X_train_sbs1, y_train)
y_pred = clf.predict(X_test_sbs1)

In [28]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8
f1 score: 0.7963800904977376
confusion matrix: [[7 3]
 [0 5]]
