In [1]:
import pandas as pd
import numpy as np

#### Read Data

In [2]:
df = pd.read_csv('D:\FirstSemMT\ML\Assignments\leukemia.tab',sep='\t')
df.shape

(75, 5148)

In [3]:
df.head()

Unnamed: 0,gene,AFFX-BioC-5_at,hum_alu_at,AFFX-DapX-M_at,AFFX-LysX-5_at,AFFX-HUMISGF3A/M97935_MA_at,AFFX-HUMISGF3A/M97935_MB_at,AFFX-HUMISGF3A/M97935_3_at,AFFX-HUMRGE/M10098_5_at,AFFX-HUMRGE/M10098_M_at,...,M93143_at,U29175_at,U48730_at,U58516_at,X06956_at,X83863_at,Z17240_at,L49218_f_at,M71243_f_at,Z78285_f_at
0,discrete,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,...,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous
1,class,,,,,,,,,,...,,,,,,,,,,
2,ALL,88,15091,311,21,-13,215,797,14538,9738,...,384,1582,185,511,389,793,329,36,191,-37
3,ALL,283,11038,134,-21,-219,116,433,615,115,...,231,624,169,837,442,782,295,11,76,-14
4,ALL,309,16692,378,67,104,476,1474,5669,3272,...,720,753,315,1199,168,1138,777,41,228,-41


In [4]:
df.drop(df.index[df['gene'] == 'discrete'], inplace = True)
df.drop(df.index[df['gene'] == 'class'], inplace = True)
df.drop(df.index[72], inplace = True)
label = df['gene'] 
df.drop(['gene'], axis = 1, inplace = True)
df = df.values
samples,features = df.shape
print(samples,features)

72 5147


In [5]:
data = df.astype(np.float)
data.shape

(72, 5147)

#### Import train_test_split function and Split dataset into training set and test set

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2,random_state=100) 

#### Find f value for every feature 

In [7]:
total_class = 2

#degree of freedom
dof_between = total_class-1
dof_within = data.shape[0]-total_class
dof_total = dof_between+dof_within

Dof_critical = 3.994  #for alpha=0.05

In [8]:
c1 = np.where(y_train=='ALL')[0]
c2 = np.where(y_train=='AML')[0]

#to calculate no. of samples in both the class
c_dash1 = np.where(label=='ALL')[0]
c_dash2 = np.where(label=='AML')[0]

#mean of each class
m1 = X_train[c1,:].mean(axis=0)
m2 = X_train[c2,:].mean(axis=0)
m_grand = X_train[:,:].mean(axis=0)

#variance of each class
v1 = X_train[c1,:].var(axis=0)
v2 = X_train[c2,:].var(axis=0)

ss_total = np.sum((X_train[:,:] - m_grand)**2, axis=0)
print(ss_total.shape)

ss_within = (c_dash1.shape[0]-1)*v1 + (c_dash2.shape[0]-1)*v2
ss_between = ((c_dash1.shape[0])*(m1-m_grand)**2) + ((c_dash2.shape[0])*(m2-m_grand)**2)
print(ss_between.shape)

ms_between = ss_between/dof_between
ms_within = ss_within/dof_within

#f-stat
f = ms_between/ms_within
print(f)
print(f[f>Dof_critical].shape)

(5147,)
(5147,)
[7.42940943 3.59145593 0.96914148 ... 2.73153409 7.96942712 0.54250657]
(1983,)


#### Sort indexes in descending order of f value

In [9]:
sort_idxs=(-f).argsort()[:f.shape[0]]
sort_idxs

array([3544, 1269, 1374, ..., 4943, 2197, 1238], dtype=int64)

#### Select top features (less than 20%)

In [10]:
filt_idxs = sort_idxs[0:int((data.shape[1])*0.2)]
filt_idxs

array([3544, 1269, 1374, ...,  471, 4130, 1366], dtype=int64)

In [11]:
X_new_train = X_train[:,filt_idxs]
X_new_test = X_test[:,filt_idxs]
X_new_train.shape

(57, 1029)

#### KNN Classification

In [12]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_new_train, y_train)
y_pred = knn.predict(X_new_test)

#### Accuracy, F score and Confusion matrix using KNN

In [13]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8666666666666667
f1 score: 0.8295454545454545
confusion matrix: [[10  0]
 [ 2  3]]


#### SNM Classification

In [14]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X_new_train, y_train)
y_pred = clf.predict(X_new_test)

#### Accuracy, F score and Confusion matrix using SVM

In [15]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 1.0
f1 score: 1.0
confusion matrix: [[10  0]
 [ 0  5]]
