In [27]:
import pandas as pd
import numpy as np

#### Read Data

In [28]:
df = pd.read_csv('D:\FirstSemMT\ML\Assignments\lung.tab',sep='\t', low_memory=False)
df.shape

(205, 12601)

In [29]:
df.head()

Unnamed: 0,class,38691_s_at,37864_s_at,33273_f_at,33274_f_at,33501_r_at,33500_i_at,33499_s_at,41164_at,38194_s_at,...,41848_f_at,32086_at,33886_at,31781_at,AFFX-BioC-3_at,41422_at,39964_at,36120_at,40571_at,36312_at
0,discrete,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,...,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous,continuous
1,class,,,,,,,,,,...,,,,,,,,,,
2,AD,63.2,4196.25,3306.35,3330.86,1609.47,1597.32,1233.89,255.14,3036.53,...,-17.79,18.63,51.04,-13.74,-29.12,-28.31,20.25,28.35,4.06,-19.41
3,AD,965.47,6207.61,7077.04,6968.59,6569.86,6419.19,6908.34,4785.76,4562.19,...,-5.74,5.94,28.23,-4.68,-13.18,-13.18,21.86,9.12,11.24,8.06
4,AD,2940.51,6858.12,6927.79,6495.99,5273.47,4672.48,5474.67,2140.99,5120.39,...,-17.225,4.725,17.28,-6.59,-17.97,-16.07,10.195,17.285,6.92,-11.09


In [30]:
df.drop(df.index[df[df.columns[0]] == 'discrete'], inplace = True)
df.drop(df.index[df[df.columns[0]] == 'class'], inplace = True)
label = df[df.columns[0]] 
df.drop([df.columns[0]], axis = 1, inplace = True)
df = df.values
samples,features = df.shape
print(samples,features)

203 12600


In [31]:
data = df.astype(np.float)
data.shape

(203, 12600)

#### Import train_test_split function and Split dataset into training set and test set

In [32]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2,random_state=100) 

#### Find T score for every feature 

In [33]:
#classes
c1 = np.where(y_train=='AD ')[0]
c2 = np.where(y_train=='NL ')[0]
c3 = np.where(y_train=='SMCL ')[0]
c4 = np.where(y_train=='SQ ')[0]
c5 = np.where(y_train=='COID ')[0]

#mean of every class 
m1 = X_train[c1,:].mean(axis=0)
m2 = X_train[c2,:].mean(axis=0)
m3 = X_train[c3,:].mean(axis=0)
m4 = X_train[c4,:].mean(axis=0)
m5 = X_train[c5,:].mean(axis=0)
m = X_train.mean(axis=0)

#variance of every class 
v1 = X_train[c1,:].var(axis=0)*(c1.shape[0]-1)
v2 = X_train[c2,:].var(axis=0)*(c2.shape[0]-1)
v3 = X_train[c3,:].var(axis=0)*(c3.shape[0]-1)
v4 = X_train[c4,:].var(axis=0)*(c4.shape[0]-1)
v5 = X_train[c5,:].var(axis=0)*(c5.shape[0]-1)
s = np.sqrt((v1+v2+v3+v4+v5)/(samples-5))

t1 = np.abs(m1-m)/(np.sqrt((1/c1.shape[0])+(1/samples))*s)
t2 = np.abs(m2-m)/(np.sqrt((1/c2.shape[0])+(1/samples))*s)
t3 = np.abs(m3-m)/(np.sqrt((1/c3.shape[0])+(1/samples))*s)
t4 = np.abs(m4-m)/(np.sqrt((1/c4.shape[0])+(1/samples))*s)
t5 = np.abs(m5-m)/(np.sqrt((1/c5.shape[0])+(1/samples))*s)

tscore = np.maximum.reduce([t1,t2,t3,t4,t5])
tscore

array([9.83132247, 8.25870955, 8.79932101, ..., 5.10867731, 2.98815196,
       2.58307433])

#### Sort indexes in descending order of T score

In [34]:
sort_idxs=(-tscore).argsort()[:tscore.shape[0]]
sort_idxs

array([ 771,  307, 1354, ..., 5840, 6196,  330], dtype=int64)

#### Select top features (less than 20%)

In [35]:
filt_idxs = sort_idxs[0:int((data.shape[1])*0.2)]
filt_idxs

array([ 771,  307, 1354, ..., 4335, 4253, 3405], dtype=int64)

In [36]:
X_new_train = X_train[:,filt_idxs]
X_new_test = X_test[:,filt_idxs]
X_new_train.shape

(162, 2520)

#### KNN Classification

In [37]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_new_train, y_train)
y_pred = knn.predict(X_new_test)

#### Accuracy, F score and Confusion matrix using KNN

In [38]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8048780487804879
f1 score: 0.6933333333333334
confusion matrix: [[26  0  0  0  0]
 [ 0  3  0  0  0]
 [ 0  0  1  0  0]
 [ 4  0  0  0  0]
 [ 4  0  0  0  3]]


#### SNM Classification

In [39]:
from sklearn import svm
clf = svm.SVC(kernel='linear')
clf.fit(X_new_train, y_train)
y_pred = clf.predict(X_new_test)

#### Accuracy, F score and Confusion matrix using SVM

In [40]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("f1 score:",metrics.f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:",metrics.confusion_matrix(y_test, y_pred))

Accuracy: 0.8048780487804879
f1 score: 0.7142857142857143
confusion matrix: [[24  0  0  0  2]
 [ 0  3  0  0  0]
 [ 0  0  1  0  0]
 [ 4  0  0  0  0]
 [ 2  0  0  0  5]]
