# Cancer Detection

In [1]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import numpy as np

# Data loading

In [32]:
df=pd.read_csv('trainset.csv')


In [3]:
validation=pd.read_csv('testset.csv')

# Data cleaning

In [33]:
todrop=df.isna().sum()

##### Number of features with NaN values

In [43]:
td=todrop.sort_values(ascending=False)
len(td[td.gt(0)])

580

#### Number of features with only NaN values

In [48]:
len(td[td==len(df)])

90

Request info on why ABC, ABCGG,nAcid,nBase are 0

In [5]:
for col in df.columns:
    #print(df[col].unique())
    if df[col].isna().sum()>500:
        df=df.drop(col,axis=1)

In [6]:
df=df.dropna()

In [7]:
df.shape

(7140, 1428)

In [9]:
y=df.cls.values
#x=df.drop('cls',axis=1).drop('ABC',axis=1).drop('ABCGG',axis=1).drop('n5FAHRing',axis=1).values
x=df.drop('cls',axis=1).values

# Class definitions

In [8]:
import torch
import torch.nn as nn

### Dataset class to create dataloaders which work well with pytorch 

In [10]:
from torch.utils.data import Dataset, DataLoader
class dtset(Dataset):
    
    # Init
    def __init__(self,data,labels):
        self.data=torch.FloatTensor(data)
        self.labels=torch.FloatTensor(labels)
    
    # len
    def __len__(self):
        return len(self.data)
    
    # obtain item
    def __getitem__(self,idx):
        return self.data[idx],self.labels[idx]

### The autoencoder to reduce dimensionality of the data further

In [11]:
# Autoencoder to reduce dimensionality
class Autoencoder(nn.Module):
    
    def __init__(self, input_size, encoding_dim):
        
        super(Autoencoder, self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Linear(input_size,int(input_size/2)),
            nn.ReLU(),
            nn.Linear(int(input_size/2),int(input_size/4)),
            nn.ReLU(),
            nn.Linear(int(input_size/4),int(input_size/8)),
            nn.ReLU(),
            nn.Linear(int(input_size/8),encoding_dim)
        )
        
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim,int(input_size/8)),
            nn.ReLU(),
            nn.Linear(int(input_size/8),int(input_size/4)),
            nn.ReLU(),
            nn.Linear(int(input_size/4),int(input_size/2)),
            nn.ReLU(),
            nn.Linear(int(input_size/2),input_size)
        )

    def forward(self, x):
        
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        
        return encoded, decoded

### The CNN which will be our required model

In [12]:
class cnn(nn.Module):
    
    def __init__(self,input_size,num_classes=2,drop_per=0.03):
        
        super(cnn,self).__init__()
        
        # 1st convolution
        self.conv1d1 = nn.Conv1d(in_channels=input_size,out_channels=200,kernel_size=3)
        self.relu1 = nn.ReLU()
        self.maxpool1 = nn.MaxPool1d(kernel_size=3)
        self.dropout1 = nn.Dropout(p=drop_per)
        
        # 2nd convolution 
        self.conv1d2 = nn.Conv1d(in_channels=200,out_channels=150,kernel_size=3)
        self.relu2 = nn.ReLU()
        self.maxpool2 = nn.MaxPool1d(kernel_size=3)
        self.dropout2 = nn.Dropout(p=drop_per)
        
        # 3rd convolution
        self.conv1d3 = nn.Conv1d(in_channels=150,out_channels=100,kernel_size=3)
        self.relu3 = nn.ReLU()
        self.maxpool3 = nn.MaxPool1d(kernel_size=3)
        self.dropout3 = nn.Dropout(p=drop_per)
        
        # 4th convolution
        self.conv1d4 = nn.Conv1d(in_channels=100,out_channels=50,kernel_size=3)
        self.relu4 = nn.ReLU()
        self.maxpool4 = nn.MaxPool1d(kernel_size=3)
        self.dropout4 = nn.Dropout(p=drop_per)
        
        # 5th convolution
        self.conv1d5 = nn.Conv1d(in_channels=50,out_channels=1,kernel_size=3)
        self.sigmoid1 = nn.Sigmoid()
    
    def forward(self, x):

        # 1st convolution
        x = self.conv1d1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        x = self.dropout1(x)

        # 2nd convolution
        x = self.conv1d2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        x = self.dropout2(x)

        # 3rd convolution
        x = self.conv1d3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)
        x = self.dropout3(x)

        # 4th convolution
        x = self.conv1d4(x)
        x = self.relu4(x)
        x = self.maxpool4(x)
        x = self.dropout4(x)

        # 5th convolution
        x = self.conv1d5(x)
        x = self.sigmoid1(x)

        # Flatten the output
        x = x.view(x.size(0), -1)

        return x


In [13]:
size=x.shape[1]

In [14]:
size

1427

# Data preprocessing

In [15]:
from sklearn.preprocessing import StandardScaler

scaler=StandardScaler()
x_norm=scaler.fit_transform(x)
y_norm=scaler.fit_transform(y.reshape(-1,1))#.reshape(1,-1)

In [16]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x_norm,y_norm,test_size=0.2,random_state=69)

In [17]:
batch_size=64

# create dataset objects
train=dtset(x_train,y_train)
test=dtset(x_test,y_test)

# create loaders
train_loader=DataLoader(train,batch_size=batch_size,shuffle=True)
test_loader=DataLoader(test,batch_size=batch_size)

# Getting shapes

In [18]:
x_train.shape,y_train.shape

((5712, 1427), (5712, 1))

In [20]:
x_test.shape,y_test.shape

((1428, 1427), (1428, 1))

In [19]:
#data=pd.DataFrame([i for i in train_loader])


In [21]:
import torch.optim as optim

encoding_dims=batch_size
# Init autoencoder
ae=Autoencoder(input_size=size,encoding_dim=encoding_dims)

# Init autoencoder loss func MSE
ae_loss_func=nn.MSELoss()

# Init autencoder optimzer adam
ae_optimizer=optim.Adam(ae.parameters(),lr=0.001 , eps=1e-6)

In [22]:
# Init model
model=cnn(input_size=encoding_dims, num_classes=2, drop_per=0.03)

# Init loss func binary-crossentropy
loss_func=nn.BCELoss()

# Init optimizer adam
optimizer=optim.Adam(model.parameters(),lr=0.001, eps = 1e-6)

# Train Autoencoder

In [23]:
ae_epochs=20

# Run for epochs
for epoch in range(ae_epochs):
    # Run for items in loader
    for data in train_loader:
        inputs,_=data # ignore labels
        ae_optimizer.zero_grad()
        encoded,decoded=ae(inputs)
        ae_loss=ae_loss_func(decoded,inputs)
        ae_loss.backward()
        ae_optimizer.step()
    print(f'Epoch [{epoch + 1}/{ae_epochs}], Loss: {ae_loss.item():.4f}')

  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


Epoch [1/20], Loss: 0.5608
Epoch [2/20], Loss: 0.3719
Epoch [3/20], Loss: 0.3899
Epoch [4/20], Loss: 0.3637
Epoch [5/20], Loss: 0.4014
Epoch [6/20], Loss: 0.4159
Epoch [7/20], Loss: 0.3194
Epoch [8/20], Loss: 0.3345
Epoch [9/20], Loss: 0.2679
Epoch [10/20], Loss: 0.2372
Epoch [11/20], Loss: 0.1847
Epoch [12/20], Loss: 0.2850
Epoch [13/20], Loss: 0.1880
Epoch [14/20], Loss: 0.2483
Epoch [15/20], Loss: 0.1983
Epoch [16/20], Loss: 0.1329
Epoch [17/20], Loss: 0.1451
Epoch [18/20], Loss: 0.1952
Epoch [19/20], Loss: 0.1359
Epoch [20/20], Loss: 0.2124


# Train CNN

In [24]:

epochs=10

# Run for epochs
for epoch in range(epochs):
    # Run for items in loader
    for inputs,labels in train_loader:
        optimizer.zero_grad()
        outputs=model(inputs)
        loss=loss_func(outputs,labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')
       

ValueError: Using a target size (torch.Size([64, 1])) that is different to the input size (torch.Size([1, 14])) is deprecated. Please ensure they have the same size.

# Save

In [None]:
torch.save(ae.state_dict(),'models/autoencoder.pth')
torch.save(model.state_dict(),'models/model.pth')

# Load

In [None]:
ae.load_state_dict(torch.load('models/autoencoder.pth'))
model.load_state_dict(torch.load('models/model.pth'))