In [59]:
# import binascii
# import os

# PATH = 'dataset/'
# types = os.listdir(PATH)

# for fType in types:
#     files = os.listdir(PATH + fType + '/')
#     for file in files:
#         with open(PATH + fType + '/' + file, 'rb') as f:
#             content = f.read()
#             file_array = binascii.hexlify(content)
#             print(file, file_array[:10])
#     print('-----------------------')

In [60]:
# prepare the training dataset
import binascii
import os

import torch
import torchvision
import torchvision.transforms as transforms

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


import numpy as np
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook
ohe = OneHotEncoder()

PATH = 'dataset/'
SAVE_PATH = 'D:/Freelance big/Mohammed A/'
types = os.listdir(PATH)

CHARS = np.array(list(map(chr, list(np.arange(97, 122)))) + list(np.arange(0, 10)))
NUM_CHARS = len(CHARS)
LINE_LENGTH = 20
ohe.fit(CHARS.reshape(-1, 1))
BATCH_SIZE = 16

N_EPOCHS = 20

In [61]:
from time import time
def to_tensor(line, ohe, maxchars):
    '''
    One Hot Encodes the line of symbols and then transforms to torch tensor, to pass into the model
    :param line: Line of char symbols
    :param ohe: OneHotEncoder instance
    :param maxchars: Number of chars to take from the both sides of line
    :return: Torch Tensor of one hot encoded line
    '''
    
    return ohe.transform(line.reshape(-1, 1)).todense()

class Dataset(torch.utils.data.Dataset):
    '''
    Custom Dataset object for the CDiscount competition
        Parameters:
            root_dir - directory including category folders with images

        Example:
        images/
            1000001859/
                26_0.jpg
                26_1.jpg
                ...
            1000004141/
                ...
            ...
    '''
    
    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.categories = sorted(os.listdir(root_dir))
        self.cat2idx = dict(zip(self.categories, range(len(self.categories))))
        self.idx2cat = dict(zip(self.cat2idx.values(), self.cat2idx.keys()))
        self.files = []
        cat_mapping = {}
        for (dirpath, dirnames, filenames) in os.walk(self.root_dir):
            for f in filenames:
                if 0 == 0:
                    o = {}
                    o['f_path'] = dirpath + '/' + f
                    o['category'] = self.cat2idx[dirpath[dirpath.find('/')+1:]]
                    cat_mapping[o['category']] = dirpath.split('/')[-1]
                    self.files.append(o)
        print(f'category mapping: {cat_mapping}')
    
    def __len__(self):
        return len(self.files)
    
    def __getitem__(self, idx):
        f_path = self.files[idx]['f_path']
        category = self.files[idx]['category']
        
        with open(f_path, 'rb') as f:
            content = binascii.hexlify(f.read())
            content = list(content[:LINE_LENGTH].decode("utf-8")) + list(content[-LINE_LENGTH:].decode("utf-8"))
            file_array = np.array(content)
            nToReplace = np.random.randint(0, 10+1)
            
            indexes = np.random.choice(np.arange(LINE_LENGTH), nToReplace, replace = False)
            file_array[indexes] = np.random.choice(CHARS, nToReplace)
            
            indexes = np.random.choice(np.arange(len(file_array) - LINE_LENGTH, len(file_array)), nToReplace, replace = False)
            file_array[indexes] = np.random.choice(CHARS, nToReplace)
            data = np.array(to_tensor(file_array, ohe, LINE_LENGTH)).reshape(1, 2*LINE_LENGTH, NUM_CHARS)
        
        return {'data': data, 'category': category}


# create instance of data class and pytorch dataloader
dataSet = Dataset(PATH)
dataloader = torch.utils.data.DataLoader(dataSet, batch_size=BATCH_SIZE, shuffle=True)
MAPPING = {0: 'bmp', 1: 'flv', 2: 'jpg', 3: 'mp3', 4: 'mp4', 5: 'pdf', 6: 'png', 7: 'wav'}
N_CLASSES = len(MAPPING)

category mapping: {0: 'bmp', 1: 'flv', 2: 'jpg', 3: 'mp3', 4: 'mp4', 5: 'pdf', 6: 'png', 7: 'wav'}


In [62]:
import torch.nn.functional as F
import torch.nn  as nn

class Net(nn.Module):
    '''
    Represents the structure of pytorch CNN model
    '''
    
    def __init__(self, N_CLASSES):
        '''
        Describes all layers, contained in the model
        :param N_CLASSES: Number of output classes of the model
        '''
        
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 3, 2)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(3, 8, 2)

        self.conv3 = nn.Conv2d(8, 16, 5)
        self.conv4 = nn.Conv2d(16,32, 5)

        self.dropout = nn.Dropout(0.3)

        self.fc1 = nn.Linear(32 * 5 * 4, 128)
        self.bnorm1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.bnorm2 = nn.BatchNorm1d(64)

        self.fc3 = nn.Linear(64, N_CLASSES)

    def forward(self, x):
        '''
        Displays the connections between model layers for the forward pass
        :param x: input torch tensor
        :return: model prediction
        '''
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))

        x = F.relu(self.conv3(x))
        x = self.pool(F.relu(self.conv4(x)))
        x = x.view(-1, 32 * 5 * 4)
        x = self.dropout(x)
        x = F.relu(self.bnorm1(self.fc1(x)))
        x = F.relu(self.bnorm2(self.fc2(x)))
        x = self.fc3(x)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
net = Net(N_CLASSES).double()
net.to(device)

Net(
  (conv1): Conv2d(1, 3, kernel_size=(2, 2), stride=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(3, 8, kernel_size=(2, 2), stride=(1, 1))
  (conv3): Conv2d(8, 16, kernel_size=(5, 5), stride=(1, 1))
  (conv4): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
  (dropout): Dropout(p=0.3)
  (fc1): Linear(in_features=640, out_features=128, bias=True)
  (bnorm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (bnorm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=64, out_features=8, bias=True)
)

In [63]:
### Training loop for the model
criterion = nn.CrossEntropyLoss() # loss function
optimizer = optim.RMSprop(net.parameters(), lr=1e-3) # optimizer for the loss function

print('Started Training!')
net.train()
for epoch in range(N_EPOCHS):
    running_loss = 0.0
    examples = 0
    # for batch of the data perform forward pass and 
    # update the gradients 
    for i, data in enumerate(dataloader, 0):
        # Get the inputs
        inputs, labels = data['data'], data['category']
        
        # Wrap them in Variable
        inputs, labels = Variable(inputs), Variable(labels)
        
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        #print(outputs.shape)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.data
        examples += BATCH_SIZE
    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / examples))

print('Finished Training!')

Started Training!
[1,    14] loss: 0.125
[2,    14] loss: 0.111
[3,    14] loss: 0.079
[4,    14] loss: 0.062
[5,    14] loss: 0.055
[6,    14] loss: 0.051
[7,    14] loss: 0.045
[8,    14] loss: 0.036
[9,    14] loss: 0.035
[10,    14] loss: 0.034
[11,    14] loss: 0.026
[12,    14] loss: 0.026
[13,    14] loss: 0.024
[14,    14] loss: 0.021
[15,    14] loss: 0.023
[16,    14] loss: 0.024
[17,    14] loss: 0.018
[18,    14] loss: 0.016
[19,    14] loss: 0.019
[20,    14] loss: 0.018
Finished Training!


In [77]:
# Example prediction
net.eval()
path = 'dataset/bmp/'
for file in os.listdir(path):
    with open(path + file, 'rb') as f:
        content = binascii.hexlify(f.read())
        content = list(content[:LINE_LENGTH].decode("utf-8")) + list(content[-LINE_LENGTH:].decode("utf-8"))
        file_array = np.array(content)
        #file_array = binascii.hexlify(content)
        nToReplace = np.random.randint(0, 5+1)

        indexes = np.random.choice(np.arange(LINE_LENGTH), nToReplace, replace = False)
        file_array[indexes] = np.random.choice(CHARS, nToReplace)

        indexes = np.random.choice(np.arange(len(file_array) - LINE_LENGTH, len(file_array)), nToReplace, replace = False)
        file_array[indexes] = np.random.choice(CHARS, nToReplace)
        inp = torch.from_numpy(to_tensor(file_array, ohe, LINE_LENGTH)).reshape(1, 1, 2*LINE_LENGTH, NUM_CHARS).double().to(device)

        output = net(inp)
        print(MAPPING[np.argmax(output.cpu().detach().numpy())])

bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp
bmp


In [73]:
# Example prediction
net.eval()
path = 'TEST/'
for file in os.listdir(path):
    with open(path + file, 'rb') as f:
        content = binascii.hexlify(f.read())
        content = list(content[:LINE_LENGTH].decode("utf-8")) + list(content[-LINE_LENGTH:].decode("utf-8"))
        file_array = np.array(content)
        #file_array = binascii.hexlify(content)
        nToReplace = np.random.randint(0, 5+1)

        indexes = np.random.choice(np.arange(LINE_LENGTH), nToReplace, replace = False)
        file_array[indexes] = np.random.choice(CHARS, nToReplace)

        indexes = np.random.choice(np.arange(len(file_array) - LINE_LENGTH, len(file_array)), nToReplace, replace = False)
        file_array[indexes] = np.random.choice(CHARS, nToReplace)
        inp = torch.from_numpy(to_tensor(file_array, ohe, LINE_LENGTH)).reshape(1, 1, 2*LINE_LENGTH, NUM_CHARS).double().to(device)

        output = net(inp)
        print(file.split('.')[-1], MAPPING[np.argmax(output.cpu().detach().numpy())])

png png
jpg jpg
pdf pdf
bmp bmp
png png
jpg jpg
png png
wav wav
png png


In [68]:
# saving torch model
torch.save(net.state_dict(), SAVE_PATH + 'model.pt')