<a href="https://colab.research.google.com/github/ajj8866/facebook_mkt/blob/main/fb_train_gdrivev2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files, drive
from pathlib import Path
import os

# Mount Google Drive; the project folder facebook_mkt lives inside it.
drive.mount('/content/drive/')
print('Current working directory after mounting: ',os.getcwd())
print('Home directory after mounting: ', Path.home())
# Anchor the project path on the mount point rather than on the current working
# directory, so that re-running this cell (when cwd is already the project folder)
# does not try to enter a non-existent nested 'drive/MyDrive/facebook_mkt'.
os.chdir(Path('/content/drive', 'MyDrive', 'facebook_mkt'))
print('Absolute path to facebook_mkt directory: ', os.path.abspath(Path.cwd()))



Mounted at /content/drive/
Current working directory after mounting:  /content
Home directory after mounting:  /root
Absolute path to facebook_mkt directory:  /content/drive/MyDrive/facebook_mkt


In [2]:
!cp 'images.zip'
!unzip -q images.zip
!rm images.zip

cp: missing destination file operand after 'images.zip'
Try 'cp --help' for more information.
replace __MACOSX/._images? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [3]:
!pip install torchbearer
!pip install XlsxWriter
print(os.getcwd())
print(os.listdir())

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchbearer
  Downloading torchbearer-0.5.3-py3-none-any.whl (138 kB)
[K     |████████████████████████████████| 138 kB 12.3 MB/s 
Installing collected packages: torchbearer
Successfully installed torchbearer-0.5.3
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting XlsxWriter
  Downloading XlsxWriter-3.0.3-py3-none-any.whl (149 kB)
[K     |████████████████████████████████| 149 kB 29.7 MB/s 
[?25hInstalling collected packages: XlsxWriter
Successfully installed XlsxWriter-3.0.3
/content/drive/MyDrive/facebook_mkt
['torch_fb_run.ipynb', 'data_files', '__MACOSX', 'runs', 'images']


In [None]:
# Load (or reload) the TensorBoard notebook extension, then open the dashboard
# on the run logs stored in the project's 'runs' folder on Drive.
%reload_ext tensorboard
%tensorboard --logdir='/content/drive/MyDrive/facebook_mkt/runs'


import pandas as pd
import os
import numpy as np
from matplotlib.gridspec import GridSpec
from itertools import product
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader, random_split
import json 
import torchvision.transforms as transforms
import re
from PIL import Image
import multiprocessing
import torchvision
from skimage import io
from skimage import img_as_float
from skimage.filters import sobel
from skimage.color import rgb2gray
from sklearn.preprocessing import LabelEncoder
from torchbearer import Trial
from torch.utils.tensorboard import SummaryWriter
from torch.optim import lr_scheduler
from torchvision.transforms import Normalize, ToPILImage, ToTensor
from torchbearer.callbacks import TensorBoard
from torch.nn import Module
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from torch import nn
import torch.optim as optim
from pathlib import Path
from torchvision import models, datasets
import copy
import time
from tensorboard import notebook

# Switch matplotlib to interactive mode.
plt.ion()

class CleanData:
    '''
    Loads the JSON tables found in ./data_files (relative to the current working directory)
    into pandas dataframes and applies basic cleaning to them.

    Attributes:
    tab_names: Table names the instance was created with
    table_dict: Maps table name -> cleaned dataframe
    major_map_decoder: Maps integer code -> major category name (fixed list, names keep their trailing space)
    major_map_encoder: Inverse of major_map_decoder
    '''
    def __init__(self, tab_names = ['Products']) -> None:
        '''
        Parameters:
        tab_names: List of table names; each is read from data_files/<name>.json
        '''
        self.tab_names = tab_names
        maj_unique_cats = ['Home & Garden ', 'Baby & Kids Stuff ', 'DIY Tools & Materials ', 'Music, Films, Books & Games ', 'Phones, Mobile Phones & Telecoms ', 'Clothes, Footwear & Accessories ', 'Other Goods ', 'Health & Beauty ', 'Sports, Leisure & Travel ', 'Appliances ', 'Computers & Software ','Office Furniture & Equipment ', 'Video Games & Consoles ']
        self.major_map_decoder = dict(enumerate(maj_unique_cats))
        self.major_map_encoder = {val: key for key, val in self.major_map_decoder.items()}
        data_dir = Path(Path.cwd(), 'data_files')
        data_dir.mkdir(exist_ok=True)
        self.table_dict = {}
        for table in tab_names:
            frame = pd.read_json(Path(data_dir, table+'.json')).dropna()
            if 'price' in frame.columns:
                # 'N/A' prices become NaN; the remaining strings lose their thousands
                # separator and pound sign before the conversion to float.
                price = frame['price'].where(frame['price'] != 'N/A')
                price = price.str.replace(',', '').str.strip('£').str.strip(' ').astype(np.float32)
                frame = frame.assign(price=price)
                # Drop rows whose price rounds to zero. The copy makes the result an
                # independent frame so later column assignments do not act on a slice.
                frame = frame[np.round(frame['price']) != 0].copy()
            self.table_dict[table] = frame
            if 'category' in frame.columns:
                self.expand_category(df=table)

    
    def try_merge(self, df_list):
        '''
        Combines dataframes passed in into a single dataframe

        Parameters:
        df_list: Must contain dataframes within self.table_dict passed in as a list

        Returns:
        The combined dataframe (also stored under self.table_dict['combined']), or None when
        the class was instantiated with tab_names as a string
        '''
        if isinstance(self.tab_names, str):
            print('Method not valid when class instantiated with tab_names as type string')
            return None
        # A single concat call replaces growing a dataframe one table at a time.
        self.new_df = pd.concat([self.table_dict[i] for i in df_list], axis=0).dropna()
        self.table_dict['combined'] = self.new_df
        return self.table_dict['combined']
    
    def get_na_vals(self, df):
        '''
        Returns the rows of table df that contain at least one NA value
        '''
        print(f'The following NA values exist if dataframe {df}')
        return self.table_dict[df][self.table_dict[df].isna().any(axis=1)]

    def __repr__(self) -> str:
        '''
        Prints a short header and returns the table names together with their columns
        '''
        if isinstance(self.tab_names, str):
            # NOTE(review): self.df is not set anywhere in CleanData itself, only in the
            # subclass CleanImages - confirm whether this branch is still needed.
            print(self.df.columns)
            print('\nTable Name: ', self.tab_names, 'With columns:')
            return ' | '.join(self.df.columns)
        else:
            print('\n')
            print('Total of ', f'{len(self.table_dict)} tables')
            return '\n'.join([f'Table Name: {i}: \n' f'Columns | {" | ".join(j.columns)} \n' for i, j in self.table_dict.items()])

    def to_excel(self):
        '''
        Writes every table in self.table_dict to data_files/<table name>.xlsx
        '''
        for i, j in self.table_dict.items():
            with pd.ExcelWriter(f'data_files/{i}.xlsx', engine='xlsxwriter') as writer:
                j.to_excel(writer, sheet_name=i)
    
    def cat_set(self, df = 'Products',cat_col = 'major_category'):
        '''
        Returns the number of distinct values in column cat_col of table df
        '''
        return self.table_dict[df][cat_col].nunique()
    
    def expand_category(self, df = 'Products'):
        '''
        Splits the 'category' column of table df on '/' into 'major_category' (first part) and
        'minor_category' (second part), drops rows whose major category is 'N' and adds
        integer encoded versions of both columns

        Parameters:
        df: Name of the table within self.table_dict

        Returns:
        The updated dataframe (also stored back into self.table_dict[df])
        '''
        # major_encoder is kept for backward compatibility but is never fitted; the major
        # codes come from the fixed major_map_encoder instead.
        self.major_encoder = LabelEncoder()
        self.minor_encoder = LabelEncoder()
        table = self.table_dict[df].copy()
        parts = table['category'].str.split('/')
        table['major_category'] = parts.apply(lambda i: i[0])
        table['minor_category'] = parts.apply(lambda i: i[1])
        # Copy after filtering so the encoded columns are added to an independent frame
        # rather than to a slice (avoids SettingWithCopyWarning).
        table = table[table['major_category'] != 'N'].copy()
        table['major_category_encoded'] = table['major_category'].map(self.major_map_encoder)
        table['minor_category_encoded'] = self.minor_encoder.fit_transform(table['minor_category'])
        self.table_dict[df] = table
        return self.table_dict[df]
    
    def inverse_transform(self, input_array, major_minor = 'minor'):
        '''
        Converts encoded category values back into category names

        Parameters:
        input_array: Encoded values as a list, numpy array or tensor
        major_minor: 'minor' decodes with the fitted minor_encoder, 'major' decodes with major_map_decoder

        Returns:
        Array of category names
        '''
        if major_minor not in ('major', 'minor'):
            raise KeyError(major_minor)
        if major_minor == 'major':
            # The major codes are produced by major_map_encoder (the LabelEncoder for major
            # categories is never fitted), so they are decoded with the matching map.
            codes = input_array.numpy() if hasattr(input_array, 'numpy') else input_array
            return np.array([self.major_map_decoder[int(code)] for code in np.ravel(codes)])
        try:
            return self.minor_encoder.inverse_transform(input_array)
        except TypeError:
            return self.minor_encoder.inverse_transform(input_array.numpy())
    
    
    def sum_by_cat(self, df= 'Products', quant = 0.95):
        '''
        Prints price statistics grouped by major category and shows, for every major category,
        a boxplot of the prices lying below the quant quantile of that category

        Parameters:
        df: Name of the table within self.table_dict
        quant: Quantile above which prices are left out of the boxplot
        '''
        data = self.expand_category(df)
        major = data.groupby('major_category')['price'].describe()
        print('Price Statistics Grouped by Major Category')
        print(major)
        products_df = data.loc[:, ['major_category', 'minor_category', 'price']]
        for i in major.index.tolist():
            prod_plot = products_df.loc[products_df['major_category'] == i]
            sns.boxplot(data=prod_plot[prod_plot['price']<prod_plot['price'].quantile(quant)], x = 'major_category', y = 'price')
            plt.show()

    def trim_data(self, df= 'Products', quant = 0.95):
        '''
        Removes the high price tail of table df: only rows with a price below the quant quantile
        are kept, the same cut-off rule sum_by_cat uses for its boxplots

        Parameters:
        df: Name of the table within self.table_dict
        quant: Quantile used as cut-off

        Returns:
        The trimmed dataframe (also stored back into self.table_dict[df])
        '''
        table = self.table_dict[df]
        # quantile() must be called with a scalar: passing a list returns a Series, and
        # comparing two differently labelled Series raises a ValueError.
        cutoff = table['price'].quantile(quant)
        self.table_dict[df] = table[table['price'] < cutoff]
        return self.table_dict[df]

    @classmethod
    def allTables(cls):
        '''
        Alternative constructor that loads every .json file found in ./data_files
        '''
        data_dir = Path(Path.cwd(), 'data_files')
        # The suffix check replaces a regex whose unescaped '.' also matched names such as 'xjson'.
        json_list = sorted(entry.stem for entry in data_dir.iterdir() if entry.suffix == '.json')
        print(json_list)
        return cls(tab_names = json_list)

#############################################################################################

class CleanImages(CleanData):
    '''
    Cleans the image files in ./images (relative to the current working directory) and joins
    them with the image table loaded by CleanData

    Attributes:
    df: Copy of the first table named in tab_names
    image_frame: Dataframe with one row per image file, set by img_clean_sk
    final_df: image_frame merged with df, set by merge_images
    '''
    def __init__(self, tab_names=['Images']) -> None:
        super().__init__(tab_names)
        self.df = self.table_dict[tab_names[0]].copy()
        self.csv_df = None

    def img_clean_pil(self, size = 512, mode = 'RGB'):
        '''
        Rescales every .jpg in ./images so that its longest side equals size, centres it on a
        black size x size canvas, converts it to mode and overwrites the original file.
        Files that cannot be processed are printed, deleted and listed in images/invalid_file.json

        Parameters:
        size: Side length in pixels of the square output image
        mode: PIL mode the saved image is converted to
        '''
        image_re = re.compile(r'(.*)\.jpg')
        # Work with explicit paths instead of changing the working directory, consistent
        # with the cwd-relative 'images' folder used by img_clean_sk.
        img_dir = Path(Path.cwd(), 'images')
        t = 0
        invalid_files = []
        for i in os.listdir(img_dir):
            if re.findall(image_re, i) == []:
                continue
            img_path = Path(img_dir, i)
            try:
                # The context manager closes the source file before it is overwritten.
                with Image.open(img_path) as temp_image:
                    scale_fact = size / max(temp_image.size)
                    # max(1, ...) keeps very elongated images from collapsing to a 0 pixel side.
                    resized_image_dim = tuple(max(1, int(scale_fact*side)) for side in temp_image.size)
                    black_back = Image.new(size=(size, size), mode=temp_image.mode)
                    updated_image = temp_image.resize(resized_image_dim)
                    black_back.paste(updated_image, ((size- resized_image_dim[0])//2, (size- resized_image_dim[1])//2))
                black_back = black_back.convert(mode)
                black_back.save(img_path)
                t += 1
            except Exception:
                # Best effort: unreadable images are reported and removed so that later
                # steps only see files that can be loaded.
                print(i)
                invalid_files.append(i)
                os.remove(img_path)
        if invalid_files:
            # Written once with all names; writing inside the loop kept only the last one.
            with open(Path(img_dir, 'invalid_file.json'), 'w') as wrong_form:
                json.dump(invalid_files, wrong_form)
        print(t)

    def img_clean_sk(self, normalize = False):
        '''
        Reads every .jpg in ./images with skimage and collects them in self.image_frame

        Parameters:
        normalize: If True each image array is converted with img_as_float

        Returns:
        Dataframe with columns image_id (file name without .jpg), image (file name),
        image_array, image_shape and mode (PIL mode of the file)
        '''
        image_re = re.compile(r'(.*)\.jpg')
        img = []
        img_dim_list = []
        img_id = []
        image_array = []
        img_mode = []
        # Explicit paths instead of os.chdir: a failed read can no longer leave the
        # notebook stranded inside the images folder.
        img_dir = Path(Path.cwd(), 'images')
        for im in os.listdir(img_dir):
            name_match = re.search(image_re, im)
            if name_match is None:
                continue
            img_path = str(Path(img_dir, im))
            image = io.imread(img_path)
            if normalize == True:
                image = img_as_float(image)
            img.append(im)
            img_id.append(name_match.group(1))
            image_array.append(image)
            img_dim_list.append(image.shape)
            with Image.open(img_path) as pil_image:
                img_mode.append(pil_image.mode)
        self.image_frame = pd.DataFrame(data={'image_id': img_id, 'image': img,'image_array': image_array,'image_shape': img_dim_list, 'mode': img_mode})
        return self.image_frame
    
    def to_excel(self, df):
        '''
        Writes dataframe df to data_files/Cleaned_Images.xlsx (sheet 'images')
        '''
        df.to_excel(Path(Path.cwd(), 'data_files','Cleaned_Images.xlsx'), sheet_name = 'images')

    def merge_images(self):
        '''
        Inner joins self.image_frame with the image table on image_id. In the image table the
        column 'id' is renamed to 'image_id' and 'product_id' to 'id' beforehand

        Returns:
        The merged dataframe (also stored as self.final_df)
        '''
        # Rename only once: repeating it would turn the product 'id' column into a second
        # 'image_id' column and break the merge.
        if 'image_id' not in self.df.columns:
            self.df = self.df.rename({'id': 'image_id', 'product_id': 'id'}, axis=1)
        self.final_df = self.image_frame.merge(self.df, on='image_id', how='inner', validate='one_to_many')
        return self.final_df
    
    def edge_detect(self):
        '''
        Adds the column 'edge_array' to self.image_frame holding the sobel edge map of every image
        '''
        def _edges(image):
            # Three channel images are converted to grayscale first; every other layout is
            # passed to sobel unchanged. Decided per image so that one image with a
            # different layout does not change how the others are treated.
            if image.ndim == 3 and image.shape[-1] == 3:
                return sobel(rgb2gray(image))
            return sobel(image)

        self.image_frame['edge_array'] = self.image_frame['image_array'].copy().apply(_edges)
        return self.image_frame


    def total_clean(self, normalize=False, mode = 'RGB', size = 224):
        '''
        Runs img_clean_pil, img_clean_sk, edge_detect and merge_images in that order

        Returns:
        The merged dataframe self.final_df
        '''
        self.img_clean_pil(mode=mode, size=size)
        self.img_clean_sk(normalize=normalize)
        self.edge_detect()
        self.merge_images()
        return self.final_df
    
    def show_random_images(self, col, size, fig_height= 15, fig_width=10):
        '''
        Shows a size x size grid of images drawn at random (with replacement) from column col
        of self.final_df

        Parameters:
        col: Column holding the arrays to display
        size: Number of rows and of columns in the grid
        fig_height, fig_width: Passed to figsize in this order
        '''
        grid = GridSpec(nrows = size, ncols = size)
        # NOTE(review): matplotlib reads figsize as (width, height), so fig_height ends up
        # as the width. Order kept unchanged for backward compatibility - confirm intent.
        fig = plt.figure(figsize=(fig_height, fig_width))
        for i, j in product(range(size), range(size)):
            # The upper bound of randint is exclusive, so len(...) makes every row eligible
            # and also works for a dataframe with a single row.
            fig.add_subplot(grid[i, j]).imshow(self.final_df[col].iloc[np.random.randint(low=0, high=len(self.final_df))])
        plt.show()

    def describe_data(self, df):
        '''
        Prints column information and the distribution of image shapes of dataframe df
        '''
        print('\n')
        print('Data frame columnn information')
        # info() prints by itself and returns None, so it is not wrapped in print().
        df.info()
        print('\n')
        print('#'*20)
        print('Dataframe statistical metrics')
        print('#'*20)
        print('Array and shape')
        print(df['image_shape'].unique())
        print(df['image_shape'].value_counts())

#############################################################################################

class MergedData:
    '''
    Builds one dataframe that joins the cleaned image data with the cleaned product data

    Attributes:
    major_map_encoder, major_map_decoder: Category maps taken from the product CleanData instance
    prod_frame: Copy of the cleaned 'Products' table
    img_df: Result of CleanImages.total_clean()
    merged_frame: img_df joined with prod_frame on the column 'id'
    '''
    def __init__(self):
        image_cleaner = CleanImages()
        product_cleaner = CleanData(tab_names=['Products'])
        self.major_map_encoder = product_cleaner.major_map_encoder
        self.major_map_decoder = product_cleaner.major_map_decoder
        self.prod_frame = product_cleaner.table_dict['Products'].copy()
        self.img_df = image_cleaner.total_clean()
        self.merged_frame = self.img_df.merge(self.prod_frame, on='id')
    
    def to_pickle(self):
        '''
        Saves merged_frame as merged_data.pkl in the current working directory
        '''
        pickle_path = Path.cwd() / 'merged_data.pkl'
        self.merged_frame.to_pickle(pickle_path)
    
    def get_val_counts(self):
        '''
        Returns the three dataframes held by the instance in a dictionary with the keys
        'products', 'images' and 'all'
        '''
        return dict(products=self.prod_frame, images=self.img_df, all=self.merged_frame)
      
#############################################################################################

class Dataset(torch.utils.data.Dataset):
    '''
    Torch dataset over the merged image / product data built by MergedData

    Attributes:
    all_data: Dataframe with the rows remaining after the optional train / test cut
    dataset_size: Number of rows in all_data
    X: Array of inputs (image file names or image arrays, depending on the X argument)
    y: Tensor of encoded labels
    '''
    def __init__(self, transformer = transforms.Compose([ToTensor()]), X = 'image_array', y = 'major_category_encoded', img_dir = Path(Path.cwd(), 'images'), img_size=224, train_proportion = 0.8, is_test = False):
        '''
        transformer: Callable applied to every input before it is returned, or None
        X: Can be either 'image' if dataset to be instantiated using image object or 'image_array' if dataset to be instantiated using numpy array 
        y: Can be either 'major_category_encoded' or 'minor_category_encoded'
        img_dir: Folder the file names are resolved against when X is 'image'
        img_size: Stored on the instance; not used by the dataset itself
        train_proportion: Share of rows (taken from the top of the dataframe) that forms the training part
        is_test: False keeps the training part, True keeps the remaining rows, any other value (e.g. None) keeps all rows
        '''
        self.img_inp_type = X
        self.transformer = transformer
        self.img_dir = img_dir
        self.img_size = img_size
        merge_class = MergedData()
        merged_df = merge_class.merged_frame
        # Name of the un-encoded label column, e.g. 'major_category' for 'major_category_encoded'.
        label_name_col = re.sub(re.compile('_encoded$'), '', y)
        filtered_df = merged_df.loc[:, ['image_id', X, label_name_col, y]].copy()
        filtered_df.dropna(inplace=True)
        print(filtered_df[y].value_counts())
        print(filtered_df[label_name_col].value_counts())
        train_end = int(len(filtered_df)*train_proportion)
        if is_test == False:
            filtered_df = filtered_df.iloc[:train_end]
        elif is_test == True:
            filtered_df = filtered_df.iloc[train_end:]
        self.dataset_size = len(filtered_df)
        self.all_data = filtered_df
        print('Total observations in remaining dataset: ', len(filtered_df))
        self.y = torch.tensor(filtered_df[y].values)
        self.X = filtered_df[X].values

    def _apply_transformer(self, sample):
        '''Applies self.transformer to sample, converting numpy arrays to PIL images if the transformer needs that'''
        if self.transformer is None:
            return sample
        try:
            return self.transformer(sample)
        except TypeError:
            # Some torchvision transforms (e.g. RandomRotation) accept only PIL images or
            # tensors. Retry once with a PIL version of the array instead of silently
            # handing back the untransformed input.
            if isinstance(sample, np.ndarray):
                return self.transformer(ToPILImage()(sample))
            raise

    def __getitem__(self, idx): 
        '''
        Returns the pair (input, label) for position idx. The stored data is left untouched,
        so random transforms are drawn anew on every access
        '''
        sample = self.X[idx]
        if self.img_inp_type == 'image':
            # copy() loads the pixel data so the file can be closed straight away.
            with Image.open(os.path.join(self.img_dir, sample)) as opened_image:
                sample = opened_image.copy()
            sample = self._apply_transformer(sample)
        elif self.img_inp_type == 'image_array':
            sample = self._apply_transformer(sample)
        return sample, self.y[idx]

    def __len__(self):
        return len(self.y)
#############################################################################################

# Display settings for pandas output and matplotlib axes titles.
for _option, _setting in (('display.max_colwidth', 400), ('display.max_columns', 15), ('display.max_rows', 40)):
    pd.set_option(_option, _setting)
plt.rc('axes', titlesize=12)

# ResNet-50 with pretrained weights; all existing parameters are frozen and the final
# fully connected layer is replaced by a new head with 13 outputs.
res_model = models.resnet50(pretrained=True)
for param in res_model.parameters():
    param.requires_grad = False
res_model.fc = nn.Sequential(
    nn.Linear(in_features=2048, out_features=512, bias=True),
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.2),
    nn.Linear(in_features=512, out_features=32),
    nn.Linear(in_features=32, out_features=13),
)
# Use the first GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
res_model.to(device)

# Adam optimiser; the learning rate is multiplied by 0.1 at each listed epoch milestone.
opt = optim.Adam
optimizer = opt(res_model.parameters(), lr=0.2)
scheduler = lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=list(range(5, 31, 5)), gamma=0.1)
criterion = nn.CrossEntropyLoss()


def get_loader(img = 'image_array',batch_size=35, split_in_dataset = False, train_prop = 0.8):
    '''
    Builds the training and evaluation data loaders

    Parameters:
    img: Passed to Dataset as X ('image' or 'image_array')
    batch_size: Batch size of both loaders
    split_in_dataset: If True two Dataset objects are created (ordered split, the training one with
                      random rotation / flip); otherwise a single Dataset is split with random_split
    train_prop: Share of observations used for training

    Returns:
    Number of training observations, number of evaluation observations and a dictionary
    with the loaders under the keys 'train' and 'eval'
    '''
    imagenet_norm = transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    train_transformer = transforms.Compose([transforms.RandomRotation(40), transforms.RandomHorizontalFlip(p=0.5), transforms.ToTensor(), imagenet_norm])
    test_transformer = transforms.Compose([transforms.ToTensor(), imagenet_norm])

    if split_in_dataset != True:
        # One dataset over all rows, divided at random into the two parts.
        full_dataset = Dataset(transformer=test_transformer, X = img, img_size=224, is_test=None)
        train_end = int(train_prop*full_dataset.dataset_size)
        n_train = len(full_dataset.all_data.iloc[:train_end])
        n_eval = len(full_dataset.all_data.iloc[train_end:])
        train_part, eval_part = random_split(full_dataset, lengths=[n_train, n_eval])
        loaders = {
            'train': DataLoader(train_part, batch_size=batch_size, shuffle=True),
            'eval': DataLoader(eval_part, batch_size=batch_size, shuffle=True),
        }
        return n_train, n_eval, loaders

    # Separate datasets: the split is made inside Dataset via is_test.
    train_dataset = Dataset(transformer=train_transformer, X=img, img_size=224, is_test=False, train_proportion=train_prop)
    test_dataset = Dataset(transformer=test_transformer, X=img, img_size=224, is_test=True, train_proportion=train_prop)
    loaders = {
        'train': DataLoader(train_dataset, shuffle=True, batch_size=batch_size),
        'eval': DataLoader(test_dataset, shuffle=True, batch_size=batch_size),
    }
    return train_dataset.dataset_size, test_dataset.dataset_size, loaders
    
# Category lookups taken from a CleanData instance: the name -> code map, its keys and
# values views, and the names as a list.
prod_dum = CleanData()
class_encoder = prod_dum.major_map_encoder
class_dict = class_encoder.keys()
class_values = class_encoder.values()
classes = list(class_dict)


def show_image(input_ten_orig):
    '''
    Tensorboard Function for Showing Images

    Reverses the Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) step on a copy of a
    single image tensor and draws it on the current matplotlib axes

    Parameters:
    input_ten_orig: Normalised image tensor with the channel dimension first; left unmodified
    '''
    mean = [0.485, 0.456, 0.406]
    # The third value was previously typed as 0.255, which did not match the 0.225 used
    # when normalising and skewed the restored third channel.
    std = [0.229, 0.224, 0.225]
    # Normalize computes (x - mean) / std, so these arguments give x * std + mean.
    inv_normalize = transforms.Normalize(mean=[-m/s for m, s in zip(mean, std)], std=[1/s for s in std])
    # detach().cpu() allows tensors that live on the GPU or carry gradients.
    input_ten = inv_normalize(torch.clone(input_ten_orig).detach().cpu())
    input_numpy = np.clip(input_ten.numpy(), 0, 1)
    plt.imshow(np.transpose(input_numpy, (1, 2, 0)))

def images_to_proba(input_arr, model = res_model):
    '''
    Function for comparing actual images to predicted images in Tensorboard
    (helper used by plot_classes_preds)

    Parameters:
    input_arr: Batch of image tensors; a clone of it is passed to the model
    model: Model used for the prediction

    Returns:
    A numpy array with the predicted class index of every image and a list with the softmax
    probability of that predicted class
    '''
    input_tensor = torch.clone(input_arr)
    # Inference only, so no gradient graph is needed.
    with torch.no_grad():
        output = model(input_tensor)
    _, predicted_tensor = torch.max(output, 1)
    probabilities = F.softmax(output, dim=1)
    chosen_proba = probabilities.gather(1, predicted_tensor.unsqueeze(1)).squeeze(1)
    # The batch dimension is kept (no np.squeeze) so a batch holding a single image still
    # yields one-element results.
    preds = predicted_tensor.cpu().numpy()
    return preds, chosen_proba.cpu().tolist()

def plot_classes_preds(input_arr, lab, model = res_model):
    '''
    Creates a figure showing up to four images of a batch, each titled with the predicted
    class, its probability and the true label (green when they agree, red otherwise)

    Parameters:
    input_arr: Batch of normalised image tensors
    lab: Tensor with the true class index of every image
    model: Model used for the prediction

    Returns:
    The matplotlib figure
    '''
    preds, proba = images_to_proba(input_arr, model)
    print(preds)
    print(proba)
    # Guarantees preds can be indexed even if it arrives as a 0-d array.
    preds = np.atleast_1d(preds)
    fig = plt.figure(figsize=(12, 12))
    # Batches with fewer than four images (e.g. the last batch) are plotted as far as they go.
    for i in range(min(4, len(input_arr))):
        ax = fig.add_subplot(1, 4, i+1, xticks=[], yticks=[])
        # Plotting needs a CPU tensor without gradient history.
        show_image(input_arr[i].detach().cpu())
        true_label = int(lab[i])
        ax.set_title('{0}, {1:.1f}%\n(label: {2})'.format(classes[preds[i]], proba[i]*100, classes[true_label]), color=('green' if preds[i]==true_label else 'red'))
    plt.tight_layout()
    return fig


def train_model(model=res_model, optimizer=optimizer, loss_type = criterion, num_epochs = 50, mode_scheduler = scheduler, batch_size = 32, image_type='image_array', split_in_datset=False):
    '''
    Model training and testing function

    Runs a training and an evaluation phase in every epoch, logs loss and accuracy to
    TensorBoard, and keeps the weights of the epoch with the best evaluation accuracy.
    These weights are loaded into the model and saved to 'image_model.pt' at the end

    Parameters:
    model: Model to train
    optimizer: Optimiser updating the model parameters
    loss_type: Loss function called as loss_type(outputs, labels)
    num_epochs: Number of epochs
    mode_scheduler: Learning rate scheduler stepped once after every training phase, or None
    batch_size: Batch size handed to get_loader
    image_type: Passed to get_loader as img
    split_in_datset: Passed to get_loader as split_in_dataset

    Returns:
    The model with the best weights loaded
    '''
    best_model_weights = copy.deepcopy(model.state_dict()) #May be changed at end of each "for phase block"
    best_accuracy = 0.0 # May be changed at end of each "for phase block"
    start = time.time()
    writer = SummaryWriter()
    train_size, test_size, data_loader_dict = get_loader(batch_size=batch_size, img=image_type, split_in_dataset=split_in_datset)
    dataset_size = {'train': train_size, 'eval': test_size}

    try:
        for epoch in range(num_epochs):
            print('\n')
            print('#'*20)
            print('Epoch Number: ', epoch)
            for phase in ['train', 'eval']:
                # train(True) equals train(), train(False) equals eval().
                model.train(phase == 'train')
                phase_loader = data_loader_dict[phase]
                batches_per_epoch = len(phase_loader)

                running_loss = 0.0
                running_corrects = 0

                for batch_num, (inputs, labels) in enumerate(phase_loader, start=1):
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    optimizer.zero_grad() # Gradients reset to zero at beginning of both training and evaluation phase

                    with torch.set_grad_enabled(phase == 'train'):
                        outputs = model(inputs)
                        preds = torch.argmax(outputs, dim=1)
                        loss = loss_type(outputs, labels)
                        if phase == 'train':
                            loss.backward() #Calculates gradients
                            optimizer.step()

                    # Plain Python numbers keep the running totals off the GPU.
                    batch_corrects = preds.eq(labels).sum().item()
                    batch_loss = loss.item()

                    if batch_num%100==0:
                        # The step counts batches across epochs; reusing batch_num alone made
                        # every epoch write to the same x positions in TensorBoard.
                        global_step = epoch*batches_per_epoch + batch_num
                        # Divide by the real number of samples in this batch, which can be
                        # smaller than batch_size for the last batch.
                        writer.add_scalar(f'Accuracy for phase {phase} by batch number', batch_corrects/inputs.size(0), global_step)
                        writer.add_scalar(f'Average loss for phase {phase} by batch number', batch_loss, global_step)

                    running_corrects = running_corrects + batch_corrects
                    running_loss = running_loss + (batch_loss*inputs.size(0))

                if (phase=='train') and (mode_scheduler is not None):
                    mode_scheduler.step()

                # Epoch level metrics
                epoch_loss = running_loss / dataset_size[phase]
                print(f'Size of dataset for phase {phase}', dataset_size[phase])
                epoch_acc = running_corrects / dataset_size[phase]
                writer.add_scalar(f'Accuracy by epoch phase {phase}', epoch_acc, epoch)
                print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
                writer.add_scalar(f'Average loss by epoch phase {phase}', epoch_loss, epoch)
                writer.flush()

                if phase == 'eval' and epoch_acc > best_accuracy:
                    best_accuracy = epoch_acc
                    best_model_weights = copy.deepcopy(model.state_dict())
                    print(f'Best val Acc: {best_accuracy:.4f}')
    finally:
        # Release the event file even if training is interrupted.
        writer.close()

    model.load_state_dict(best_model_weights)
    torch.save(model.state_dict(), 'image_model.pt')
    time_diff = time.time()-start
    print(f'Time taken for model to run: {(time_diff//60)} minutes and {(time_diff%60):.0f} seconds')
    return model

# Train with the default arguments of train_model; the returned model has the best
# evaluation weights loaded.
model_tr = train_model()
# Reload the TensorBoard notebook extension.
%reload_ext tensorboard


<IPython.core.display.Javascript object>

12668


In [6]:

# Reload the TensorBoard notebook extension and show the dashboard for the logs in the
# project's 'runs' folder.
%reload_ext tensorboard
%tensorboard --logdir='/content/drive/MyDrive/facebook_mkt/runs'


Reusing TensorBoard on port 6006 (pid 473), started 0:30:31 ago. (Use '!kill 473' to kill it.)

<IPython.core.display.Javascript object>