# Cross-Validation Pipeline

In [1]:
import os
import numpy as np
import shutil
import codecs

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder

## Merge the new dataset with the current one

In [2]:
# Dataset that is walked as the source when merging.
cur_dataset_dir = 'many_meme_characters/'
# Dataset that receives the merged files and is later pruned and split.
new_dataset_dir = 'more_meme_characters2/'

In [None]:
def _count_lines(path):
    """Return the number of lines in ``path``, or 0 when the file is missing."""
    if not os.path.exists(path):
        return 0
    # The context manager closes the handle once the lines are counted.
    with codecs.open(path) as handle:
        return sum(1 for _ in handle)


# Merge the current dataset into the new one. Only leaf directories
# (directories without sub-directories) are treated as meme folders.
for dirpath, dirnames, filenames in os.walk(cur_dataset_dir):
    if dirnames:
        continue
    meme = os.path.basename(dirpath)
    new_meme_dir = os.path.join(new_dataset_dir, meme)
    print('working with', meme, '...')
    # A meme is "incomplete" when its folder is missing from the new dataset
    # or holds fewer than three entries.
    cond = not os.path.exists(new_meme_dir) or\
           len(os.listdir(new_meme_dir)) < 3
    if cond:
        print(meme, 'is incomplete in', new_meme_dir)
        # shutil.copy does not create missing parent directories, so the
        # destination folder has to exist before the first copy.
        os.makedirs(new_meme_dir, exist_ok=True)
        for f in filenames:
            shutil.copy(os.path.join(dirpath, f),
                        os.path.join(new_meme_dir, f))
    else:
        print(meme, 'is already found in', new_meme_dir)
        for f in filenames:
            # Only caption csv files are compared; *_metadata csv files are skipped.
            if f.endswith('csv') and '_metadata' not in f:
                num_lines_cur = _count_lines(os.path.join(dirpath, f))
                # A csv that is absent from the new dataset counts as 0 lines,
                # so it is copied over instead of raising.
                num_lines_new = _count_lines(os.path.join(new_meme_dir, f))
                if num_lines_cur > num_lines_new:
                    print('adjusting', meme, 'caption number...')
                    shutil.copy(os.path.join(dirpath, f),
                                os.path.join(new_meme_dir, f))

In [3]:
# Remove leaf directories of the new dataset that contain no files at all,
# or exactly one file that is not a csv. Each removed folder is printed first.
for dirpath, dirnames, filenames in os.walk(new_dataset_dir):
    if dirnames:
        continue
    cond = len(filenames) == 0 or (
        len(filenames) == 1 and not filenames[0].endswith('csv')
    )
    if not cond:
        continue
    print(filenames, dirpath)
    shutil.rmtree(dirpath)

['gildarts.jpg'] more_meme_characters2/gildarts
['matt-(mail-jeevas).jpg'] more_meme_characters2/matt-(mail-jeevas)
['sloth-goonies.jpg'] more_meme_characters2/sloth-goonies
['scumbag-(lamar)-smith1.jpg'] more_meme_characters2/scumbag-(lamar)-smith1
['canonical-yesenin.jpg'] more_meme_characters2/canonical-yesenin
['valentin-strikalo.jpg'] more_meme_characters2/valentin-strikalo
['nfbgus.jpg'] more_meme_characters2/nfbgus
['krutaya-devchyoncka.jpg'] more_meme_characters2/krutaya-devchyoncka
['usachino.jpg'] more_meme_characters2/usachino
['compton12931231.jpg'] more_meme_characters2/compton12931231
['dr.-steve-brule.jpg'] more_meme_characters2/dr.-steve-brule
['sexy_taxy.jpg'] more_meme_characters2/sexy_taxy
['good-guy-benedict.jpg'] more_meme_characters2/good-guy-benedict
['typical-edross.jpg'] more_meme_characters2/typical-edross
['officer_head.jpg'] more_meme_characters2/officer_head
['ct-meme.jpg'] more_meme_characters2/ct-meme
['tupu4niu_beach.jpg'] more_meme_characters2/tupu4niu_

['stoned_man.jpg'] more_meme_characters2/stoned_man
['rileyy_69.jpg'] more_meme_characters2/rileyy_69
['if-it-flies,-it-dies.jpg'] more_meme_characters2/if-it-flies,-it-dies
['nadym2.jpg'] more_meme_characters2/nadym2
['zadrotcimputer.jpg'] more_meme_characters2/zadrotcimputer
['typical-olympiad.jpg'] more_meme_characters2/typical-olympiad
['derzki-alex.jpg'] more_meme_characters2/derzki-alex
['rick-scott-dreams....jpg'] more_meme_characters2/rick-scott-dreams...
['typical-bauman-university.jpg'] more_meme_characters2/typical-bauman-university
['rogue-doper.jpg'] more_meme_characters2/rogue-doper
['mrs.-s.jpg'] more_meme_characters2/mrs.-s
['jimmy-(pulp-fiction).jpg'] more_meme_characters2/jimmy-(pulp-fiction)
['brash-punk.jpg'] more_meme_characters2/brash-punk
['typical_top.jpg'] more_meme_characters2/typical_top
['gayhaas.jpg'] more_meme_characters2/gayhaas
['n-peredelkino.jpg'] more_meme_characters2/n-peredelkino
['mopsina.jpg'] more_meme_characters2/mopsina
['arshavin666.jpg'] more

["i'll-be-bach.jpg"] more_meme_characters2/i'll-be-bach
['sinweloveyou.jpg'] more_meme_characters2/sinweloveyou
['volodyaface.jpg'] more_meme_characters2/volodyaface
['d.gomez-jack0.jpg'] more_meme_characters2/d.gomez-jack0
['pioner.jpg'] more_meme_characters2/pioner
['fizteh.type.jpg'] more_meme_characters2/fizteh.type
['typical_zlatoust.jpg'] more_meme_characters2/typical_zlatoust
['dr.-evil-quote.jpg'] more_meme_characters2/dr.-evil-quote
['drew-pickles:-the-gayest-man-in-the-world.jpg'] more_meme_characters2/drew-pickles:-the-gayest-man-in-the-world
['typical-school-93.jpg'] more_meme_characters2/typical-school-93
['jj_sez01.jpg'] more_meme_characters2/jj_sez01
[] more_meme_characters2/typical-fans-"the-x-files"
['this-is-stas.jpg'] more_meme_characters2/this-is-stas
['lesteh.jpg'] more_meme_characters2/lesteh
['tipichnuy-mu-mitso.jpg'] more_meme_characters2/tipichnuy-mu-mitso
['kony-butt-covered.jpg'] more_meme_characters2/kony-butt-covered
['gopniktipichniy.jpg'] more_meme_charac

## Train-Test splitter

In [4]:
# Output layout: <target_dir>/train and <target_dir>/test.
target_dir = 'meme_gallery2'
train_dir = os.path.join(target_dir, 'train')
test_dir = os.path.join(target_dir, 'test')
# makedirs creates target_dir along the way; exist_ok=True keeps the cell
# re-runnable without a separate check-then-create step for each folder.
for split_dir in (train_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)

In [5]:
# One entry per meme folder in the merged dataset.
# - os.listdir returns names in arbitrary order, so they are sorted to give
#   the split a stable input.
# - Non-directory entries are dropped because shutil.copytree is later called
#   on every name and only accepts directories.
X = np.array(sorted(
    entry for entry in os.listdir(new_dataset_dir)
    if os.path.isdir(os.path.join(new_dataset_dir, entry))
))
X.shape

(3416,)

In [6]:
# 70/30 split of the meme names. A fixed random_state makes the shuffle
# repeatable for a given ordering of X; without it every run produces a
# different train/test assignment.
X_train, X_test = train_test_split(X, test_size=0.3, random_state=42)
X_train.shape, X_test.shape

((2391,), (1025,))

In [7]:
# Materialise the split on disk: every meme folder is copied into the
# directory of the split it was assigned to (train first, then test).
# NOTE: shutil.copytree raises if the destination already exists, so this
# cell expects empty train/test folders.
for split_names, split_root in ((X_train, train_dir), (X_test, test_dir)):
    for meme in split_names:
        source = os.path.join(new_dataset_dir, meme)
        destination = os.path.join(split_root, meme)
        shutil.copytree(source, destination)

In [17]:
import pandas as pd

# word_count.txt is read as space-separated rows with no header line.
# keep_default_na=False stops pandas from parsing tokens such as "null",
# "nan" or "NA" as missing values, which would blank those words in the
# exported metadata file.
voc = pd.read_csv('batches/word_count.txt', sep=' ', header=None,
                  keep_default_na=False)
# Re-export as a tab-separated file with named columns and no index column.
voc.to_csv('model5.1/train/metadata.tsv',
           sep='\t',
           index=False,
           header=['Word', 'Frequency'])

In [15]:
# Display the column labels of voc.
voc.columns

Int64Index([0, 1], dtype='int64')