In [6]:
import sys
import glob
import argparse
import numpy as np
import math
import cv2
from scipy.stats import multivariate_normal
from sklearn import mixture
import time
from sklearn import svm
from sklearn.decomposition import PCA
import os
import xml.etree.ElementTree as ET
from sklearn.metrics.pairwise import cosine_similarity
import timeit
from sklearn.metrics import label_ranking_average_precision_score
import pickle
import copy
import matplotlib.pyplot as plt

pca_obj = None
load_gmm_flag = True


def dictionary(descriptors, N):
    '''
    Dictionary of SIFT features using GMM
    '''
    g = mixture.GaussianMixture(n_components=N)
    descriptors = np.asarray(descriptors)
    g.fit(descriptors)
    return g.means_, g.covariances_, g.weights_


def image_descriptors(file):
    '''
    Getting the SIFT descriptors of the image
    '''
    try:
        img = cv2.imread(file, 0)
        # img = cv2.resize(img, (256, 256))
        # _, descriptors = cv2.xfeatures2d.SIFT_create(
        #     nfeatures=50).detectAndCompute(img, None)
        img = cv2.resize(img, (256, 256))
        _, descriptors = cv2.xfeatures2d.SIFT_create().detectAndCompute(
            img, None)
        return descriptors
    except:
        print(file)


def folder_descriptors(folder):
    '''
    Getting all the SIFT image descriptions in a folder
    '''
    files = glob.glob(folder + "/*.png")
    print("Calculating descriptors. Number of images is", len(files))
    return np.concatenate([
        image_descriptors(file) for file in files
        if image_descriptors(file) is not None
    ])


def likelihood_moment(x, ytk, moment):
    '''
    Calculating the likelihood moments
    '''
    x_moment = np.power(np.float32(x), moment) if moment > 0 else np.float32(
        [1])
    return x_moment * ytk


def likelihood_statistics(samples, means, covs, weights):
    '''
    Calculating the likelihood statistics to build the FV
    '''
    gaussians, s0, s1, s2 = {}, {}, {}, {}

    g = [
        multivariate_normal(mean=means[k], cov=covs[k], allow_singular=False)
        for k in range(0, len(weights))
    ]
    for index, x in enumerate(samples):
        gaussians[index] = np.array([g_k.pdf(x) for g_k in g])
    for k in range(0, len(weights)):
        s0[k], s1[k], s2[k] = 0, 0, 0
        for index, x in enumerate(samples):
            probabilities = np.multiply(gaussians[index], weights)
            probabilities = probabilities / np.sum(probabilities)
            s0[k] = s0[k] + likelihood_moment(x, probabilities[k], 0)
            s1[k] = s1[k] + likelihood_moment(x, probabilities[k], 1)
            s2[k] = s2[k] + likelihood_moment(x, probabilities[k], 2)
    return s0, s1, s2


def fisher_vector_weights(s0, s1, s2, means, covs, w, T):
    return np.float32(
        [((s0[k] - T * w[k]) / np.sqrt(w[k])) for k in range(0, len(w))])


def fisher_vector_means(s0, s1, s2, means, sigma, w, T):
    return np.float32([(s1[k] - means[k] * s0[k]) / (np.sqrt(w[k] * sigma[k]))
                       for k in range(0, len(w))])


def fisher_vector_sigma(s0, s1, s2, means, sigma, w, T):
    return np.float32([(s2[k] - 2 * means[k] * s1[k] +
                        (means[k] * means[k] - sigma[k]) * s0[k]) /
                       (np.sqrt(2 * w[k]) * sigma[k])
                       for k in range(0, len(w))])


def normalize(fisher_vector):
    '''
    Power and L2 Normalization
    '''
    v = np.multiply(np.sqrt(abs(fisher_vector)), np.sign(fisher_vector))
    return v / np.sqrt(np.dot(v, v))


def fisher_vector(samples, means, covs, w):
    '''
    Building the FV for a image, sample denotes a list of SIFT feature vectors
    '''
    # global pca_obj
    samples = reduceDimensions(samples)
    # samples = pca_obj.transform(samples)
    s0, s1, s2 = likelihood_statistics(samples, means, covs, w)
    T = samples.shape[0]
    covs = np.float32([np.diagonal(covs[k]) for k in range(0, covs.shape[0])])
    a = fisher_vector_weights(s0, s1, s2, means, covs, w, T)
    b = fisher_vector_means(s0, s1, s2, means, covs, w, T)
    c = fisher_vector_sigma(s0, s1, s2, means, covs, w, T)
    fv = np.concatenate(
        [np.concatenate(a),
         np.concatenate(b),
         np.concatenate(c)])
    fv = normalize(fv)
    # print("Fv")
    # print(fv)
    return np.array(fv)


def reduceDimensions(words):
    '''
    Using PCA to reduce dimensions
    '''
    global pca_obj
    global load_gmm_flag
    # print(words.shape)
    # if(load_gmm_flag):
    #     with open("/home/praveen/Desktop/iiith-assignments/CV/project/35k_weights/pca_dump", 'rb') as handle:
    #         pca_obj = pickle.load(handle)
    try:
        if (pca_obj is None):
            pca = PCA(n_components=64)
            pca_obj = pca.fit(words)
            with open("./pca_dump", 'wb') as handle:
                pickle.dump(pca_obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        res = pca_obj.transform(words)
        return res
    except:
        print("error in Reduce Dimensions")
        print("words shape: {0}".format(words.shape))


def loadPCA(path):
    global pca_obj
    with open(path + "/pca_dump", 'rb') as handle:
        pca_obj = pickle.load(handle)


def generate_gmm(input_folder, N):
    '''
    Generating the GMM and saving the parameters
    '''
    words = np.concatenate([
        folder_descriptors(folder) for folder in glob.glob(input_folder + '/*')
    ])
    words = reduceDimensions(words)
    print("Training GMM of size", N)
    means, covs, weights = dictionary(words, N)
    # Throw away gaussians with weights that are too small:
    # th = 1.0 / N
    th = 0
    means = np.float32(
        [m for k, m in zip(range(0, len(weights)), means) if weights[k] > th])
    covs = np.float32(
        [m for k, m in zip(range(0, len(weights)), covs) if weights[k] > th])
    weights = np.float32([
        m for k, m in zip(range(0, len(weights)), weights) if weights[k] > th
    ])

    np.save("means.gmm", means)
    np.save("covs.gmm", covs)
    np.save("weights.gmm", weights)
    return means, covs, weights


def get_fisher_vectors_from_folder(folder, gmm):
    '''
    Getting the FVs of all the images in the folder
    '''
    files = glob.glob(folder + "/*.png")
    res = {}
    for file in files:
        temp = image_descriptors(file)
        if (temp is not None):
            # print(temp)
            # print(os.path.basename(file))
            res[os.path.basename(file)] = np.float32(fisher_vector(temp, *gmm))
    return res
    # return np.float32([fisher_vector(image_descriptors(file), *gmm) for file in files])


def fisher_features(folder, gmm):
    '''
    Getting the FVs of all the images in the subfolders in the directory
    '''
    folders = glob.glob(folder + "/*")
    res = {}
    for f in folders:
        res.update(get_fisher_vectors_from_folder(f, gmm))
    return res


def get_image_mapping_from_folder(folder):
    '''
    Getting the Image Name to absolute path mapping
    '''
    files = glob.glob(folder + "/*.png")
    res = {}
    for file in files:
        res[os.path.basename(file)] = os.path.abspath(file)
    return res
    # return np.float32([fisher_vector(image_descriptors(file), *gmm) for file in files])


def get_image_mappings(folder):
    '''
    Getting the Image Name to absolute path mapping recursively
    '''
    folders = glob.glob(folder + "/*")
    res = {}
    for f in folders:
        res.update(get_image_mapping_from_folder(f))
    return res


def train(gmm, features):
    '''
    Not used
    '''
    print(features)
    X = np.concatenate(features.values)
    Y = np.concatenate([
        np.float32([i] * len(v))
        for i, v in zip(range(0, len(features)), features.values())
    ])

    clf = svm.SVC()
    clf.fit(X, Y)
    return clf


def compare(features):
    pass


def success_rate(classifier, features):
    '''
    Not used
    '''
    print("Applying the classifier...")
    X = np.concatenate(np.array(features.values()))
    Y = np.concatenate([
        np.float32([i] * len(v))
        for i, v in zip(range(0, len(features)), features.values())
    ])
    res = float(sum([a == b
                     for a, b in zip(classifier.predict(X), Y)])) / len(Y)
    return res


def load_gmm(folder=""):
    '''
    Not used
    '''
    print("in load gmm")
    print(folder)
    files = ["means.gmm.npy", "covs.gmm.npy", "weights.gmm.npy"]
    res = map(lambda file: np.load(file), map(lambda s: folder + "/" + s,
                                              files))
    # print(list(res))
    return res


def get_word_strings_from_file(file_path):
    '''
    Getting the word strings from the xml filepath
    '''
    res = {}
    tree = ET.parse(file_path)
    root = tree.getroot()
    lines = root.findall("./handwritten-part/line")
    for line in lines:
        for word in line.findall('word'):
            id = word.get('id')
            word_string = word.get('text')
            res[id + ".png"] = word_string
    return res


def extractWordStrings(folder_path):
    '''
    Extracting the word strings from all the xml files present in the folder
    '''
    word_strings = {}
    folders = glob.glob(folder_path + "/*.xml")
    for file in folders:
        word_strings.update(get_word_strings_from_file(file))
    return word_strings


def get_word_string(path):
    pass


def MAPScore(query_path,
             word_strings_dict,
             fisher_features,
             gmm,
             image_mapping_dict,
             show_img_flag=False):
    '''
    Getting the MAP score for the given image query
    '''
    if (show_img_flag):
        img = plt.imread(query_path)
        imgplot = plt.imshow(img)
        plt.show()
    query_sift_features = image_descriptors(query_path)
    if (query_sift_features is None):
        return 0
    # print("path: {0}".format(query_path))
    # print(query_sift_features.shape)
    temp = copy.deepcopy(gmm)
    query_FV = fisher_vector(query_sift_features, *temp)
    # print(query_FV)
    query_FV = query_FV.reshape(1, -1)
    FV_values = np.array(list(fisher_features.values()))
    FV_keys = np.array(list(fisher_features.keys()))
    similarity_score = cosine_similarity(query_FV, FV_values)
    # print(similarity_score.shape)
    max_index = np.argmax(similarity_score)
    top_5_indices = similarity_score.flatten().argsort()[-5:][::-1]
    if (show_img_flag):
        print("top 5 indices {0}".format(top_5_indices))
        for i in top_5_indices:
            match_img_path = image_mapping_dict[FV_keys[i]]
            print("Matching image path: {0}".format(match_img_path))
            img = plt.imread(match_img_path)
            imgplot = plt.imshow(img)
            plt.show()
    query_string = word_strings_dict[os.path.basename(query_path)]
    word_vals = np.array(
        [word_strings_dict[your_key] for your_key in fisher_features.keys()])
    word_vals = word_vals.flatten()
    y_true = np.array(
        [[int(1) if s == query_string else int(0) for s in word_vals]])
    mape = label_ranking_average_precision_score(y_true, similarity_score)
    return mape


def get_args():
    '''
    Getting the command line arguments
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-d', "--dir", help="Directory with images", default='.')
    parser.add_argument(
        '-dxml', "--dirxml", help="Directory with xml", default='.')
    parser.add_argument(
        "-g",
        "--loadgmm",
        help="Load Gmm dictionary",
        action='store_true',
        default=False)
    parser.add_argument(
        '-n',
        "--number",
        help="Number of words in dictionary",
        default=16,
        type=int)
    args = parser.parse_args()
    return args


####################################     Main     #####################################
working_folder = "/home/praveen/Desktop/iiith-assignments/CV/project/kaggle_data_35k/a01"
dir_xml = "/home/praveen/Desktop/iiith-assignments/CV/project/kaggle_data_35k/xml_testing/"
load_folder = "35k_weights"

print(working_folder)
no_gaussians = 16
print("no. of weights {0}".format(no_gaussians))
start = timeit.default_timer()
gmm = load_gmm(load_folder) if load_gmm_flag else generate_gmm(
    working_folder, no_gaussians)
stop = timeit.default_timer()
print('Time taken for training GMM: ', stop - start)
print("gmm: {0}".format(gmm))

FV_features = None
if (load_gmm_flag):
    loadPCA(load_folder)
if (load_gmm_flag):
    with open(load_folder + "/FV_dump", 'rb') as handle:
        FV_features = pickle.load(handle)
else:
    FV_features = fisher_features(working_folder, gmm)
    with open("./FV_dump", 'wb') as handle:
        pickle.dump(fisher_features, handle, protocol=pickle.HIGHEST_PROTOCOL)

word_strings_dict = None
if (load_gmm_flag):
    with open(load_folder + "/word_string_dict_dump", 'rb') as handle:
        word_strings_dict = pickle.load(handle)
else:
    word_strings_dict = extractWordStrings(dir_xml)
    with open("./word_string_dict_dump", 'wb') as handle:
        pickle.dump(
            word_strings_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
image_mapping_dict = get_image_mappings(working_folder)
scores = []
while (True):
    query_type = input(
        "Press 1 for test of multiple images\nPress 2 for single image\nPress 0 to exit\n"
    )
    if (int(query_type) == 0):
        break
    if (int(query_type) == 1):
        score_list = []
        test_data_path = input("Enter query images folder path: ")
        folders = glob.glob(test_data_path + "/*")
        count = 0
        for folder in folders:
            image_paths = glob.glob(folder + "/*.png")
            for img_path in image_paths:
                count += 1
                # print("count: {0}".format(count))
                score = MAPScore(img_path, word_strings_dict, FV_features, gmm,
                                 image_mapping_dict, False)
                score_list.append(score)
        score_list = np.array(score_list)
        print("MAP Score: {0}".format(np.mean(score_list)))
    else:
        query_path = input("Enter query image path: ")
        if (query_path == "break"):
            break
        score = MAPScore(query_path, word_strings_dict, FV_features, gmm,
                         image_mapping_dict, True)
        scores.append(score)
        print("MAP Score: {0}".format(score))

/home/praveen/Desktop/iiith-assignments/CV/project/kaggle_data_35k/a01
no. of weights 16
in load gmm
35k_weights
Time taken for training GMM:  0.00017261299944948405
gmm: <map object at 0x7fd6551a3400>


FileNotFoundError: [Errno 2] No such file or directory: '35k_weights/pca_dump'

# New Section

In [None]:
words = np.concatenate([folder_descriptors(folder)
                          for folder in glob.glob(input_folder + '/*')])
words = reduceDimensions(words)

In [5]:
!pip uninstall -y opencv-python
!pip uninstall -y opencv-contrib-python
!pip install opencv-contrib-python==3.3.0.9

[33mSkipping opencv-python as it is not installed.[0m
Uninstalling opencv-contrib-python-3.3.0.9:
  Successfully uninstalled opencv-contrib-python-3.3.0.9
Collecting opencv-contrib-python==3.3.0.9
  Using cached https://files.pythonhosted.org/packages/4f/72/5d4dbcac00066dc71b3b05a6e5562e05b077e4719f4cf0b059ab9ce667cc/opencv_contrib_python-3.3.0.9-cp36-cp36m-manylinux1_x86_64.whl
Installing collected packages: opencv-contrib-python
Successfully installed opencv-contrib-python-3.3.0.9


In [3]:
!unzip 'test_data.zip'

Archive:  test_data.zip
   creating: test_data/
   creating: test_data/test_seen_data/
   creating: test_data/test_seen_data/a01-014u/
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-00.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-01.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-02.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-03.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-04.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-00-05.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-00.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-01.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-02.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-03.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-04.png  
  inflating: test_data/test_seen_data/a01-014u/a01-014u-01-05.png  
  inflating: test_data/test_seen_data/a01-014u/a0

In [4]:
!unzip '35k_weights.zip'

Archive:  35k_weights.zip
   creating: 35k_weights/
  inflating: 35k_weights/covs.gmm.npy  
  inflating: 35k_weights/means.gmm.npy  
  inflating: 35k_weights/weights.gmm.npy  
  inflating: 35k_weights/pca_dump    
  inflating: 35k_weights/FV_dump     
  inflating: 35k_weights/word_string_dict_dump  


In [1]:
print("hello")

hello


In [7]:
import numpy as np
from sklearn.kernel_approximation import RBFSampler
from copy import deepcopy

In [8]:
bigrams = """ AA 0.003% BA 0.146% CA 0.538% DA 0.151% EA 0.688% FA 0.164% GA 0.148% HA 0.926% IA 0.286% JA 0.026% KA 0.017% LA 0.528% MA 0.565% 
NA 0.347% OA 0.057% PA 0.324% QA 0.000% RA 0.686% SA 0.218% TA 0.530% UA 0.136% VA 0.140% WA 0.385% XA 0.030% YA 0.016% ZA 0.025% 
AB 0.230% BB 0.011% CB 0.001% DB 0.003% EB 0.027% FB 0.000% GB 0.000% HB 0.004% IB 0.099% JB 0.000% KB 0.001% LB 0.007% MB 0.090% 
NB 0.004% OB 0.097% PB 0.001% QB 0.000% RB 0.027% SB 0.008% TB 0.003% UB 0.089% VB 0.000% WB 0.001% XB 0.000% YB 0.004% ZB 0.000% 
AC 0.448% BC 0.002% CC 0.083% DC 0.003% EC 0.477% FC 0.001% GC 0.000% HC 0.001% IC 0.699% JC 0.000% KC 0.000% LC 0.012% MC 0.004% 
NC 0.416% OC 0.166% PC 0.001% QC 0.000% RC 0.121% SC 0.155% TC 0.026% UC 0.188% VC 0.000% WC 0.001% XC 0.026% YC 0.014% ZC 0.000% 
AD 0.368% BD 0.002% CD 0.002% DD 0.043% ED 1.168% FD 0.000% GD 0.003% HD 0.003% ID 0.296% JD 0.000% KD 0.001% LD 0.253% MD 0.001% 
ND 1.352% OD 0.195% PD 0.001% QD 0.000% RD 0.189% SD 0.005% TD 0.001% UD 0.091% VD 0.000% WD 0.004% XD 0.000% YD 0.007% ZD 0.000% 
AE 0.012% BE 0.576% CE 0.651% DE 0.765% EE 0.378% FE 0.237% GE 0.385% HE 3.075% IE 0.385% JE 0.052% KE 0.214% LE 0.829% ME 0.793% 
NE 0.692% OE 0.039% PE 0.478% QE 0.000% RE 1.854% SE 0.932% TE 1.205% UE 0.147% VE 0.825% WE 0.361% XE 0.022% YE 0.093% ZE 0.050% 
AF 0.074% BF 0.000% CF 0.001% DF 0.003% EF 0.163% FF 0.146% GF 0.001% HF 0.002% IF 0.203% JF 0.000% KF 0.002% LF 0.053% MF 0.004% 
NF 0.067% OF 1.175% PF 0.001% QF 0.000% RF 0.032% SF 0.017% TF 0.006% UF 0.019% VF 0.000% WF 0.002% XF 0.002% YF 0.001% ZF 0.000% 
AG 0.205% BG 0.000% CG 0.001% DG 0.031% EG 0.120% FG 0.001% GG 0.025% HG 0.000% IG 0.255% JG 0.000% KG 0.003% LG 0.006% MG 0.001% 
NG 0.953% OG 0.094% PG 0.000% QG 0.000% RG 0.100% SG 0.002% TG 0.002% UG 0.128% VG 0.000% WG 0.000% XG 0.000% YG 0.003% ZG 0.000%
AH 0.014% BH 0.001% CH 0.598% DH 0.005% EH 0.026% FH 0.000% GH 0.228% HH 0.001% IH 0.002% JH 0.000% KH 0.003% LH 0.002% MH 0.001% 
NH 0.011% OH 0.021% PH 0.094% QH 0.000% RH 0.015% SH 0.315% TH 3.556% UH 0.001% VH 0.000% WH 0.379% XH 0.004% YH 0.001% ZH 0.001% 
AI 0.316% BI 0.107% CI 0.281% DI 0.493% EI 0.183% FI 0.285% GI 0.152% HI 0.763% II 0.023% JI 0.003% KI 0.098% LI 0.624% MI 0.318% 
NI 0.339% OI 0.088% PI 0.123% QI 0.000% RI 0.728% SI 0.550% TI 1.343% UI 0.101% VI 0.270% WI 0.374% XI 0.039% YI 0.029% ZI 0.012% 
AJ 0.012% BJ 0.023% CJ 0.000% DJ 0.005% EJ 0.005% FJ 0.000% GJ 0.000% HJ 0.000% IJ 0.001% JJ 0.000% KJ 0.000% LJ 0.000% MJ 0.000%
NJ 0.011% OJ 0.007% PJ 0.000% QJ 0.000% RJ 0.001% SJ 0.000% TJ 0.000% UJ 0.001% VJ 0.000% WJ 0.000% XJ 0.000% YJ 0.000% ZJ 0.000% 
AK 0.105% BK 0.000% CK 0.118% DK 0.000% EK 0.016% FK 0.000% GK 0.000% HK 0.000% IK 0.043% JK 0.000% KK 0.000% LK 0.020% MK 0.000% 
NK 0.052% OK 0.064% PK 0.001% QK 0.000% RK 0.097% SK 0.039% TK 0.000% UK 0.005% VK 0.000% WK 0.001% XK 0.000% YK 0.000% ZK 0.000% 
AL 1.087% BL 0.233% CL 0.149% DL 0.032% EL 0.530% FL 0.065% GL 0.061% HL 0.013% IL 0.432% JL 0.000% KL 0.011% LL 0.577% ML 0.005% 
NL 0.064% OL 0.365% PL 0.263% QL 0.000% RL 0.086% SL 0.056% TL 0.098% UL 0.346% VL 0.000% WL 0.015% XL 0.001% YL 0.015% ZL 0.001%
AM 0.285% BM 0.003% CM 0.003% DM 0.018% EM 0.374% FM 0.001% GM 0.010% HM 0.013% IM 0.318% JM 0.000% KM 0.002% LM 0.023% MM 0.096% 
NM 0.028% OM 0.546% PM 0.016% QM 0.000% RM 0.175% SM 0.065% TM 0.026% UM 0.138% VM 0.000% WM 0.001% XM 0.000% YM 0.024% ZM 0.000% 
AN 1.985% BN 0.002% CN 0.001% DN 0.008% EN 1.454% FN 0.000% GN 0.066% HN 0.026% IN 2.433% JN 0.000% KN 0.051% LN 0.006% MN 0.009% 
NN 0.073% ON 1.758% PN 0.001% QN 0.000% RN 0.160% SN 0.009% TN 0.010% UN 0.394% VN 0.000% WN 0.079% XN 0.000% YN 0.013% ZN 0.000% 
AO 0.005% BO 0.195% CO 0.794% DO 0.188% EO 0.073% FO 0.488% GO 0.132% HO 0.485% IO 0.835% JO 0.054% KO 0.006% LO 0.387% MO 0.337% 
NO 0.465% OO 0.210% PO 0.361% QO 0.000% RO 0.727% SO 0.398% TO 1.041% UO 0.011% VO 0.071% WO 0.222% XO 0.003% YO 0.150% ZO 0.007% 
AP 0.203% BP 0.001% CP 0.001% DP 0.002% EP 0.172% FP 0.000% GP 0.000% HP 0.001% IP 0.089% JP 0.000% KP 0.001% LP 0.019% MP 0.239% 
NP 0.006% OP 0.224% PP 0.137% QP 0.000% RP 0.042% SP 0.191% TP 0.004% UP 0.136% VP 0.000% WP 0.001% XP 0.067% YP 0.025% ZP 0.000% 
AQ 0.002% BQ 0.000% CQ 0.005% DQ 0.001% EQ 0.057% FQ 0.000% GQ 0.000% HQ 0.000% IQ 0.011% JQ 0.000% KQ 0.000% LQ 0.000% MQ 0.000% 
NQ 0.006% OQ 0.001% PQ 0.000% QQ 0.000% RQ 0.001% SQ 0.007% TQ 0.000% UQ 0.000% VQ 0.000% WQ 0.000% XQ 0.000% YQ 0.000% ZQ 0.000% 
AR 1.075% BR 0.112% CR 0.149% DR 0.085% ER 2.048% FR 0.213% GR 0.197% HR 0.084% IR 0.315% JR 0.000% KR 0.003% LR 0.010% MR 0.003% 
NR 0.009% OR 1.277% PR 0.474% QR 0.000% RR 0.121% SR 0.006% TR 0.426% UR 0.543% VR 0.001% WR 0.031% XR 0.000% YR 0.008% ZR 0.000%
AS 0.871% BS 0.046% CS 0.023% DS 0.126% ES 1.339% FS 0.006% GS 0.051% HS 0.015% IS 1.128% JS 0.000% KS 0.048% LS 0.142% MS 0.093% 
NS 0.509% OS 0.290% PS 0.055% QS 0.000% RS 0.397% SS 0.405% TS 0.337% US 0.454% VS 0.001% WS 0.035% XS 0.000% YS 0.097% ZS 0.000%
AT 1.487% BT 0.017% CT 0.461% DT 0.003% ET 0.413% FT 0.082% GT 0.015% HT 0.130% IT 1.123% JT 0.000% KT 0.001% LT 0.124% MT 0.001% 
NT 1.041% OT 0.442% PT 0.106% QT 0.000% RT 0.362% ST 1.053% TT 0.171% UT 0.405% VT 0.000% WT 0.007% XT 0.047% YT 0.017% ZT 0.000% 
AU 0.119% BU 0.185% CU 0.163% DU 0.148% EU 0.031% FU 0.096% GU 0.086% HU 0.074% IU 0.017% JU 0.059% KU 0.003% LU 0.135% MU 0.115% 
NU 0.079% OU 0.870% PU 0.105% QU 0.148% RU 0.128% SU 0.311% TU 0.255% UU 0.001% VU 0.002% WU 0.001% XU 0.005% YU 0.001% ZU 0.002% 
AV 0.205% BV 0.004% CV 0.000% DV 0.019% EV 0.255% FV 0.000% GV 0.000% HV 0.000% IV 0.288% JV 0.000% KV 0.000% LV 0.035% MV 0.000%
NV 0.052% OV 0.178% PV 0.000% QV 0.000% RV 0.069% SV 0.001% TV 0.001% UV 0.003% VV 0.000% WV 0.000% XV 0.002% YV 0.000% ZV 0.000% 
AW 0.060% BW 0.000% CW 0.000% DW 0.008% EW 0.117% FW 0.000% GW 0.001% HW 0.005% IW 0.001% JW 0.000% KW 0.002% LW 0.013% MW 0.001% 
NW 0.006% OW 0.330% PW 0.001% QW 0.000% RW 0.013% SW 0.024% TW 0.082% UW 0.000% VW 0.000% WW 0.000% XW 0.000% YW 0.003% ZW 0.000% 
AX 0.019% BX 0.000% CX 0.000% DX 0.000% EX 0.214% FX 0.000% GX 0.000% HX 0.000% IX 0.022% JX 0.000% KX 0.000% LX 0.000% MX 0.000%
NX 0.003% OX 0.019% PX 0.000% QX 0.000% RX 0.001% SX 0.000% TX 0.000% UX 0.004% VX 0.000% WX 0.000% XX 0.003% YX 0.000% ZX 0.000% 
AY 0.217% BY 0.176% CY 0.042% DY 0.050% EY 0.144% FY 0.009% GY 0.026% HY 0.050% IY 0.000% JY 0.000% KY 0.006% LY 0.425% MY 0.062% 
NY 0.098% OY 0.036% PY 0.012% QY 0.000% RY 0.248% SY 0.057% TY 0.227% UY 0.005% VY 0.005% WY 0.002% XY 0.003% YY 0.000% ZY 0.002%
AZ 0.012% BZ 0.000% CZ 0.001% DZ 0.000% EZ 0.005% FZ 0.000% GZ 0.000% HZ 0.000% IZ 0.064% JZ 0.000% KZ 0.000% LZ 0.000% MZ 0.000% 
NZ 0.004% OZ 0.003% PZ 0.000% QZ 0.000% RZ 0.001% SZ 0.000% TZ 0.004% UZ 0.002% VZ 0.000% WZ 0.000% XZ 0.000% YZ 0.002% ZZ 0.003%"""

In [9]:
A = bigrams.split()
words = [A[i] for i in range(len(A)) if i%2==0 ]
percentage = [float(A[i].split('%')[0]) for i in range(len(A)) if i%2==1]
percentage, words = zip(*sorted(zip(percentage, words), reverse=True))
most_frequent_bigrams = words[:75]

In [10]:
class PHOC():
    def __init__(self):
        self.most_frequent_bigrams = most_frequent_bigrams
        self.most_frequent_bigrams = [
            i.lower() for i in self.most_frequent_bigrams
        ]

    def calculate_frequency(self, word):
        frequency = [0] * 26
        for c in word:
            frequency[ord(c) - ord('a')] = 1
        return frequency

    def calculate_frequency_of_bigram(self, word):
        frequency = [0] * 75
        for i, bigram in enumerate(self.most_frequent_bigrams):
            if bigram in word:
                frequency[i] = 1
        return frequency

    def calculate_level_2(self, word):
        n = int(len(word) / 2)
        one, two = word[:n], word[n:]
        result = self.calculate_frequency(one)
        result += self.calculate_frequency(two)
        return result

    def calculate_level_2_bigram(self, word):
        n = int(len(word) / 2)
        one, two = word[:n], word[n:]
        result = self.calculate_frequency_of_bigram(one)
        result += self.calculate_frequency_of_bigram(two)
        return result

    def calculate_level_3(self, word):
        n = int(len(word) / 3)
        one, two = word[:n], word[n:]
        result = self.calculate_frequency(one)
        result += self.calculate_level_2(two)
        return result

    def calculate_level_4(self, word):
        n = int(len(word) / 2)
        one, two = word[:n], word[n:]
        result = self.calculate_level_2(one)
        result += self.calculate_level_2(two)
        return result

    def __call__(self, word_in):
        word = deepcopy(word_in).lower()
        result = []
        result += self.calculate_level_2(word)
        result += self.calculate_level_3(word)
        result += self.calculate_level_4(word)
        result += self.calculate_level_2_bigram(word)
        return result

In [11]:
L = PHOC()('beyond')

In [14]:
set(L)

{0, 1}

In [12]:
sum(L)

21

In [53]:
class CCA():
    def __init__(self, output_dim=192):
        self.output_dim = 192
    
    def find_w(self, Caa, Cab, Cbb, Cba):
        Z = np.multiply(np.linalg.inv(Caa), Cab)
        Z = np.multiply(Z, np.linalg.inv(Cbb))
        Z = np.multiply(Z, Cba)
        eigen_values, eigen_vectors = np.linalg.eig(Z)
        eigen_values = np.absolute(eigen_values)
        ix = np.argsort(eigen_values)[::-1]
        eigen_vectors = eigen_vectors[ix]
        return eigen_vectors[:self.output_dim]
        
    ### shape of A Nxd
    ### shape of B Nxd
    def fit(self, A, B):
        A = deepcopy(A).T
        B = deepcopy(B).T
        N = len(A) 
        mu_a = A.mean(axis=1)
        mu_b = B.mean(axis=1)
        Caa = (1/N)*(np.multiply((A-mu_a), (A-mu_a).T))
        Cbb = (1/N)*(np.multiply((B-mu_a), (B-mu_b).T))
        Cab = (1/N)*(np.multiply((A-mu_a), (B-mu_b).T))
        Cba = Cab.T
        self.project_a = find_w(Caa, Cab, Cbb, Cba)
        self.project_b = find_w(Cbb, Cba, Caa, Cab)
        
    def transform_a(self, A):
        A = deepcopy(A)
        return np.multiply(A, project_a)
    
    def transform_b(self, B):
        B = deepcopy(B)
        return np.multiply(B, project_b)

In [54]:
class KCCA():
    def __init__(self):
        self.CCA = CCA(output_dim=256)
    
    ### shape of A Nxd
    def fit(self, A, B):
        A = deepcopy(A)
        B = deepcopy(B)
        self.rbf_feature_A = RBFSampler(gamma=1, n_components=len(A))
        self.rbf_feature_B = RBFSampler(gamma=1, n_components=len(B))
        self.rbf_feature_A.fit(A)
        self.rbf_feature_B.fit(B)
        A = self.rbf_feature_A.transform(A)
        B = self.rbf_feature_B.transform(B)
        self.CCA.fit(A, B)
        
    def transform_a(self, A):
        A = deepcopy(A)
        A = self.rbf_feature_A.transform(A)
        return self.CCA.transform_a(A)
    
    def transform_b(self, B):
        B = deepcopy(B)
        B = self.rbf_feature_B.transform(B)
        return self.CCA.transform_b(B)

In [15]:
import numpy as np
import multiprocessing as mp
from functools import partial

In [25]:
def my_func(x,y):
    y =  x*y
    return x, y, 4

def main():
    print(mp.cpu_count())
    pool = mp.Pool(mp.cpu_count())
    func = partial(my_func, 2)
    result = pool.map(func, [i for i in range(6)])
    pool.close()
    pool.join()
    print(result)
    return result

In [28]:
t = main()
w = [x[1] for x in t]
print(w)

4
[(2, 0, 4), (2, 2, 4), (2, 4, 4), (2, 6, 4), (2, 8, 4), (2, 10, 4)]
[0, 2, 4, 6, 8, 10]


In [22]:
from sklearn.cross_decomposition import CCA
import numpy as np
X = [[0., 0., 1.], [1.,0.,0.], [2.,2.,2.], [3.,5.,4.]]
Y = [[0.1, -0.2], [0.9, 1.1], [6.2, 5.9], [11.9, 12.3]]
cca = CCA(n_components=1)
cca.fit(X, Y)

X__ = cca.transform(X,None)
X_c, Y_c = cca.transform(X, Y)

cca2 = CCA(n_components=1)
cca2.fit(Y, X)
Y_c2, X_c2 = cca2.transform(Y, X)
y__ = cca2.transform(Y, None)
print(X__)
print(y__)
print()
print(np.dot(np.array(X), np.array(cca.x_rotations_)))
print(np.dot(np.array(Y), np.array(cca2.x_rotations_)))
print("######################################")
print(X_c)
print()
print(X_c2)
print("######################################")
print(Y_c)
print()
print(Y_c2)

[[-1.3373174 ]
 [-1.10847164]
 [ 0.40763151]
 [ 2.03815753]]
[[-0.46290954]
 [-0.31107219]
 [ 0.07773969]
 [ 0.69624204]]

[[0.60252733]
 [0.75090645]
 [3.24760898]
 [6.01468223]]
[[-0.22330781]
 [ 0.64764529]
 [ 2.92412441]
 [ 6.50661277]]
######################################
[[-1.3373174 ]
 [-1.10847164]
 [ 0.40763151]
 [ 2.03815753]]

[[-0.81747774]
 [-0.54933971]
 [ 0.13728484]
 [ 1.22953261]]
######################################
[[-0.85511537]
 [-0.70878547]
 [ 0.26065014]
 [ 1.3032507 ]]

[[-0.46290954]
 [-0.31107219]
 [ 0.07773969]
 [ 0.69624204]]
