# Semisupervised territory

## Classify the unlabeled progeny seeds with parents

- We established with the previous steps that Traditional+ECT+UMAP combined shape descriptors provide an accurate description of seed morphology.

- These descriptors balance both spike- and accession-level morphological nuances

- We will focus on the collection of parameters that offered the highest classification results
    - 158 directions
    - 16 thresholds
    - Reduced to 12 dimensions with UMAP
    - Combined with traditional shape descriptors

In [15]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import string
import pandas as pd
%matplotlib inline

import umap

## Load and wrangle data

- This time we load the information for **all** seeds.
   - Progeny
   - $F_{18}$
   - $F_{58}$

In [2]:
# Directory holding the descriptor tables; the file name encodes the number of
# ECT directions (d) and thresholds (T).
src = '../../preproc/ects/results/'
d, T = 158, 16
combined_file = f'{src}combined_d{d}_T{T}.csv'

# Read the full table, report its dimensions and preview the first rows.
data = pd.read_csv(combined_file)
print(data.shape)
data.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


(37881, 2548)


Unnamed: 0,Scan,Color,Generation,Selection,Progeny,Label (C-G-S-P),Founder,Label,Length,Width,...,10075,10079,10083,10087,10091,10095,10099,10103,10107,10111
0,S001,Blue,1,184,3,B-1-184-3,,2,86.35686,41.55676,...,0,0,1,1,1,1,1,1,1,1
1,S001,Blue,1,184,3,B-1-184-3,,2,86.4778,39.82868,...,0,1,1,1,1,1,1,1,1,1
2,S001,Blue,1,184,3,B-1-184-3,,2,84.80749,40.89001,...,2,1,1,1,1,1,1,1,1,1
3,S001,Blue,1,184,3,B-1-184-3,,2,76.43452,36.51576,...,-5,0,1,1,1,1,1,1,1,1
4,S001,Blue,1,184,3,B-1-184-3,,2,88.91815,41.11696,...,2,1,1,1,1,1,1,1,1,1


In [3]:
# Sorted unique values of the Founder column; rows with a missing Founder are
# dropped before de-duplication.
founder_names = np.unique(data['Founder'].dropna().to_numpy())
print(founder_names)

['Algerian' 'Alpha' 'Arequipa' 'Atlas' 'California Mariout (81A)'
 'Club Mariout' 'Everest' 'Flynn' 'Glabron' 'Golden Pheasant'
 'Good Delta (104)' 'Han River' 'Hannchen' 'Horn' 'Lion' 'Lyallpur'
 'Maison Carree (Carre 42)' 'Manchuria' 'Meloy' 'Minia' 'Multan'
 'Oderbrucker' 'Orel' 'Palmella Blue (79B)' 'Sandrel' 'Trebi'
 'White Smyrna' 'Wisconsin Winter']


In [4]:
# Metadata columns: the first eight columns of the table plus 'Tag'.
foo = [*data.columns[:8], 'Tag']
meta_traits = pd.Index(foo)

# Columns 8-18 hold the traditional descriptors and columns 20 onward the ECT
# values. Column 19 belongs to neither group (presumably it is the 'Tag'
# column -- verify against the CSV header).
trad_traits, topo_traits = data.columns[8:19], data.columns[20:]

print(meta_traits, trad_traits, topo_traits, sep='\n')

Index(['Scan', 'Color', 'Generation', 'Selection', 'Progeny',
       'Label (C-G-S-P)', 'Founder', 'Label', 'Tag'],
      dtype='object')
Index(['Length', 'Width', 'Height', 'HeightMax', 'Shell', 'Area', 'Vol',
       'ConvexArea', 'ConvexVol', 'ConvexAreaRatio', 'ConvexVolRatio'],
      dtype='object')
Index(['3', '7', '11', '15', '19', '23', '27', '31', '35', '39',
       ...
       '10075', '10079', '10083', '10087', '10091', '10095', '10099', '10103',
       '10107', '10111'],
      dtype='object', length=2528)


## Separate data according to their generation

In [5]:
# Partition the rows by the value of the Generation column:
# 0 -> founders, 1 -> gen1, 7 -> gen7.
founders = data.loc[data['Generation'] == 0]
gen1 = data.loc[data['Generation'] == 1]
gen7 = data.loc[data['Generation'] == 7]

print(founders.shape, gen1.shape, gen7.shape, sep='\n')

(3121, 2548)
(27934, 2548)
(6826, 2548)


In [6]:
# Keep only the traditional shape-descriptor columns for each generation subset.
trad_founders, trad_gen1, trad_gen7 = (
    subset[trad_traits] for subset in (founders, gen1, gen7)
)
print(trad_founders.shape, trad_gen1.shape, trad_gen7.shape, sep='\n')
trad_founders.head()

(3121, 11)
(27934, 11)
(6826, 11)


Unnamed: 0,Length,Width,Height,HeightMax,Shell,Area,Vol,ConvexArea,ConvexVol,ConvexAreaRatio,ConvexVolRatio
838,82.15945,47.84965,34.13562,36.91235,7715,13772,66927,9097.25802,68787.66667,1.51386,0.97295
839,77.96905,38.98826,31.59553,33.75345,6263,11876,52632,7686.21513,53353.33333,1.5451,0.98648
840,73.92173,41.98139,32.21753,36.05923,6301,11400,52637,7563.55058,53522.83333,1.50723,0.98345
841,79.44825,42.6868,34.10396,36.59038,7157,12474,60547,8394.33406,61666.66667,1.486,0.98184
842,81.29421,47.44167,35.25533,38.60691,7514,13938,66725,9003.03284,68338.83333,1.54814,0.97638


In [7]:
# Keep only the ECT (topological) columns for each generation subset.
ect_founders, ect_gen1, ect_gen7 = (
    subset[topo_traits] for subset in (founders, gen1, gen7)
)
print(ect_founders.shape, ect_gen1.shape, ect_gen7.shape, sep='\n')
ect_founders.head()

(3121, 2528)
(27934, 2528)
(6826, 2528)


Unnamed: 0,3,7,11,15,19,23,27,31,35,39,...,10075,10079,10083,10087,10091,10095,10099,10103,10107,10111
838,8,-3,5,10,-3,1,1,1,1,1,...,-5,2,1,1,1,1,1,1,1,1
839,1,-1,-3,2,5,-1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
840,11,-1,1,2,1,0,1,1,1,1,...,2,1,1,1,1,1,1,1,1,1
841,1,6,-4,3,-3,1,1,1,1,1,...,0,1,1,1,1,1,1,1,1,1
842,19,5,15,9,0,1,1,1,1,1,...,6,0,1,1,1,1,1,1,1,1


## UMAP dimension reduction

- [Uniform Manifold Approximation and Projection (UMAP)](https://arxiv.org/abs/1802.03426v1) draws several ideas from TDA. 


- Center the shape descriptors and scale them to variance 1

In [8]:
# UMAP hyper-parameters. Kept as their own dict because the cells that save the
# embeddings unpack its values, in this order, into the output file names.
umap_params = {'n_neighbors':50, 'min_dist':0.1, 'n_components':12, 'metric':'manhattan'}

# UMAP is stochastic: without a fixed seed every run yields a different
# embedding, and therefore different saved coordinates and SVM labels.
# The seed is passed separately so `umap_params` (and the file names derived
# from it) stay unchanged.
umap_seed = 42

# Learn the embedding from the founders' ECT values only, then embed the founders
# with the fitted model.
umap_trans = umap.UMAP(random_state=umap_seed, **umap_params).fit(ect_founders.values)
u_founders = umap_trans.transform(ect_founders.values)

# Examining the progeny

## Reduce the dimension of the progeny

- A fitted UMAP model can be reused to reduce the dimension of new data with the embedding it has already learned.

- Rather than reducing the dimension of the progeny on its own, we project the progeny into the same low-dimensional space that was learned when we reduced the dimension of the founders.

In [10]:
# Embed the generation-1 ECT values with the UMAP model fitted on the founders
# (no refit), then print the per-dimension mean and standard deviation.
u_gen1 = umap_trans.transform(ect_gen1.to_numpy())
print(u_gen1.mean(axis=0))
print(u_gen1.std(axis=0))

[4.7290807 5.4229646 6.037328  8.37816   8.285648  5.4247866 2.7259305
 6.3705177 6.138964  2.9217398 5.181975  4.8773403]
[0.5451504  0.3085204  0.29192272 0.37950334 0.52675617 0.31155252
 0.13315293 0.43406472 0.23332867 0.2303309  0.30220038 0.2508165 ]


In [11]:
# Select generation 1 as the progeny to label.
# NOTE: a later cell reassigns these same three names for generation 7, so on a
# top-to-bottom run this selection is overwritten before it is used.
gen, u_progeny, trad_progeny = 1, u_gen1.copy(), trad_gen1.copy()

In [41]:
# Embed the generation-7 ECT values with the UMAP model fitted on the founders
# (no refit), then print the per-dimension mean and standard deviation.
u_gen7 = umap_trans.transform(ect_gen7.to_numpy())
print(u_gen7.mean(axis=0))
print(u_gen7.std(axis=0))

[4.839802  5.4701257 6.1027727 8.512152  8.3021145 5.588169  2.7882106
 6.507294  6.1374087 3.063885  5.178275  5.0165176]
[0.51299477 0.29262874 0.30134046 0.3574716  0.510009   0.2994887
 0.13209003 0.40975672 0.2184346  0.21263145 0.28751692 0.24149393]


In [42]:
# Select generation 7 as the progeny to label. This replaces whatever an earlier
# cell stored in `gen`, `u_progeny` and `trad_progeny`.
gen, u_progeny, trad_progeny = 7, u_gen7.copy(), trad_gen7.copy()

## Scale the progeny accordingly

- Scale progeny with the exact same parameters used to scale the founders

In [50]:
# Two scalers, both fitted on the founders only: the traditional descriptors are
# standardised (centred and scaled to unit variance), while the UMAP coordinates
# are only centred (with_std=False).
trad_scaler = StandardScaler()
topo_scaler = StandardScaler(with_std=False)

trad_founders_scaled = trad_scaler.fit(trad_founders).transform(trad_founders)
topo_founders_scaled = topo_scaler.fit(u_founders).transform(u_founders)

# Traditional columns first, followed by the UMAP columns.
founders_scaled = np.hstack((trad_founders_scaled, topo_founders_scaled))
founders_scaled.shape

(3121, 23)

In [51]:
# Push the progeny through the scalers that were fitted on the founders, so both
# groups go through exactly the same transformation code path:
#   - traditional descriptors: subtract the founders' mean, divide by their scale;
#   - UMAP coordinates: subtract the founders' mean only, because `topo_scaler`
#     was built with with_std=False (it has no scale to divide by).
# Using `transform` replaces the hand-written arithmetic on `mean_` / `scale_`.
trad_progeny_scaled = trad_scaler.transform(trad_progeny)
topo_progeny_scaled = topo_scaler.transform(u_progeny)

# Same column layout as `founders_scaled`: traditional first, then UMAP.
progeny_scaled = np.column_stack((trad_progeny_scaled, topo_progeny_scaled))
progeny_scaled.shape

(6826, 23)

## Label the progeny with SVM

- Train an SVM with 100% of the founders

- Then use it to label the progeny

In [52]:
# Tag used in the output file name for this set of predictions.
info_type = 'combined'

# RBF-kernel SVM trained on every founder seed, using the scaled traditional
# descriptors together with the centred UMAP coordinates.
svm_params = {'C':50, 'kernel':'rbf', 'gamma':0.1}
svc = SVC(**svm_params).fit(founders_scaled, founders.Founder.values)

# Predicted founder label for each progeny seed.
pred_progeny = svc.predict(progeny_scaled)

# Metadata of the generation currently being labelled. An unexpected `gen` now
# raises instead of silently leaving `svm_progeny` undefined or left over from a
# previous run.
progeny_by_gen = {1: gen1, 7: gen7}
if gen not in progeny_by_gen:
    raise ValueError('gen must be 1 or 7, got {!r}'.format(gen))
svm_progeny = progeny_by_gen[gen][meta_traits].copy()

# Explicit column assignment (attribute assignment only works when the column
# already exists and would otherwise just set a Python attribute).
svm_progeny['Founder'] = pred_progeny

In [55]:
# Output directory for everything derived from the progeny.
dst = '../../preproc/progeny/unsupervised/'

# The file name and contents reflect whichever `info_type` / `svm_progeny` were
# assigned most recently, so the result depends on which SVM cell ran last.
filename = f'{dst}gen{gen}_svm_{info_type}_d{d}_T{T}_topounscaled.csv'
print(filename)
svm_progeny.to_csv(filename, index=False)

../../preproc/progeny/unsupervised/gen7_svm_topological_d158_T16_topounscaled.csv


In [54]:
# Tag used in the output file name for this set of predictions.
info_type = 'topological'

# Same SVM settings as the combined model, but trained on the centred UMAP
# coordinates alone.
svc = SVC(**svm_params).fit(topo_founders_scaled, founders.Founder.values)

# Predicted founder label for each progeny seed.
pred_progeny = svc.predict(topo_progeny_scaled)

# Metadata of the generation currently being labelled. An unexpected `gen` now
# raises instead of silently leaving `svm_progeny` from an earlier cell in place.
progeny_by_gen = {1: gen1, 7: gen7}
if gen not in progeny_by_gen:
    raise ValueError('gen must be 1 or 7, got {!r}'.format(gen))
svm_progeny = progeny_by_gen[gen][meta_traits].copy()
svm_progeny['Founder'] = pred_progeny

# Write the topological predictions from this cell. The only other cell that
# saves `svm_progeny` sits above this one, so a top-to-bottom run would otherwise
# never write them (it would only save the 'combined' labels).
filename = dst + 'gen{}_svm_{}_d{}_T{}_topounscaled.csv'.format(gen, info_type, d,T)
print(filename)
svm_progeny.to_csv(filename, index=False)

## Save progeny UMAP reduced dimension

In [47]:
# File name encodes the generation, the ECT settings and the four UMAP
# hyper-parameters, taken from `umap_params` in insertion order.
umap_tag = '{}_{}_{}_{}'.format(*umap_params.values())
filename = f'{dst}umap_gen{gen}_d{d}_T{T}_{umap_tag}.csv'
print(filename)

# Store the selected progeny's UMAP coordinates without the row index.
pd.DataFrame(u_progeny).to_csv(filename, index=False)

../../preproc/progeny/unsupervised/umap_gen7_d158_T16_50_0.1_12_manhattan.csv


In [40]:
# Founders are written under the fixed tag "gen0"; the rest of the file name
# encodes the ECT settings and the four UMAP hyper-parameters, taken from
# `umap_params` in insertion order.
umap_tag = '{}_{}_{}_{}'.format(*umap_params.values())
filename = f'{dst}umap_gen0_d{d}_T{T}_{umap_tag}.csv'
print(filename)

# Store the founders' UMAP coordinates without the row index.
pd.DataFrame(u_founders).to_csv(filename, index=False)

../../preproc/progeny/unsupervised/umap_gen0_d158_T16_50_0.1_12_manhattan.csv
