In [1]:
using NetMSA
using StatsBase

In [2]:
# Sample strings; they are collected into `L` and handed to
# NetMSA.createPeerMatrix in the next cell.
S1, S2, S3, S4 = "abcbcdem", "acbcfg", "abchimn", "abcbcjkm";

L = [S1, S2, S3, S4]

4-element Array{String,1}:
 "abcbcdem"
 "acbcfg"
 "abchimn"
 "abcbcjkm"

In [3]:
# Build the peer matrix from the strings in `L`. In the result echoed below,
# each string occupies one column and shorter strings are padded with `missing`.
M = NetMSA.createPeerMatrix(L)

8×4 Array{Union{Missing, Char},2}:
 'a'  'a'      'a'      'a'
 'b'  'c'      'b'      'b'
 'c'  'b'      'c'      'c'
 'b'  'c'      'h'      'b'
 'c'  'f'      'i'      'c'
 'd'  'g'      'm'      'j'
 'e'  missing  'n'      'k'
 'm'  missing  missing  'm'

In [5]:
"""
    Position(row, indexes)

A row number together with a set of indexes into that row (see `getposition`,
which fills `indexes` with the places where a given value occurs).
"""
mutable struct Position
  row::Int64
  indexes::Set{Int64}
end

"""
    Particle(value, pos)

Particle that tracks the character `value`.

# Fields
- `value`: the tracked character.
- `updated`: integer counter, initialised to `0`.
- `pos`: current position; this is the very object passed to the constructor.
- `best`: best position recorded so far; starts as an independent copy of `pos`.
- `bestvalue`: score associated with `best`, initialised to `0.0`.
"""
mutable struct Particle
  value::Char
  updated::Int64
  pos::Position
  best::Position
  bestvalue::Float64

  function Particle(value::Char, pos::Position)
    # `Position` is mutable: storing the same object in both fields would make
    # every in-place change of the current position silently rewrite the
    # recorded best as well, so `best` gets its own snapshot.
    best = Position(pos.row, copy(pos.indexes))
    return new(value, 0, pos, best, 0.0)
  end
end

In [8]:
"""
    getposition(index, row, value)

Return a `Position` whose row number is `index` and whose `indexes` are the
places in `row` holding an entry equal to `value`. `missing` entries never
match.
"""
function getposition(index::Int64, row, value)
  matches = [k for (k, entry) in pairs(row) if !ismissing(entry) && entry == value]
  return Position(index, Set(matches))
end

getposition (generic function with 1 method)

In [9]:
# Example: particle for the character 'b', positioned on row 2 of M.
Particle('b', getposition(2, M[2,:], 'b'))

Particle('b', 0, Position(2, Set([4, 3, 1])), Position(2, Set([4, 3, 1])), 0.0)

In [10]:
"""
    createSwarm(rowIndex, row)

Print the set of distinct entries of `row`. `rowIndex` is accepted but not
used by the current implementation.
"""
function createSwarm(rowIndex::Int64, row)
  println(Set(row))
  return nothing
end

"""
    mostfrequent(row)

Return `(count, entry)` for the entry that occurs most often in `row`, as
produced by `findmax` over the count map of the row.

Gap markers are not candidates: both `'-'` and `missing` are removed before the
maximum is taken. When nothing but gap markers is present (or `row` is empty)
the result is `(0, nothing)` instead of an error.
"""
function mostfrequent(row)
  counts = countmap(row)
  delete!(counts, '-')
  # `missing` is padding, not a symbol; `aligned` treats it like '-' as well.
  delete!(counts, missing)
  # `findmax` throws on an empty collection, so report "no symbol" explicitly.
  isempty(counts) && return (0, nothing)
  return findmax(counts)
end

"""
    aligned(row)::Bool

Return `true` when `row` is non-empty and, ignoring the gap markers `'-'` and
`missing`, contains at most one distinct entry.

A row that mixes one symbol with both kinds of gap marker counts as aligned,
and so does a row made of gap markers only.
"""
function aligned(row)::Bool
  symbols = Set(row)
  isempty(symbols) && return false
  # Strip both gap markers first, so a row like ['a', '-', missing] is judged
  # on its single real symbol rather than on three distinct entries.
  delete!(symbols, '-')
  delete!(symbols, missing)
  return length(symbols) <= 1
end

"""
    full(row)::Bool

Return `true` when every entry of `row` is the same, i.e. the row has exactly
one distinct entry. An empty row yields `false`.
"""
function full(row)::Bool
  distinct = Set(row)
  return isone(length(distinct))
end

"""
    weight(row; w1=0.25, w2=0.5, w3=1.0)

Score a single row.

- If `full(row)` holds, return `w3`.
- Otherwise let `top = mostfrequent(row)[1]` and `n = length(row)`:
  - if `aligned(row)` holds, return `w2 * top / n`;
  - else return `w1 * x / n`, where `x` is `0` when `top == 1` and `top`
    otherwise.
"""
function weight(row; w1=0.25, w2=0.5, w3=1.0)
  full(row) && return w3

  top = mostfrequent(row)[1]
  n = length(row)
  aligned(row) && return w2 * top / n

  # A top count of 1 means no entry repeats, which contributes nothing.
  effective = top == 1 ? 0 : top
  return w1 * effective / n
end

weight (generic function with 1 method)

In [11]:
"""
    objective(M, rowindex; endindex=0)

Evaluate the objective for matrix `M` starting at row `rowindex`:

    weights * (A * C) / (1 + Gaps)

where

- `weights` is the sum of `weight(row)` over rows `rowindex` to the last row,
- `C` is `mostfrequent(M[rowindex, :])[1]`,
- `A` is the number of rows from `rowindex` to the last row for which
  `aligned` holds,
- `Gaps` is the number of `'-'` entries in rows `rowindex` to `endindex`.

# Keywords
- `endindex`: last row included in the gap count; `0` (the default) means the
  last row of `M`.

# Throws
- `ArgumentError` if `endindex` is larger than the number of rows of `M`.
"""
function objective(M, rowindex::Int; endindex::Int=0)
  nrows = size(M, 1)

  # Validate up front so an invalid `endindex` fails before any work is done.
  lastrow = endindex == 0 ? nrows : endindex
  if lastrow > nrows
    throw(ArgumentError("endindex exceeds the matrix size"))
  end

  # Views avoid copying the lower part of the matrix for every term.
  tail = view(M, rowindex:nrows, :)
  weights = sum(weight.(eachrow(tail)))
  C = mostfrequent(view(M, rowindex, :))[1]
  # Only the rows that take part are tested for alignment.
  A = count(aligned, eachrow(tail))
  Gaps = count(x -> isequal(x, '-'), view(M, rowindex:lastrow, :))

  return weights * (A * C) / (1 + Gaps)
end

objective (generic function with 1 method)

In [None]:
# For every row of M: print its distinct entries, then its weight.
foreach(enumerate(eachrow(M))) do (index, row)
  createSwarm(index, row)
  println(weight(row))
end

In [None]:
# Evaluate the objective starting at row 2, with the default `endindex`.
objective(M, 2)

In [48]:
"""
    criteria3(p, newrow)

Return `true` when the number of places where `p.value` occurs in `newrow`
differs from the number of indexes stored in the particle's current position.
"""
function criteria3(p::Particle, newrow)
  current = length(p.pos.indexes)
  upcoming = length(getposition(p.pos.row + 1, newrow, p.value).indexes)
  return current != upcoming
end

"""
    criteria2(p; threshold=6)

Return `true` when the particle's `bestvalue` is strictly greater than
`threshold`.

# Keywords
- `threshold`: cut-off for `bestvalue`; defaults to `6`, the value that was
  previously hard-coded.
"""
function criteria2(p::Particle; threshold=6)
  return p.bestvalue > threshold
end

"""
    stopcriteria(p, M)

Return `true` when particle `p` should stop: either there is no row below its
current row in `M`, or `criteria3` holds for the next row, or `criteria2`
holds.
"""
function stopcriteria(p::Particle, M)
  nextrow = p.pos.row + 1
  # A particle on the last row has no next row to compare against; indexing
  # M[nextrow, :] would throw a BoundsError, so treat it as a stop.
  nextrow > size(M, 1) && return true
  return criteria3(p, M[nextrow, :]) || criteria2(p)
end

stopcriteria (generic function with 1 method)

In [13]:
# Particle for the character 'b' on row 2 of M, bound to `p` for reuse.
p = Particle('b', getposition(2, M[2, :], 'b'))

Particle('b', 0, Position(2, Set([4, 3, 1])), Position(2, Set([4, 3, 1])), 0.0)

true

In [47]:
# Largest number of non-missing entries found in any column of M.
maximum(count(!ismissing, col) for col in eachcol(M))

8

In [None]:
# Work on a copy so that the assignment below does not modify M.
N = copy(M)

In [None]:
# Overwrite the entry at row 1, column 2 of the copy.
N[1, 2] = 'c'

In [None]:
# Show the modified copy ...
N

In [None]:
# ... and the original matrix, for comparison.
M