#!/usr/bin/env python3

In [1]:
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
sns.color_palette('Spectral')
import matplotlib.pyplot as plt
import numpy as np
import requests
import pandas as pd
import re
import matplotlib.pyplot as plt
import cv2
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
from lineage import Lineage

In [2]:
data = pd.read_csv('genomee.txt', sep='\t', dtype={'rsid':'str', 'chromosome':'object', 'position':'int', 'genotype':'str'}, comment='#')

In [3]:
print(data)

               rsid chromosome  position genotype
0       rs548049170          1     69869       TT
1        rs13328684          1     74792       --
2         rs9283150          1    565508       AA
3           i713426          1    726912       AA
4       rs116587930          1    727841       GG
...             ...        ...       ...      ...
638542     i4000693         MT     16524        A
638543      i704756         MT     16524        A
638544      i705255         MT     16525        A
638545     i4000757         MT     16526        G
638546      i701671         MT     16526        G

[638547 rows x 4 columns]


In [4]:
df = pd.DataFrame(data)

In [5]:
df.head(25)

Unnamed: 0,rsid,chromosome,position,genotype
0,rs548049170,1,69869,TT
1,rs13328684,1,74792,--
2,rs9283150,1,565508,AA
3,i713426,1,726912,AA
4,rs116587930,1,727841,GG
5,rs3131972,1,752721,AG
6,rs12184325,1,754105,CC
7,rs12567639,1,756268,AA
8,rs114525117,1,759036,GG
9,rs12124819,1,776546,--


In [6]:
df.nunique()

 rsid         638547
chromosome        25
position      634977
genotype          20
dtype: int64

In [7]:
duplicates = df[df.duplicated(subset='position')]
display(duplicates.head())
display(duplicates.info())

Unnamed: 0,rsid,chromosome,position,genotype
449,i6059967,1,2526746,GG
2816,i6052145,1,11009679,GG
5325,i6012699,1,19992513,CC
5339,i6059797,1,20020994,CT
5791,i6058167,1,21795388,AA


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3570 entries, 449 to 638546
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0    rsid       3570 non-null   object
 1   chromosome  3570 non-null   object
 2   position    3570 non-null   int64 
 3   genotype    3570 non-null   object
dtypes: int64(1), object(3)
memory usage: 139.5+ KB


None

In [8]:
df = df[df.chromosome != 'Y']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 634814 entries, 0 to 638546
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0    rsid       634814 non-null  object
 1   chromosome  634814 non-null  object
 2   position    634814 non-null  int64 
 3   genotype    634814 non-null  object
dtypes: int64(1), object(3)
memory usage: 24.2+ MB


In [9]:
df['chromosome'].unique()

array(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12',
       '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X',
       'MT'], dtype=object)

In [10]:
df['chromosome'] = df['chromosome'].apply(lambda x: re.sub(r'X', r'23', x))
df['chromosome'] = df['chromosome'].apply(lambda x: re.sub(r'MT', r'24', x))

In [11]:
df['chromosome'] = df['chromosome'].apply(lambda x: int(x))

In [12]:
chromosome_dict = {1:'1', 2:'2', 3:'3', 4:'4', 5:'5', 6:'6', 7:'7', 8:'8', 9:'9', 10:'10', 11:'11', 12:'12', 13:'13', 
                  14:'14', 15:'15', 16:'16', 17:'17', 18:'18', 19:'19', 20:'20', 21:'21', 22:'22', 23:'X', 24:'MT'}

In [13]:
print(chromosome_dict)
df.info()

{1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7', 8: '8', 9: '9', 10: '10', 11: '11', 12: '12', 13: '13', 14: '14', 15: '15', 16: '16', 17: '17', 18: '18', 19: '19', 20: '20', 21: '21', 22: '22', 23: 'X', 24: 'MT'}
<class 'pandas.core.frame.DataFrame'>
Int64Index: 634814 entries, 0 to 638546
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0    rsid       634814 non-null  object
 1   chromosome  634814 non-null  int64 
 2   position    634814 non-null  int64 
 3   genotype    634814 non-null  object
dtypes: int64(2), object(2)
memory usage: 24.2+ MB


In [14]:
genotype_na = df[df.genotype == '--']
len(genotype_na)

16733

In [15]:
df[df.chromosome == 1].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49516 entries, 0 to 49515
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0    rsid       49516 non-null  object
 1   chromosome  49516 non-null  int64 
 2   position    49516 non-null  int64 
 3   genotype    49516 non-null  object
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [16]:
df.rename({' rsid': 'rsid'}, axis='columns', inplace=True)

In [17]:
rsid_per_chromosome_series = df.groupby('chromosome')['rsid'].count()
rsid_per_chromosome_series.columns = ['chromosome', 'count']

In [18]:
rsid_per_chromosome_series.plot.barh(figsize=(16,9), fontsize=15)
plt.show()
plt.savefig("counts.png")

  


In [19]:
snp_df = pd.read_csv('result.csv')
snp_df.head()

Unnamed: 0.1,Unnamed: 0,Magnitude,Summary
0,Rs1801253(G;G),0.0,
1,Rs17822931(T;T),2.5,Dry earwax. No body odour. Likely Asian ancest...
2,Rs16891982(C;C),1.1,"generally non-European, but if European, 7x mo..."
3,Rs351855(C;C),0.0,normal
4,Rs713598(C;C),1.1,Can taste bitter.


In [20]:
snp_df['genotype'] = snp_df['Unnamed: 0'].apply(lambda x: re.sub(r'.*([AGCT]);([AGCT])\)', r'\1\2', x))

In [21]:
snp_df.head()

Unnamed: 0.1,Unnamed: 0,Magnitude,Summary,genotype
0,Rs1801253(G;G),0.0,,GG
1,Rs17822931(T;T),2.5,Dry earwax. No body odour. Likely Asian ancest...,TT
2,Rs16891982(C;C),1.1,"generally non-European, but if European, 7x mo...",CC
3,Rs351855(C;C),0.0,normal,CC
4,Rs713598(C;C),1.1,Can taste bitter.,CC


In [22]:
new_cols = ['rsid', 'magnitude', 'summary', 'genotype']
snp_df.columns = new_cols

In [23]:
snp_df['rsid'] = snp_df['rsid'].map(lambda x : x.lower())
snp_df['rsid'] = snp_df['rsid'].map(lambda x : re.sub(r'([a-z]{1,}[\d]+)\([agct];[agct]\)', r'\1', x))

In [24]:
snp_df.head()

Unnamed: 0,rsid,magnitude,summary,genotype
0,rs1801253,0.0,,GG
1,rs17822931,2.5,Dry earwax. No body odour. Likely Asian ancest...,TT
2,rs16891982,1.1,"generally non-European, but if European, 7x mo...",CC
3,rs351855,0.0,normal,CC
4,rs713598,1.1,Can taste bitter.,CC


In [25]:
snp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11000 entries, 0 to 10999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   rsid       11000 non-null  object 
 1   magnitude  11000 non-null  float64
 2   summary    10866 non-null  object 
 3   genotype   11000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 343.9+ KB


In [26]:
snp_df.isna().any()

rsid         False
magnitude    False
summary       True
genotype     False
dtype: bool

In [27]:
new_df = snp_df.merge(df, how='inner', on=['rsid', 'genotype'], suffixes=('_SNPedia', '_myDNA'))

In [28]:
new_df.head(1000000)

Unnamed: 0,rsid,magnitude,summary,genotype,chromosome,position
0,rs16891982,1.1,"generally non-European, but if European, 7x mo...",CC,5,33951693
1,rs16891982,1.1,"generally non-European, but if European, 7x mo...",CC,5,33951693
2,rs696217,0.0,common in clinvar,GG,3,10331457
3,rs696217,0.0,common in clinvar,GG,3,10331457
4,rs1045485,2.0,Reduced Risk of Breast Cancer,CC,2,202149589
...,...,...,...,...,...,...
937,rs1143646,0.0,common in complete genomics,TT,4,3148653
938,rs10509680,0.0,common on affy axiom data,GG,10,96734339
939,rs789852,0.0,common in complete genomics,CC,3,194327098
940,rs7762619,0.0,common in complete genomics,TT,6,31531310


In [29]:
genes_to_display = new_df[new_df.magnitude > 2]

In [30]:
genes_to_display

Unnamed: 0,rsid,magnitude,summary,genotype,chromosome,position
246,rs1815739,2.2,Better performing muscles. Likely sprinter.,CC,11,66328095
247,rs1815739,2.2,Better performing muscles. Likely sprinter.,CC,11,66328095
359,rs3129934,2.1,Normal lower risk of Multiple Sclerosis.,CC,6,32336187
420,rs2070744,2.1,cardiovascular differences,TT,7,150690079
424,rs53576,2.5,Optimistic and empathetic; handle stress well,GG,3,8804371
439,rs1799990,2.1,Resistance to vCJD (PrP 129 Met/Val heterozygo...,AG,20,4680251


In [31]:
print (genes_to_display.summary.values)

['Better performing muscles. Likely sprinter.'
 'Better performing muscles. Likely sprinter.'
 'Normal lower risk of Multiple Sclerosis.' 'cardiovascular differences'
 'Optimistic and empathetic; handle stress well'
 'Resistance to vCJD (PrP 129 Met/Val heterozygote), 4.6x reduced risk of sporadic CJD and 0.87x reduced risk for late-onset Alzheimer in a Caucasian population']


In [32]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
text = genes_to_display.summary.values
wordcloud = WordCloud(
    width = 800,
    height = 600,
    mode = 'RGBA',
    background_color = None,
    stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
    figsize = (90, 80),
    edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
plt.savefig("wordcloud.png")


  app.launch_new_instance()


In [33]:
img = Image.open('wordcloud.png')
img = img.convert("RGBA")

pixdata = img.load()

width, height = img.size
for y in range(height):
    for x in range(width):
        if pixdata[x, y] == (255, 255, 255, 255):
            pixdata[x, y] = (255, 255, 255, 0)

img.save("wordcloud.png", "PNG")

In [None]:
deviceId = 0
faceCascade = cv2.CascadeClassifier("cascadeFiles/haarcascade_frontalface_default.xml")
noseCascade = cv2.CascadeClassifier("reference/haarcascade_mcs_nose.xml")
eyeCascade = cv2.CascadeClassifier("cascadeFiles/haarcascade_eye.xml")
leftEyeCascade = cv2.CascadeClassifier("cascadeFiles/haarcascade_lefteye_2splits.xml")


imgHat = cv2.imread('wordcloud.png',-1)

orig_mask = imgHat[:,:,3]

orig_mask_inv = cv2.bitwise_not(orig_mask)

imgHat = imgHat[:,:,0:3]
origHatHeight, origHatWidth = imgHat.shape[:2]

cv2.namedWindow("Live Feed", 0)
cv2.setWindowProperty("Live Feed", 0, 1)

video_capture = cv2.VideoCapture(deviceId)

while(cv2.waitKey(30) != 27):
    ret, frame = video_capture.read()
    height,width,_ = frame.shape
    overlayed = frame
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

    faces = faceCascade.detectMultiScale(
        gray,
        scaleFactor=1.1,
        minNeighbors=5,
        minSize=(30, 30),
        flags=cv2.CASCADE_SCALE_IMAGE
    )

    x = 0
    y = 0
    w = 0
    h = 0

    for (tx, ty, tw, th) in faces:
        if tw*th > w*h:
            x = tx
            y = ty
            w = tw
            h = th

    #print x
    #print y
    #print w
    #print h
    hatWidth = 0
    hatHeight = 0
    if (w != 0) and (h != 0):
        face = cv2.rectangle(frame,(x,y),(x+w,y+h),(255,0,0),2)
        hatWidth = (int)(w * 3 / 2)
        hatHeight = (int)(hatWidth * origHatHeight / origHatWidth)

        x1 = (int)(x - (hatWidth/4))
        x2 = (int)(x + w + (hatWidth/4))
        y1 = (int)(y - (hatHeight*3/4))
        y2 = (int)(y + h+ (hatHeight/4))

        if x1 < 0:
            x1 = 0
        if x2 > width:
            x2 = width
        if y1 < 0:
            y1 = 0
        if y2 > height:
            y2 = height

        hatHeight = (int)(y2 - y1)
        hatWidth = (int)(x2 - x1)

        hat = cv2.resize(imgHat, (hatWidth,hatHeight), interpolation = cv2.INTER_AREA)
        mask = cv2.resize(orig_mask, (hatWidth,hatHeight), interpolation = cv2.INTER_AREA)
        mask_inv = cv2.resize(orig_mask_inv, (hatWidth,hatHeight), interpolation = cv2.INTER_AREA)

        roi = frame[y1:y2, x1:x2]
        try:
            roi_bg = cv2.bitwise_and(roi, roi, mask=mask_inv)
            roi_fg = cv2.bitwise_and(hat, hat, mask=mask)
            dst = cv2.add(roi_bg,roi_fg)
            frame[y1:y2, x1:x2] = dst


            roi_gray_m = gray[y:y+h, x:x+w]
            roi_color_m = frame[y:y+h, x:x+w]


        finally:
            cropy1 = y + (h/2) - ((x2-x1)*(float(2)/3))
            cropy2 = y + (h/2) + ((x2-x1)*(float(2)/3))

            if cropy1 < 0:
                cropy1 = 0
            if(cropy1 >= cropy2):
                cropy1 = cropy2-1
            if cropy2 >height:
                cropy2=height
            print ("cropy2: ")
            print (cropy2)
            print ("cropy1: ")
            print (cropy1)
            print ("x1: ")
            print (x1)
            print ("x2: ")
            print (x2)
            #small = cv2.resize(frame[5:100, 5:x2], (hatWidth,hatHeight), fx=0.5, fy=0.5)
            overlayed = frame[int(cropy1):int(cropy2), x1:x2]


    cv2.imshow("Live Feed", overlayed)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
video_capture.release()
cv2.destroyAllWindows()

In [None]:
l = Lineage()

In [None]:
Mom = l.create_individual('mom', 'mom.txt')

In [None]:
Mom

In [None]:
Mom.build

In [None]:
Me = l.create_individual('me', 'genomeeline.txt')

In [None]:
Me.build

In [None]:
discordant_snps = l.find_discordant_snps(Me, Mom, save_output=True)

In [None]:
len(discordant_snps.loc[discordant_snps['chrom'] != 'MT'])

In [None]:
results = l.find_shared_dna([Mom, Me], cM_threshold=0.75, snp_threshold=1100)

In [None]:
sorted(results.keys())

In [None]:
len(results['one_chrom_shared_dna'])

In [None]:
results1 = l.find_shared_dna([Mom, Me], shared_genes=True)

In [None]:
len(results1['two_chrom_shared_genes'])

In [None]:
from arv import load, unphased_match as match

genome = load("genomee.txt")

print("You are {gender}. You are {athletic}. you tend to {sneezesun} when it is too sunny outside. you have {color} eyes,  {hair} hair. You are likely {bodytype} and still struggles with {OCD}. you {likelyhiv}. you are {social} and {popularity}, and {organization}. You are an extremely {drive}. You are quite a {empathy}. You are {likelylynch}. You are {lactoseint}."
.format(
  gender     = "man" if genome.y_chromosome else "woman",
  athletic   = "Incredibly athletic and likely a sprinter" if genome["rs1815739"] == "CC" else "not athletic",
  sneezesun  ="Sneeze" if genome["rs10427255"] == "CC" else "not sneeze",
  social     ="Very social " if genome["rs53576"] == "AA" or "AG" else "not very social",
  hair       ="curly" if genome["rs17646946"] == "GG" else "not curly",
  bodytype   ="Muscular" if genome["rs1815739"] == "CC" else "not Muscular",
  organization ="organized" if genome["rs25532"] == "CC" or "CT" else "Not organizations",
  likelyhiv  ="Hiv resistant" if genome["i3003626"] == "DD" else "are not hiv resistant ",
  empathy     ="Very empathetic " if genome["rs53576"] == "AA" or "AG" else "not very empathetic",
  popularity ="very popular" if genome["rs53576"] == "AA" or "AG" else "not very popular",
  drive      ="very driven" if genome["rs1815739"] == "AA" else "not very driven",
  OCD        ="OCD" if genome["rs25532"] == "CC" or "CT" else "Not ocd",
  likelylynch="Have Lynch syndrome" if genome["rs63750875"] == "CC" or "CG" else "dont have lynch syndrome",
  lactoseint ="lactose intolerent" if genome["rs4988235"] == "CC"  else "not lactose intolerant",
  complexion = "light" if genome["rs1426654"] == "AA" else "dark",
  color      = match(genome["rs12913832"], {"AA": "brown",
                                            "AG": "brown or green",
                                            "GG": "blue"})))
