In [1]:
%matplotlib inline

import os
from tqdm import tqdm, tqdm_notebook
from tqdm import trange
from scipy import stats
from scipy.stats import shapiro

import pandas as pd
import numpy as np
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt

import itertools
from modules.kidera import score_positions, score_sequence

import warnings
warnings.filterwarnings('ignore')

In [None]:
import dask
import dask.dataframe as dd
from dask.distributed import Client, progress
import time
client = Client()
client

In [29]:
df = dd.read_csv('output/k-mers/human_peptides_9mers.txt', header=None)

In [12]:
len(df)

10379346

In [15]:
df.columns = ['Peptide']
df.head()

Unnamed: 0,Peptide
0,AAAAAAAAA
1,AAAAAAAAC
2,AAAAAAAAD
3,AAAAAAAAE
4,AAAAAAAAF


In [66]:
cols = ["helix.bend.pref", "side.chain.size",\
        "extended.str.pref", "hydrophobicity", "double.bend.pref", "partial.spec.vol",\
        "flat.ext.pref", "occurrence.alpha.reg", "pK.C", "surrounding.hydrop"]

In [65]:
df = df.repartition(npartitions=160)

In [75]:
df1 = df.Peptide.apply(lambda s: score_sequence(s), \
                     meta=pd.DataFrame(np.random.rand(len(df), len(cols)), columns=cols, dtype=float))
distributed = client.compute(df1)
progress(distributed)

In [76]:
res = distributed.result()

In [None]:
res.to_csv('output/kidera/hpeptides_9mers_kidera.csv', index=False)

In [79]:
res.head()

Unnamed: 0,helix.bend.pref,side.chain.size,extended.str.pref,hydrophobicity,double.bend.pref,partial.spec.vol,flat.ext.pref,occurrence.alpha.reg,pK.C,surrounding.hydrop
0,-1.56,-1.67,-0.97,-0.27,-0.93,-0.78,-0.2,-0.08,0.21,-0.48
1,-1.373333,-1.583333,-0.812222,-0.356667,-0.905556,-0.425556,-0.008889,-0.147778,0.312222,-0.304444
2,-1.322222,-1.508889,-1.037778,-0.15,-0.928889,-0.676667,-0.346667,-0.018889,0.271111,-0.348889
3,-1.547778,-1.463333,-1.041111,-0.11,-0.972222,-0.648889,-0.173333,-0.028889,0.147778,-0.44
4,-1.41,-1.375556,-0.902222,-0.398889,-0.802222,-0.783333,-0.103333,0.051111,0.376667,-0.475556


In [82]:
result = pd.read_csv('output/kidera/hpeptides_9mers_kidera.csv')
result.head()

Unnamed: 0,helix.bend.pref,side.chain.size,extended.str.pref,hydrophobicity,double.bend.pref,partial.spec.vol,flat.ext.pref,occurrence.alpha.reg,pK.C,surrounding.hydrop
0,-1.56,-1.67,-0.97,-0.27,-0.93,-0.78,-0.2,-0.08,0.21,-0.48
1,-1.373333,-1.583333,-0.812222,-0.356667,-0.905556,-0.425556,-0.008889,-0.147778,0.312222,-0.304444
2,-1.322222,-1.508889,-1.037778,-0.15,-0.928889,-0.676667,-0.346667,-0.018889,0.271111,-0.348889
3,-1.547778,-1.463333,-1.041111,-0.11,-0.972222,-0.648889,-0.173333,-0.028889,0.147778,-0.44
4,-1.41,-1.375556,-0.902222,-0.398889,-0.802222,-0.783333,-0.103333,0.051111,0.376667,-0.475556


In [88]:
df = pd.read_csv('output/k-mers/human_peptides_9mers.txt', header=None)
df.columns = ['Peptide']

In [89]:
df = pd.concat([df, result], axis=1)
df.head()

Unnamed: 0,Peptide,helix.bend.pref,side.chain.size,extended.str.pref,hydrophobicity,double.bend.pref,partial.spec.vol,flat.ext.pref,occurrence.alpha.reg,pK.C,surrounding.hydrop
0,AAAAAAAAA,-1.56,-1.67,-0.97,-0.27,-0.93,-0.78,-0.2,-0.08,0.21,-0.48
1,AAAAAAAAC,-1.373333,-1.583333,-0.812222,-0.356667,-0.905556,-0.425556,-0.008889,-0.147778,0.312222,-0.304444
2,AAAAAAAAD,-1.322222,-1.508889,-1.037778,-0.15,-0.928889,-0.676667,-0.346667,-0.018889,0.271111,-0.348889
3,AAAAAAAAE,-1.547778,-1.463333,-1.041111,-0.11,-0.972222,-0.648889,-0.173333,-0.028889,0.147778,-0.44
4,AAAAAAAAF,-1.41,-1.375556,-0.902222,-0.398889,-0.802222,-0.783333,-0.103333,0.051111,0.376667,-0.475556


In [4]:
ddf = dd.read_csv('output/kidera/hpeptides_9mers_kidera.csv')
ddf = ddf.repartition(npartitions=100)
ddf = client.persist(ddf)
progress(ddf)

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"


In [11]:
pdf = pd.read_csv('output/kidera/hpeptides_9mers_kidera.csv')
pdf.head()

Unnamed: 0,Peptide,helix.bend.pref,side.chain.size,extended.str.pref,hydrophobicity,double.bend.pref,partial.spec.vol,flat.ext.pref,occurrence.alpha.reg,pK.C,surrounding.hydrop
0,AAAAAAAAA,-1.56,-1.67,-0.97,-0.27,-0.93,-0.78,-0.2,-0.08,0.21,-0.48
1,AAAAAAAAC,-1.373333,-1.583333,-0.812222,-0.356667,-0.905556,-0.425556,-0.008889,-0.147778,0.312222,-0.304444
2,AAAAAAAAD,-1.322222,-1.508889,-1.037778,-0.15,-0.928889,-0.676667,-0.346667,-0.018889,0.271111,-0.348889
3,AAAAAAAAE,-1.547778,-1.463333,-1.041111,-0.11,-0.972222,-0.648889,-0.173333,-0.028889,0.147778,-0.44
4,AAAAAAAAF,-1.41,-1.375556,-0.902222,-0.398889,-0.802222,-0.783333,-0.103333,0.051111,0.376667,-0.475556


In [None]:
pdf.describe()

Unnamed: 0,helix.bend.pref,side.chain.size,extended.str.pref,hydrophobicity,double.bend.pref,partial.spec.vol,flat.ext.pref,occurrence.alpha.reg,pK.C,surrounding.hydrop
count,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0,10379350.0
mean,-0.04935839,-0.2163736,-0.03445414,0.1129689,-0.1011847,-0.2874578,0.0278366,-0.1101329,-0.01376097,0.0252655
std,0.4021847,0.3518546,0.3675174,0.3555814,0.3266863,0.326583,0.3630587,0.3891402,0.2983429,0.3232411
min,-1.56,-1.96,-1.61,-1.351111,-1.7,-2.05,-1.89,-2.3,-1.731111,-2.33
25%,-0.3277778,-0.4477778,-0.2833333,-0.1255556,-0.3211111,-0.5033333,-0.21,-0.3555556,-0.2088889,-0.18
50%,-0.06222222,-0.2077778,-0.03777778,0.1066667,-0.1055556,-0.2888889,0.04222222,-0.1044444,-0.007777778,0.04444444
75%,0.2133333,0.02444444,0.2122222,0.3444444,0.1144444,-0.07333333,0.2822222,0.1433333,0.1888889,0.2522222
max,2.06,1.404444,1.91,1.87,1.897778,2.052222,1.435556,2.36,1.356667,1.63


In [None]:
shapiro(pdf)

In [2]:
import findspark
findspark.init(spark_home="/home/vcvetkov/Tools/spark-2.2.1-bin-hadoop2.7/")

import pyspark
import random

In [15]:
sc = pyspark.SparkContext(appName="Pi")
num_samples = 100000000

def inside(p):     
  x, y = random.random(), random.random()
  return x*x + y*y < 1

count = sc.parallelize(range(0, num_samples)).filter(inside).count()

pi = 4 * count / num_samples
print(pi)

sc.stop()

3.1398848


In [None]:
sc = pyspark.SparkContext()

In [7]:
sqlContext = pyspark.SQLContext(sc)
sdf = sqlContext.read(source="com.databricks.spark.csv",
                      path = 'output/kidera/hpeptides_9mers_kidera.csv',
                      header = True,inferSchema = True)

TypeError: 'DataFrameReader' object is not callable