# overview
* basic python
* accessing data
* storing data in database
* manipulating dataframes
* visualizing data
* split-apply-combine
* heatmaps
* simple multilabel classifier

## basic python

In [None]:
# import some libraries
import pandas as pd # dataframes
import seaborn as sns # visualization
%pylab inline

In [None]:
# variables and dynamic types: a name is just a label, so the same name
# can be bound to an int first and to a str afterwards
a = 1
print(a)
a = "eins"
print(a)

In [None]:
# list vs. set: a list keeps order and duplicates, a set keeps every
# distinct value exactly once
mylist = [
    1, 1, 2, 2, 3, 4, 5,   # ints, some of them repeated
    'sechs', 'sieben',     # strings may share a list with ints
    0x08,                  # hex literal for the int 8
]
print(mylist)
myset = set(mylist)
print(myset)

In [None]:
# dictionaries map keys to values; fill one key by key ...
mydict = {}
mydict['Andreas Steffen'] = 1
mydict['Djork-Arne'] = 5
print(mydict)
# ... or write the same content as a single literal
mydict = {
    'Andreas Steffen': 1,
    'Djork-Arne': 5,
}
print(mydict)

In [None]:
# a for loop visits the list elements one by one, in order
for item in mylist:
    print(item)

In [None]:
# list comprehension: build a new list from the integer entries of mylist
[entry for entry in mylist if isinstance(entry, int)]

In [None]:
# import the random lib
import random

In [None]:
# a simple function
def random_int(max_int):
    """Return a random integer N with 0 <= N <= max_int (both ends included)."""
    return random.randint(0, max_int)

In [None]:
random_int(99)

In [None]:
# dataframes, variant 1: built from a flat list of ten random integers
random_values = [random_int(100) for _ in range(10)]
df = pd.DataFrame(random_values)
print(df)
# dataframes, variant 2: built from row records with explicit column names
rows = [[1, 'eins'], [2, 'zwei']]
df = pd.DataFrame(data=rows, columns=['col1', 'col2'])
print(df)

In [None]:
print(df.dtypes)
df.describe()

# enough basics, let's do some data science

In [None]:
# read in the data expression. it's in the data folder
data = pd.read_table('../data/expression.tsv')

In [None]:
# let's inspect
data.head()

In [None]:
# write data into a sqlite db
import sqlite3 as lite
# the connection stays open on purpose: the read-back cell queries through `con`
con = lite.connect('../data/expression.db')
# if_exists='replace' keeps the cell re-runnable; index=False (to_sql documents a
# bool here, the former None only worked because it is falsy) skips the row index
data.to_sql('expression', con=con, if_exists='replace', index=False)

In [None]:
# and read it back
data1 = pd.read_sql('SELECT * FROM expression', con)

In [None]:
data1.head()

In [None]:
# show indexing
data.loc[[1,2],:]

In [None]:
data.loc[[1,2], ['title','plot']]

In [None]:
# only selected columns
data[['title','plot']]

In [None]:
# let's extract the year: the first 4-digit number that directly follows an
# opening parenthesis in the title
# (raw string: '\(' and '\d' are regex escapes, not Python string escapes; a
# plain literal triggers an invalid-escape warning on newer Python versions)
data['year'] = data.title.str.extract(r'\((\d{4})', expand = True)[0]
# stricter alternative: also requires the closing parenthesis and allows a
# roman-numeral suffix after a slash
#data['year'] = data.title.str.extract(r'\((\d{4})(/[IVXD]{1,3})?\)', expand = True)[0]

In [None]:
data.year.unique()

In [None]:
# find NA years
data[pd.isnull(data.year)]

In [None]:
# drop the ones without a year
data = data.dropna(subset = ['year'])

In [None]:
# drop duplicates by year and plot
data = data.drop_duplicates(['year', 'plot'])

In [None]:
# plot by year: number of rows per year, bars ordered by year
fig, ax = plt.subplots(figsize=(20, 9))
rows_per_year = data.year.value_counts().sort_index()
rows_per_year.plot.bar(ax=ax)
sns.despine()

In [None]:
!pip3 install langdetect

In [None]:
# only english
from langdetect import detect

In [None]:
print(detect("My name is Andreas Steffen"))

In [None]:
print(detect("Ich heisse Andreas Steffen"))

In [None]:
# show for top 100
[detect(plot) for plot in data['plot'].head(100).tolist()]

In [None]:
# detect the language of every plot text; doing this for all rows takes too
# long, which is presumably why a later cell loads a precomputed file instead
data['language'] = data['plot'].map(detect)

In [None]:
# read in the frame that already contains the language column
# (presumably produced once with:
#  data.to_csv('../data/movies_genres.with_year_lang.tsv', sep = '\t'))
data = pd.read_table(
    '../data/movies_genres.with_year_lang.tsv',
    sep='\t',
    index_col=0,
)

In [None]:
# show stats
data.language.value_counts()

In [None]:
# check some languages
data.query("language == 'it'")['plot'].tolist()

In [None]:
data = data.query("language == 'en'")

In [None]:
# get genres: the columns whose name contains an uppercase letter
# (title, plot, year and language are all lowercase, so they drop out)
genres = list(data.filter(regex="[A-Z]", axis="columns").columns)

In [None]:
genres

In [None]:
# plot genre occurrence: one bar per genre column, height = column sum
fig, ax = plt.subplots(figsize=(20, 9))
genre_counts = data[genres].sum()
genre_counts.plot.bar(ax=ax)
sns.despine()

In [None]:
# melt the data into long format: one row per (title, plot, genre) combination,
# with the former cell content of that genre column in 'value'
data_melt = data.melt(
    id_vars=['title', 'plot'],
    value_vars=genres,
    var_name='genre',
)

In [None]:
# remove zeros: keep only the rows whose genre value is non-zero
is_nonzero = data_melt['value'].ne(0)
data_melt = data_melt.loc[is_nonzero]

In [None]:
# calculate average number of chars in plot description per genre and plot
# The text lengths are computed once, vectorised, and then averaged per genre.
# This replaces groupby('genre').apply(<lambda over every sub-frame>), which is
# slower and, on recent pandas versions, warns because apply also operates on
# the grouping column.
mean_plot_len = data_melt['plot'].str.len().groupby(data_melt['genre']).mean()
plt.figure(figsize = (20,9))
mean_plot_len.plot.bar()
sns.despine()

In [None]:
# calculate the plot string len
# data_melt is the result of a boolean filter, so adding a column in place can
# trigger pandas' SettingWithCopyWarning; assign() returns a new frame instead
# and keeps the cell safe to re-run
data_melt = data_melt.assign(plot_string_len=data_melt['plot'].str.len())

In [None]:
# but let's use a boxplot: distribution of plot lengths per genre
fig, ax = plt.subplots(figsize=(20, 9))
g = sns.boxplot(
    data=data_melt,
    x='genre',
    y='plot_string_len',
    color='#dd00aa',
    ax=ax,
)
# log-scaled y axis, vertical genre labels
g.set(yscale="log")
plt.xticks(rotation=90)
sns.despine()

In [None]:
# inspect the strongest outlier: the plot text with the largest plot_string_len
longest_first = data_melt.sort_values('plot_string_len', ascending=False)
longest_first['plot'].head(1).values

In [None]:
# calculate similarity of genres
from scipy.spatial import distance as d

In [None]:
# filter the genre subset: every column whose name contains an uppercase letter
data_genres = data.filter(axis='columns', regex='[A-Z]')

In [None]:
data_genres.head()

In [None]:
# plot the similarity heatmap
# pairwise Jaccard distance between genre columns (transposed so that every
# genre is one observation), turned into a similarity via 1 - distance
jaccard_dist = d.squareform(d.pdist(data_genres.T, 'jaccard'))
res = 1 - jaccard_dist
sims = pd.DataFrame(res, index=data_genres.columns, columns=data_genres.columns)
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(sims, ax=ax)

# let's do some descriptive stats, man!

In [None]:
# ttest

In [1]:
# xkcd multiple testing

In [2]:
# multiple testing

In [None]:
# correlation

# how about training a predictive biomarker?

In [None]:
# import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
# was `LinearSVCxm`, a typo that made the whole cell fail with ImportError
from sklearn.svm import LinearSVC

In [None]:
# transformation and training pipeline
# The vectorizer created here is the one the pipeline uses (it used to be
# created and then left unused while the pipeline built a second instance).
tfidf = TfidfVectorizer()
classifier = Pipeline([
    ('tfidf', tfidf),
    # one-vs-rest wrapper around a random forest; the fixed random_state makes
    # a fresh Restart & Run All train the same model again
    ('clf', OneVsRestClassifier(RandomForestClassifier(random_state=0)))])

In [None]:
# just train on the top 1000 rows in the interest of time
# (the labels are cut with the same head(1000), so both must stay in sync)
train_x = data['plot'].head(1000)

In [None]:
# convert to a plain numpy array (just because sklearn needs it)
# DataFrame.as_matrix() was removed in pandas 1.0; .values works on old and
# new pandas versions alike
train_y = data_genres.head(1000).values

In [None]:
# fit the model
classifier.fit(train_x, train_y)

In [None]:
# let's fetch some movie descriptions from itunes
from urllib.parse import quote_plus

# quote_plus URL-encodes the search term: spaces become '+', and reserved or
# non-ASCII characters are percent-escaped, so any actor name gives a valid query
# (the former replace(' ', '+') only handled spaces)
actor = quote_plus('Cate Blanchett')
test = pd.read_json('https://itunes.apple.com/search?term={actor}&entity=movie&limit=200'.format(actor = actor))

In [None]:
# some tweaking of the input: keep only description and title of every result,
# as a two-column frame that shares the index of `test`
test_extract = pd.DataFrame(
    [
        {'plot': hit['longDescription'], 'title': hit['trackName']}
        for hit in test.results
    ],
    index=test.index,
)

In [None]:
test_extract

In [None]:
# predict probabilities
predicted = classifier.predict_proba(test_extract['plot'])

In [None]:
# convert to dataframe
predicted = pd.DataFrame(predicted, columns = data_genres.columns, index = test_extract.title)

In [None]:
# and plot: heatmap of the predicted values, one row per title, one column per genre
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(predicted, ax=ax)

In [None]:
from IPython.display import Javascript
# Serialise the predictions (title moved from the index into a column) as a
# JSON array of records and hand it to the browser as window.datatable, where
# the %%javascript cell reads it. The JavaScript runs client-side.
records_json = predicted.reset_index().to_json(orient='records')
Javascript("""
           window.datatable={};
           console.log('test');
           """.format(records_json))

In [None]:
predicted.reset_index()

In [None]:
%%javascript
element.append("<div id = 'lineuptable'></div>")
require(['./LineUpJS_bundle.min.js'],function(LineUpJS){
    const arr = window.datatable
    console.log(arr);
    const desc = [
              {label: 'title', type: 'string', column: 'title'},
              {label: 'Drama', type: 'number', column: 'Drama', 'domain': [0, 1]},
              {label: 'Romance', type: 'number', column: 'Romance', 'domain': [0, 1]}
     ]

    var colors = d3.scale.category10();
    console.log(colors);
    desc.forEach(function (d, i) {
      d.color = colors('' + i);
    });
    
    const p = new LineUpJS.provider.LocalDataProvider(arr, desc);    
     {
      const r = p.pushRanking();
         r.push(p.create(desc[0]));
          r.push((function () {
        const rstack = p.create(LineUpJS.model.createStackDesc('Stack'));
        rstack.push(p.create(desc[1]));
        rstack.push(p.create(desc[2]));
     
        rstack.setWeights([1, 1]);
        return rstack;
      })());
    
    }
    const instance = LineUpJS.create(p, document.getElementById('lineuptable'), {
     
    });
    instance.update();
});
