### Import library

In [1]:
# One-time setup: uncomment the next line to install the packages listed in requirements.txt.
# pip install -r requirements.txt

In [2]:
import ace_lib as ace
import helpful_functions as hf
import pandas as pd
import requests
import plotly.express as px



### Start session
Enter your credentials once; they will be saved to a local folder and loaded automatically on each later run.

In [3]:
# Open the session object; `s` is passed as the first argument to the ace/hf calls below.
s = ace.start_session()

Complete biometrics authentication and press any key to continue: 
https://api.worldquantbrain.com/authentication/persona?inquiry=inq_8X3RzkHN3tqcCwHkerhwGGyfR5Av



### Create list of alpha expressions
#### Step 1. Download datasets

In [47]:
# Load the dataset catalogue (by default we load all datasets USA TOP3000 delay 1).
datasets_df = hf.get_datasets(s)
# DataFrame.tail() shows the last 5 rows of the dataframe
datasets_df.tail()

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,turnover,valueScore,userCount,alphaCount,fieldCount,themes,researchPapers
230,socialmedia5,Lexical Breakdown Data,Sentiment scores derived from social media. So...,socialmedia,socialmedia-social-media,USA,1,TOP3000,0.859,,1.0,52,491,12,[],[]
231,socialmedia8,Social Media Data for Equity,This is a sentiment dataset based on tweets or...,socialmedia,socialmedia-social-media,USA,1,TOP3000,0.8585,,1.0,1326,4009,2,[],[{'title': 'Research Paper 65: News Diffusion ...
232,socialmedia9,Social Media Activity Data,The dataset captures short-term estimated sent...,socialmedia,socialmedia-social-media,USA,1,TOP3000,0.8787,,3.0,10,28,4,[],[]
233,univ1,Universe Dataset,No dataset description,pv,pv-price-volume,USA,1,TOP3000,0.448,,3.0,14,41,4,[],[]
234,univ2,Universe Dataset,No dataset description,pv,pv-price-volume,USA,1,TOP3000,,,2.0,11,36,1,[],[]


In [48]:
# Keep only the datasets we want to explore: USA / TOP3000 / delay 1 news or
# analyst data with good coverage, a bounded number of fields and users, and a
# value score strictly between 1 and 20. Rows are ordered by value score, best first.
# Optional extra filter: datasets_df["name"].str.contains('news', case=False)
selected_datasets_df = (
    datasets_df
    .loc[lambda df: (
        df["delay"].eq(1)
        & df["region"].eq('USA')
        & df["universe"].eq('TOP3000')
        & df["category"].isin(['news', 'analyst'])
        & df["coverage"].gt(0.7) & df["coverage"].le(1)
        & df["fieldCount"].gt(0) & df["fieldCount"].lt(3000)
        & df["userCount"].gt(0) & df["userCount"].lt(500)
        & df["valueScore"].gt(1) & df["valueScore"].lt(20)
    )]
    .sort_values(by='valueScore', ascending=False)
)
selected_datasets_df

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,turnover,valueScore,userCount,alphaCount,fieldCount,themes,researchPapers
143,news85,News Sentiment Analysis using DNN,The dataset derived from applying neural netwo...,news,news-news-sentiment,USA,1,TOP3000,1.0,,4.0,5,5,1,[],[]
142,news84,Headline Sentiment Analysis using DNN,The dataset derived from applying neural netwo...,news,news-news-sentiment,USA,1,TOP3000,1.0,,4.0,6,7,1,[],[]
140,news77,Earnings Call Transcript Data,"The dataset provieds a rich set of scores, tag...",news,news-news-sentiment,USA,1,TOP3000,0.8079,,4.0,20,42,30,[],[]
137,news59,Corporate Events Data,The dataset is a collection of corporate event...,news,news-news-sentiment,USA,1,TOP3000,1.0,,4.0,6,23,2,[],[]
136,news55,Intermediate News Data,The dataset provides intermediate news data th...,news,news-news,USA,1,TOP3000,1.0,,4.0,8,10,6,[],[]
133,news51,Aggregared News Data,The dataset provides aggregated news from diff...,news,news-news,USA,1,TOP3000,0.961,,4.0,9,45,19,[],[]
130,news48,Global Media News Data,The dataset provides news and information from...,news,news-news-sentiment,USA,1,TOP3000,0.9678,,4.0,21,69,22,[],[]
125,news35,Financial News Data,The dataset provides NLP model output for arti...,news,news-news,USA,1,TOP3000,1.0,,4.0,5,7,2,[],[]
126,news36,News Analytics Data,The dataset consists of real-time news analyti...,news,news-news-sentiment,USA,1,TOP3000,1.0,,4.0,15,34,14,[],[]
127,news38,News Analytic Model Data,The dataset provides datafields from news anal...,news,news-news,USA,1,TOP3000,0.8204,,3.0,32,87,50,[],[]


In [None]:
# Filter potential datasets: a stricter variant of the selection (higher
# coverage, fewer fields and users, value score below 10) with no category filter.
potential_datasets_df = datasets_df.loc[
    datasets_df["delay"].eq(1)
    & datasets_df["region"].eq('USA')
    & datasets_df["universe"].eq('TOP3000')
    & datasets_df["coverage"].gt(0.8) & datasets_df["coverage"].le(1)
    & datasets_df["fieldCount"].gt(0) & datasets_df["fieldCount"].lt(1000)
    & datasets_df["userCount"].gt(0) & datasets_df["userCount"].lt(100)
    & datasets_df["valueScore"].gt(1) & datasets_df["valueScore"].lt(10)
]

For demonstration we pick the dataset with the highest value score. This is not mandatory, but it is definitely a recommended practice.

#### Step 2. Select the needed datafields


In [59]:
# Take the id in the top row of the selection (which is sorted by valueScore, descending).
dataset_id = selected_datasets_df["id"].tolist()[0]
dataset_id

'news85'

In [60]:
# Download all datafields of the chosen dataset and preview the first rows.
datafields_df = hf.get_datafields(s, dataset_id=dataset_id)
datafields_df.head()

Unnamed: 0,id,description,dataset,category,subcategory,region,delay,universe,type,coverage,turnover,userCount,alphaCount,themes
0,mws85_sentiment,Sentiment based on news,"{'id': 'news85', 'name': 'News Sentiment Analy...",news,news-news-sentiment,USA,1,TOP3000,VECTOR,1.0,,5,5,[]


#### Step 3. Create expression list, using selected datafields
Here is an example: creating expressions that assign weights as per the time series skewness of vector average of the field value

In [51]:
# Build one alpha expression per datafield id: ts_skewness(vec_avg(<field>),120).
expression_list = [
    f'ts_skewness(vec_avg({field_id}),120)'
    for field_id in datafields_df["id"].tolist()
]

#### Step 4. Apply generate_alpha function to the expression list
In the generate_alpha function you can specify region, universe, decay, delay and other simulation settings

In [54]:
# All alphas sent together in one list must share the same simulation settings;
# alphas with different settings belong in a separate list. Every alpha built
# here uses region USA and universe TOP3000.

alpha_list = [
    ace.generate_alpha(expression, region="USA", universe="TOP3000")
    for expression in expression_list
]
print(len(alpha_list))

1


This is an example of what an alpha actually looks like when you send it to the platform.

### Simulate alpha list, get simulation result

simulate_alpha_list_multi will do a multi-simulation if the list contains more than 10 alphas. Note that the run shown below contains only a single alpha.

the returned object will contain simulation results for all alphas as a list

In [53]:
# Only the first 20 alphas are sent for simulation, to keep the demonstration short.

result = ace.simulate_alpha_list_multi(s, alpha_list[:20])



100%|██████████| 1/1 [00:43<00:00, 43.39s/it]


In [55]:
# Number of simulation results received; it cannot exceed the number of alphas that were sent.
len(result)

1

__Accessing the result of the first alpha, let's take a look at all the keys of this dictionary__

In [56]:
# List the keys stored in the first simulation result.
result[0].keys()

dict_keys(['alpha_id', 'simulate_data', 'is_stats', 'pnl', 'stats', 'is_tests', 'train', 'test'])

In [57]:
# The 'is_stats' entry is a dataframe holding the IS summary results of the alpha.

result[0]['is_stats']

Unnamed: 0,pnl,bookSize,longCount,shortCount,turnover,returns,drawdown,margin,fitness,sharpe,startDate,alpha_id
0,-523426,20000000,1395,1645,0.0812,-0.0052,0.115,-0.000129,-0.04,-0.19,2012-01-22,LpPoWne


In [58]:
# hf.prettify_result gathers the IS stats of all simulated alphas into one dataframe;
# detailed_tests_view=False is passed to it here.

result_st1 = hf.prettify_result(result, detailed_tests_view=False)
result_st1

Unnamed: 0,pnl,book_size,long_count,short_count,turnover,returns,drawdown,margin,fitness,sharpe,...,expression,concentrated_weight,high_turnover,is_ladder_sharpe,low_fitness,low_sharpe,low_sub_universe_sharpe,low_turnover,matches_competition,matches_themes
0,-523426,20000000,1395,1645,0.0812,-0.0052,0.115,-0.000129,-0.04,-0.19,...,"ts_skewness(vec_avg(mws84_sentiment),120)",PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING


### Visualise pnl of an alpha

In [30]:
# Fetch the pnl series of the first alpha in the results table and plot it.
# `.iloc[0]` is used for "first row" lookups: indexing a Series with the integer
# 0 when its index is not integer-based falls back to positional access, which
# pandas has deprecated (it emits a FutureWarning and will later treat 0 as a label).
alpha_pnl = ace.get_alpha_pnl(s, result_st1['alpha_id'].iloc[0])
(
    px.line(
        x=alpha_pnl.index,
        y=alpha_pnl.Pnl,
        title=f'<b>alpha_id={hf.make_clickable_alpha_id(alpha_pnl.alpha_id.iloc[0])}</b>',
    )
    .update_layout(xaxis_title="Date", yaxis_title="Pnl", title_x=0.5)
)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



### Select prospect alphas, that are worth improving (in your opinion)
In this example we select the alphas with high fitness, in order to re-simulate them with improved logic

In [31]:
# Expressions of the alphas whose fitness exceeds 0.2, as a numpy array.
prospect_alphas = result_st1.loc[result_st1['fitness'] > 0.2, 'expression'].values
prospect_alphas

array(['ts_skewness(vec_avg(mws36_novelty_oldest_span),120)',
       'ts_skewness(vec_avg(mws36_relevance),120)',
       'ts_skewness(vec_avg(mws36_sentiment_phrase_positive),120)',
       'ts_skewness(vec_avg(mws36_sentiment_positive_confidence),120)',
       'ts_skewness(vec_avg(mws36_novelty),120)',
       'ts_skewness(vec_avg(mws36_novelty_newest_span),120)'],
      dtype=object)

### Change the expression - what would you do to improve alpha's results?
<br>Use your own logic to improve the alphas.</br>
<br>Here we will apply **group_rank** to overcome poor weight distribution.</br>

In [32]:

# Wrap every prospect expression as group_rank(<expression>,sector).
new_expression_list = [f'group_rank({expression},sector)' for expression in prospect_alphas]
new_expression_list

['group_rank(ts_skewness(vec_avg(mws36_novelty_oldest_span),120),sector)',
 'group_rank(ts_skewness(vec_avg(mws36_relevance),120),sector)',
 'group_rank(ts_skewness(vec_avg(mws36_sentiment_phrase_positive),120),sector)',
 'group_rank(ts_skewness(vec_avg(mws36_sentiment_positive_confidence),120),sector)',
 'group_rank(ts_skewness(vec_avg(mws36_novelty),120),sector)',
 'group_rank(ts_skewness(vec_avg(mws36_novelty_newest_span),120),sector)']

In [33]:
# Build the simulation payloads for the improved expressions, with the same
# region and universe settings as the first batch.

new_alpha_list = [
    ace.generate_alpha(expression, region="USA", universe="TOP3000")
    for expression in new_expression_list
]

### Re-simulation

In [34]:
# Simulate the improved alphas; the results come back as a list.
new_result = ace.simulate_alpha_list_multi(s, new_alpha_list)



100%|██████████| 6/6 [01:56<00:00, 19.47s/it]


In [35]:
# Tabulate the re-simulation results.
# NOTE(review): clickable_alpha_id=False presumably keeps alpha ids as plain strings,
# which the later lookups on result_st2['alpha_id'] rely on — confirm against helpful_functions.
result_st2 = hf.prettify_result(new_result, clickable_alpha_id=False)
result_st2

Unnamed: 0,pnl,book_size,long_count,short_count,turnover,returns,drawdown,margin,fitness,sharpe,...,expression,concentrated_weight,high_turnover,is_ladder_sharpe,low_fitness,low_sharpe,low_sub_universe_sharpe,low_turnover,matches_competition,matches_themes
0,894783,20000000,1037,1036,0.0928,0.0223,0.0544,0.00048,0.32,0.75,...,group_rank(ts_skewness(vec_avg(mws36_relevance...,FAIL,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING
1,1934904,20000000,1505,1505,0.0745,0.0194,0.0301,0.00052,0.31,0.78,...,"group_rank(ts_skewness(vec_avg(mws36_novelty),...",PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING
2,1862981,20000000,1305,1296,0.0749,0.0187,0.0637,0.000498,0.27,0.7,...,group_rank(ts_skewness(vec_avg(mws36_novelty_o...,PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING
3,1980504,20000000,1305,1296,0.0743,0.0198,0.0839,0.000534,0.27,0.68,...,group_rank(ts_skewness(vec_avg(mws36_novelty_n...,PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING
4,1472345,20000000,1546,1543,0.0658,0.0147,0.0438,0.000448,0.2,0.57,...,group_rank(ts_skewness(vec_avg(mws36_sentiment...,PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,WARNING,WARNING
5,725887,20000000,1543,1542,0.0664,0.0073,0.0831,0.000219,0.07,0.29,...,group_rank(ts_skewness(vec_avg(mws36_sentiment...,PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,WARNING,WARNING


#### Compare alphas stats before and after changes:

In [36]:
## We will join the before and after dataframes by datafield.
# The join key is the datafield id found inside vec_avg(...). A regular expression
# is used rather than str.find() slicing, because find() returns -1 when the
# marker is absent and would then silently produce an unrelated substring.
# Expressions that contain no vec_avg(<field>) get a missing value in 'field'.
field_pattern = r'vec_avg\(([^(),]+)\)'

result_st1['field'] = result_st1['expression'].str.extract(field_pattern, expand=False)
result_st2['field'] = result_st2['expression'].str.extract(field_pattern, expand=False)

In [37]:
# Pair each alpha's stats before and after the improvement on the shared 'field'
# key; overlapping column names get the _before / _after suffixes.

compare_results = result_st1.merge(result_st2, on='field', suffixes=('_before', '_after'))

In [38]:
# Columns of the merged table to show side by side, before vs. after.

col_list = [
    'fitness_before', 'fitness_after',
    'sharpe_before', 'sharpe_after',
    'alpha_id_before', 'alpha_id_after',
    'expression_before', 'expression_after',
]

# Both alpha id columns are formatted with hf.make_clickable_alpha_id.
(
    compare_results[col_list]
    .style
    .format(dict.fromkeys(['alpha_id_before', 'alpha_id_after'], hf.make_clickable_alpha_id))
)

Unnamed: 0,fitness_before,fitness_after,sharpe_before,sharpe_after,alpha_id_before,alpha_id_after,expression_before,expression_after
0,0.34,0.27,0.4,0.7,ng155pa,8zmx0AV,"ts_skewness(vec_avg(mws36_novelty_oldest_span),120)","group_rank(ts_skewness(vec_avg(mws36_novelty_oldest_span),120),sector)"
1,0.33,0.32,0.76,0.75,Vpvbb1w,Gwq52J5,"ts_skewness(vec_avg(mws36_relevance),120)","group_rank(ts_skewness(vec_avg(mws36_relevance),120),sector)"
2,0.3,0.07,0.36,0.29,p6qeemb,m8PWG26,"ts_skewness(vec_avg(mws36_sentiment_phrase_positive),120)","group_rank(ts_skewness(vec_avg(mws36_sentiment_phrase_positive),120),sector)"
3,0.26,0.2,0.67,0.57,LpPYYj6,Gwq52k0,"ts_skewness(vec_avg(mws36_sentiment_positive_confidence),120)","group_rank(ts_skewness(vec_avg(mws36_sentiment_positive_confidence),120),sector)"
4,0.25,0.31,0.66,0.78,eOlZZmN,EwZPjG1,"ts_skewness(vec_avg(mws36_novelty),120)","group_rank(ts_skewness(vec_avg(mws36_novelty),120),sector)"
5,0.21,0.27,0.29,0.68,m8PWW39,LpPYvEe,"ts_skewness(vec_avg(mws36_novelty_newest_span),120)","group_rank(ts_skewness(vec_avg(mws36_novelty_newest_span),120),sector)"


### Check merged alpha performance

run the following code to do a before and after comparison of your merged pool post alpha submission

In [39]:
# Request the performance comparison for the alpha id stored under index label 0 of result_st2.
performance_comparison = ace.performance_comparison(s, result_st2['alpha_id'][0])

### How to submit?

Create a list of submittable alphas - alphas that have no FAIL in is_tests

In [40]:
# Combine the IS test results of all re-simulated alphas into one dataframe and preview it.

is_tests_df = hf.concat_is_tests(new_result)
is_tests_df.head()

Unnamed: 0,alpha_id,date,endDate,limit,matched,name,result,startDate,themes,unmatched,value,year
0,Gwq52J5,,,1.58,,LOW_SHARPE,FAIL,,,,0.75,
1,Gwq52J5,,,1.0,,LOW_FITNESS,FAIL,,,,0.32,
2,Gwq52J5,,,0.01,,LOW_TURNOVER,PASS,,,,0.0928,
3,Gwq52J5,,,0.7,,HIGH_TURNOVER,PASS,,,,0.0928,
4,Gwq52J5,2017-02-22,,0.1,,CONCENTRATED_WEIGHT,FAIL,,,,0.174242,


In [41]:
# Alpha ids with at least one FAIL among their IS tests
failed_alphas = is_tests_df.loc[is_tests_df['result'] == 'FAIL', 'alpha_id'].unique()

# Every remaining alpha id counts as passed
passed_alphas = list(set(is_tests_df['alpha_id']).difference(failed_alphas))

print(f'Failed alphas:{failed_alphas}\nPassed alphas:{passed_alphas}')

Failed alphas:['Gwq52J5' '8zmx0AV' 'm8PWG26' 'Gwq52k0' 'EwZPjG1' 'LpPYvEe']
Passed alphas:[]


Once you have a list of submittable alphas, you can call the function submit_alpha()

In [42]:
# Submit every alpha that passed the submission tests and keep each call's return
# value, keyed by alpha id. The function is reached through the `ace` alias because
# this notebook only does `import ace_lib as ace`; the bare name `submit_alpha` is
# never imported and would raise NameError as soon as passed_alphas is non-empty.
# NOTE(review): assumes ace_lib exposes submit_alpha like its other helpers — confirm.

submit_result = {alpha_id: ace.submit_alpha(s, alpha_id) for alpha_id in passed_alphas}

In [None]:
# submit_result maps each submitted alpha id to the value returned by the submit_alpha call

submit_result

{}

### Library Functions

The following are some other functions that you can use for your own analysis

**get_alpha_pnl(s, alpha_id)** - to get the pnl for an alpha

**get_alpha_yearly_stats(s, alpha_id)** - to get yearly statistics for an alpha

**get_self_corr(s, alpha_id)** - to get self correlation results for an alpha

**get_prod_corr(s, alpha_id)** - to get prod correlation results for an alpha

**get_check_submission(s, alpha_id)** - to get check submission result for an alpha

**check_self_corr_test(s, alpha_id)** - to check if alpha passes self correlation test (self_corr<0.7)

**check_prod_corr_test(s, alpha_id)** - to check if alpha passes prod correlation test (prod_corr<0.7)

**performance_comparison(s, alpha_id)** - to get the performance comparison result for an alpha (merged performance)