### Import library

In [248]:
!pip install -r requirements.txt



In [249]:
import ace_lib as ace
import helpful_functions as hf
import pandas as pd
import requests
import plotly.express as px

### Start session
Enter credentials once - they will be saved to local folder and loaded each time

In [250]:
# Open the session object; `s` is passed as the first argument to every ace/hf call below
s = ace.start_session()

### Create list of alpha expressions
#### Step 1. Download datasets

In [251]:

datasets_df = hf.get_datasets(s) # by default we load all datasets USA TOP3000 delay 1
datasets_df.head(120) # DataFrame.head(120) shows up to the first 120 rows (head() with no argument shows 5)

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,valueScore,userCount,alphaCount,fieldCount,pyramidMultiplier,themes,researchPapers
0,analyst11,ESG scores,Environmental Social Governance scores that ex...,"{'id': 'analyst', 'name': 'Analyst'}","{'id': 'analyst-esg', 'name': 'ESG'}",USA,1,TOP3000,0.6818,2.0,267,3444,197,1.1,[],"[{'title': 'Research Paper 19: ESG Preference,..."
1,analyst12,Social Media Relation Dataset,Social media relation dataset,"{'id': 'analyst', 'name': 'Analyst'}","{'id': 'analyst-crowdsourced-estimates', 'name...",USA,1,TOP3000,0.6178,5.0,14,23,12,1.1,[],[]
2,analyst14,Estimations of Key Fundamentals,This dataset reports many items from financial...,"{'id': 'analyst', 'name': 'Analyst'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",USA,1,TOP3000,0.6203,2.0,590,37643,868,1.1,[],[{'title': 'Research Paper 10: Investor Learni...
3,analyst15,Earnings forecasts,This dataset provides bottom-up forecast data ...,"{'id': 'analyst', 'name': 'Analyst'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",USA,1,TOP3000,0.9892,2.0,681,69878,2538,1.1,[],[]
4,analyst16,Real Time Estimates,This dataset provides real-time access to the ...,"{'id': 'analyst', 'name': 'Analyst'}","{'id': 'analyst-crowdsourced-estimates', 'name...",USA,1,TOP3000,0.8252,2.0,165,2633,107,1.1,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115,model176,Non-Financial Metric Models,Ratings that quantify non-financial statement ...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-consumer-models', 'name': 'Consu...",USA,1,TOP3000,0.4978,3.0,73,375,23,1.2,[],[]
116,model182,Geographic network based model,Price momentum scores based on different model...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-momentum-models', 'name': 'Momen...",USA,1,TOP3000,0.5777,3.0,35,147,8,1.2,[],[]
117,model194,North America CDS Factor Model,Financial derivative that transfers credit ris...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-risk-models', 'name': 'Risk Mode...",USA,1,TOP3000,1.0000,2.0,126,1707,105,1.2,[],[]
118,model20,Fundamental & Technical Rank Model,This dataset weighs fundamental and technical ...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-technical-models', 'name': 'Tech...",USA,1,TOP3000,0.4582,2.0,25,169,1,1.2,[],[]


In [252]:
# Keep only the datasets that satisfy every condition below, best valueScore first.
# Further filters (e.g. on "name" or "category") can be and-ed into the mask if needed.
selected_datasets_df = (
    datasets_df
    .loc[
        lambda d: d["delay"].eq(1)
        & d["coverage"].gt(0.8) & d["coverage"].le(1)
        & d["fieldCount"].gt(0) & d["fieldCount"].lt(1000)
        & d["region"].eq('USA')
        & d["universe"].eq('TOP3000')
        & d["userCount"].gt(0) & d["userCount"].lt(100)
        & d["valueScore"].gt(1) & d["valueScore"].lt(100)
    ]
    .sort_values(by='valueScore', ascending=False)
)
selected_datasets_df

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,valueScore,userCount,alphaCount,fieldCount,pyramidMultiplier,themes,researchPapers
129,model238,SmartHoldings Model,This dataset is a global stock selection model...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-estimates-models', 'name': 'Esti...",USA,1,TOP3000,0.8198,7.0,1,1,11,1.2,[],[]
276,other631,Option Trade Activity Data,The dataset provides End-of-Day and Intra-Day ...,"{'id': 'other', 'name': 'Other'}","{'id': 'other-etf-models', 'name': 'ETF Models'}",USA,1,TOP3000,0.8297,7.0,11,16,171,1.3,[],[]
282,other733,Classification based on Business description,Classifying stocks on the basis of the their a...,"{'id': 'other', 'name': 'Other'}","{'id': 'other-analyst-models', 'name': 'Analys...",USA,1,TOP3000,0.8542,7.0,1,3,10,1.3,[],[]
32,earnings14,Upcoming Earnings Data,This particular dataset deals with earnings re...,"{'id': 'earnings', 'name': 'Earnings'}","{'id': 'earnings-earnings-estimates', 'name': ...",USA,1,TOP3000,0.8313,6.0,3,5,9,1.2,[],[]
89,macro52,ETF Constituents,Historical and Daily Holdings information of ETFs,"{'id': 'macro', 'name': 'Macro'}","{'id': 'macro-macroeconomic-activities', 'name...",USA,1,TOP3000,1.0000,6.0,3,5,3,1.0,[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,other193,Systematic Hedging for Investors to Evade Larg...,Machine learning based stock selection model. ...,"{'id': 'model', 'name': 'Model'}","{'id': 'model-mlai-models', 'name': 'ML/AI Mod...",USA,1,TOP3000,0.8845,2.0,98,520,3,1.2,[],[]
308,risk60,Securities Lending Insight Data,"The dataset provides list of metrics, covering...","{'id': 'risk', 'name': 'Risk'}","{'id': 'risk-risk-models', 'name': 'Risk Models'}",USA,1,TOP3000,0.9879,2.0,39,161,4,1.4,[],[]
290,pv104,Market Microstructure Data,The dataset is perfect to add extra flavor to ...,"{'id': 'pv', 'name': 'Price Volume'}","{'id': 'pv-price-volume', 'name': 'Price Volume'}",USA,1,TOP3000,0.9633,2.0,74,482,45,1.0,[],[]
338,socialmedia3,Twitter based sentiment data,Sentiment scores derived from social media. So...,"{'id': 'socialmedia', 'name': 'Social Media'}","{'id': 'socialmedia-social-media', 'name': 'So...",USA,1,TOP3000,0.9108,2.0,35,94,7,1.1,[],[{'title': 'Research Paper 45: Online reviews ...


For demonstration we are picking the dataset with the highest value score, this is not mandatory but definitely a recommended practice

#### Step 2. Select the needed datafields


In [253]:
# NOTE(review): position 58 is hard-coded, so the chosen dataset depends on how many rows pass the
# filter above and on their sort order; it raises IndexError when fewer than 59 rows are selected.
dataset_id = selected_datasets_df.id.values.tolist()[58] # take the id at position 58 of the selected dataset ids ('news38' in the recorded run)
dataset_id

'news38'

In [254]:
datafields_df = hf.get_datafields(s, dataset_id=dataset_id) # download the datafields of the dataset chosen above
datafields_df.head(25000) # display up to the first 25000 rows

Unnamed: 0,id,description,dataset,category,subcategory,region,delay,universe,type,coverage,userCount,alphaCount,pyramidMultiplier,themes
0,mws38_action,News action,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.9457,3,8,1.0,[]
1,mws38_entitlement,News entitlement ID (EID),"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.8503,4,5,1.0,[]
2,mws38_headlinetones_entitlement,News entitlement ID (EID),"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.8615,1,1,1.0,[]
3,mws38_headlinetones_sg_entitlement,News entitlement ID (EID),"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.8501,2,6,1.0,[]
4,mws38_headlinetones_sg_time,News time,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.85,1,2,1.0,[]
5,mws38_headlinetones_tc_entitlement,News entitlement ID (EID),"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.8501,0,0,1.0,[]
6,mws38_headlinetones_tc_time,News time,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.85,2,3,1.0,[]
7,mws38_headlinetones_time,News time,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.8503,0,0,1.0,[]
8,mws38_negative_freq,Negative frequency,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.6477,1,1,1.0,[]
9,mws38_negative_score,Negative score,"{'id': 'news38', 'name': 'News Analytic Model ...","{'id': 'news', 'name': 'News'}","{'id': 'news-news', 'name': 'News'}",USA,1,TOP3000,VECTOR,0.6477,1,1,1.0,[]


#### Step 3. Create expression list, using selected datafields
Here is an example: creating expressions that assign weights as per the time series skewness of vector average of the field value

In [255]:
# One alpha expression per datafield id, of the form ts_skewness(vec_avg(<id>),120)
expression_list = [
    'ts_skewness(vec_avg({}),120)'.format(field_id)
    for field_id in datafields_df["id"].tolist()
]

#### Step 4. Apply generate_alpha function to the expression list
In generate alpha function you can specify region, universe, decay, delay and other simulation settings

In [256]:
#every alpha sent in one list must share the same simulation settings;
#alphas with different settings belong in a separate list

alpha_list = [
    ace.generate_alpha(expression, region="USA", universe="TOP3000")
    for expression in expression_list
]

#inspect one of the generated alphas (position 20 of the list)
alpha_list[20]

{'type': 'REGULAR',
 'settings': {'instrumentType': 'EQUITY',
  'region': 'USA',
  'universe': 'TOP3000',
  'delay': 1,
  'decay': 0,
  'neutralization': 'INDUSTRY',
  'truncation': 0.08,
  'pasteurization': 'ON',
  'testPeriod': 'P0Y0M0D',
  'unitHandling': 'VERIFY',
  'nanHandling': 'OFF',
  'language': 'FASTEXPR',
  'visualization': False},
 'regular': 'ts_skewness(vec_avg(mws38_storyanalytics_score),120)'}

This is an example of what an alpha actually looks like when you send it to the platform.

### Simulate alpha list, get simulation result

simulate_alpha_list_multi will do a multi-simulation if list of alphas is greater than 10, which is the case here

the returned object will contain simulation results for all alphas as a list

In [257]:
#only the first 10 alphas of alpha_list are sent, for demonstration purposes

result = ace.simulate_alpha_list_multi(s, alpha_list[:10])

100%|██████████| 4/4 [02:43<00:00, 40.91s/it]


In [258]:
#one simulation result per alpha sent: alpha_list[:10] went in, so 10 results are expected
len(result)

10

__Accessing the result of the first alpha, let's take a look at all the keys of this dictionary__

In [259]:
#keys of the first alpha's result dictionary (index 0; index 1 would be the second alpha)
result[0].keys()

dict_keys(['alpha_id', 'simulate_data', 'is_stats', 'pnl', 'stats', 'is_tests', 'train', 'test'])

In [260]:
#the 'is_stats' key of a result holds a dataframe with the IS summary of that alpha (here: the first one)

result[0]['is_stats']

Unnamed: 0,pnl,bookSize,longCount,shortCount,turnover,returns,drawdown,margin,fitness,sharpe,startDate,alpha_id
0,-2146626,20000000,1466,1338,0.0487,-0.0215,0.2919,-0.000883,-0.18,-0.43,2012-07-15,6e6rX5p


In [261]:
#prettify_result from the helpful_functions library gathers the IS stats of all simulated alphas into one table
#(called here with detailed_tests_view=False)

result_st1 = hf.prettify_result(result, detailed_tests_view=False)
result_st1

Unnamed: 0,pnl,book_size,long_count,short_count,turnover,returns,drawdown,margin,fitness,sharpe,...,alpha_id,expression,concentrated_weight,high_turnover,low_2y_sharpe,low_fitness,low_sharpe,low_sub_universe_sharpe,low_turnover,matches_pyramid
0,7026382,20000000,1214,1487,0.0845,0.0704,0.6027,0.001666,0.23,0.3,...,vv7kwJw,"ts_skewness(vec_avg(mws38_negative_freq),120)",FAIL,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS
1,1207449,20000000,1288,1131,0.0574,0.0121,0.0646,0.000422,0.12,0.37,...,X8ZWQbx,"ts_skewness(vec_avg(mws38_negative_score),120)",PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,PASS
2,759492,20000000,1426,1378,0.0595,0.0077,0.1485,0.000258,0.04,0.16,...,R1xVjrd,"ts_skewness(vec_avg(mws38_action),120)",PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,PASS
3,-2146626,20000000,1466,1338,0.0487,-0.0215,0.2919,-0.000883,-0.18,-0.43,...,6e6rX5p,ts_skewness(vec_avg(mws38_headlinetones_sg_ent...,PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS
4,-2146626,20000000,1466,1338,0.0487,-0.0215,0.2919,-0.000883,-0.18,-0.43,...,X8ZWmjz,ts_skewness(vec_avg(mws38_headlinetones_tc_ent...,PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS
5,-1764477,20000000,1454,1350,0.062,-0.0177,0.2241,-0.00057,-0.24,-0.65,...,MLv1M88,"ts_skewness(vec_avg(mws38_headlinetones_time),...",PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,PASS
6,-2778545,20000000,1422,1382,0.0484,-0.0278,0.332,-0.001151,-0.27,-0.58,...,dRJOkxK,ts_skewness(vec_avg(mws38_headlinetones_entitl...,PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS
7,-2778545,20000000,1422,1382,0.0484,-0.0278,0.332,-0.001151,-0.27,-0.58,...,5kKQnwn,"ts_skewness(vec_avg(mws38_entitlement),120)",PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS
8,-2004492,20000000,1492,1312,0.0614,-0.0201,0.2639,-0.000654,-0.29,-0.73,...,vv7kwK3,ts_skewness(vec_avg(mws38_headlinetones_sg_tim...,PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,PASS
9,-2004492,20000000,1492,1312,0.0614,-0.0201,0.2639,-0.000654,-0.29,-0.73,...,NREQx6g,ts_skewness(vec_avg(mws38_headlinetones_tc_tim...,PASS,PASS,FAIL,FAIL,FAIL,PASS,PASS,PASS


### Visualise pnl of an alpha

In [262]:
# Fetch the PnL of the alpha labelled 0 in result_st1 and plot it over time.
alpha_pnl = ace.get_alpha_pnl(s, result_st1['alpha_id'][0])
# alpha_pnl's index is what gets plotted on the Date axis, so the first alpha_id has to be read
# by position: .iloc[0] replaces the deprecated positional fallback of alpha_id[0].
px.line(x = alpha_pnl.index, y = alpha_pnl.Pnl, title=f'<b>alpha_id={hf.make_clickable_alpha_id(alpha_pnl.alpha_id.iloc[0])}</b>')\
    .update_layout(xaxis_title="Date", yaxis_title="Pnl", title_x=0.5)


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



### Select prospect alphas, that are worth improving (in your opinion)
In this example we are selecting alphas with high fitness, to resimulate them with improved logic

In [263]:
# Expressions of the alphas whose fitness is above 0.2
prospect_alphas = result_st1.loc[result_st1['fitness'] > 0.2, 'expression'].values
prospect_alphas

array(['ts_skewness(vec_avg(mws38_negative_freq),120)'], dtype=object)

### Change the expression - what would you do to improve alpha's results?
<br>Use your own logic to improve the alphas.</br>
<br>Here we will apply **group_rank** to overcome poor weight distribution.</br>

In [264]:

# Wrap every prospect expression as group_rank(<expression>,sector)
new_expression_list = [
    ''.join(('group_rank(', prospect, ',sector)'))
    for prospect in prospect_alphas
]
new_expression_list

['group_rank(ts_skewness(vec_avg(mws38_negative_freq),120),sector)']

In [265]:
#generating new simulation data for the new expressions, with the same region and universe as the first batch

new_alpha_list = [ace.generate_alpha(x, region= "USA", universe = "TOP3000",) for x in new_expression_list]

### Re-simulation

In [266]:
#simulate the improved alphas; new_result is used below by prettify_result and concat_is_tests
new_result = ace.simulate_alpha_list_multi(s, new_alpha_list)



100%|██████████| 1/1 [01:37<00:00, 97.35s/it]


In [267]:
#summary table for the re-simulated alphas (called with clickable_alpha_id=False)
result_st2 = hf.prettify_result(new_result, clickable_alpha_id=False)
result_st2

Unnamed: 0,pnl,book_size,long_count,short_count,turnover,returns,drawdown,margin,fitness,sharpe,...,alpha_id,expression,concentrated_weight,high_turnover,low_2y_sharpe,low_fitness,low_sharpe,low_sub_universe_sharpe,low_turnover,matches_pyramid
0,-222879,20000000,1342,1358,0.0568,-0.0022,0.1052,-7.9e-05,-0.01,-0.07,...,Q9R3emG,group_rank(ts_skewness(vec_avg(mws38_negative_...,PASS,PASS,FAIL,FAIL,FAIL,FAIL,PASS,PASS


#### Compare alphas stats before and after changes:

In [268]:
## We will join before and after dataframes by field

def _extract_field(expression):
    """Return the join key cut out of an alpha expression.

    The key is the text that starts right after the '(' preceding 'vec_avg(' and
    ends just before the first comma, e.g. 'ts_skewness(vec_avg(x),120)' gives
    'vec_avg(x)'. Wrapping the expression, as in
    'group_rank(ts_skewness(vec_avg(x),120),sector)', yields the same key, which
    is what allows the before/after tables to be joined on it.
    """
    start = expression.find("(vec_avg(") + 1
    return expression[start:expression.find(",")]

# The same extraction is applied to both tables so their 'field' columns are comparable.
result_st1['field'] = result_st1['expression'].apply(_extract_field)
result_st2['field'] = result_st2['expression'].apply(_extract_field)

In [269]:
#join the before/after tables on 'field'; shared column names get the _before / _after suffixes

compare_results = result_st1.merge(
    result_st2,
    on='field',
    suffixes=('_before', '_after'),
)

In [270]:
#columns of the merged table to display, grouped as before/after pairs

col_list = [
    'fitness_before', 'fitness_after',
    'sharpe_before', 'sharpe_after',
    'alpha_id_before', 'alpha_id_after',
    'expression_before', 'expression_after',
]

#both alpha id columns are rendered through hf.make_clickable_alpha_id
compare_results[col_list].style.format(
    dict.fromkeys(['alpha_id_before', 'alpha_id_after'], hf.make_clickable_alpha_id)
)

Unnamed: 0,fitness_before,fitness_after,sharpe_before,sharpe_after,alpha_id_before,alpha_id_after,expression_before,expression_after
0,0.23,-0.01,0.3,-0.07,vv7kwJw,Q9R3emG,"ts_skewness(vec_avg(mws38_negative_freq),120)","group_rank(ts_skewness(vec_avg(mws38_negative_freq),120),sector)"


### Check merged alpha performance

run the following code to do a before and after comparison of your merged pool post alpha submission

In [271]:
#request the performance comparison for the alpha labelled 0 in result_st2
performance_comparison = ace.performance_comparison(s, result_st2['alpha_id'][0])

### How to submit?

Create a list of submittable alphas - alphas that have no FAIL in is_tests

In [272]:
#combine the IS test results of all new alphas into one dataframe and preview its first rows

is_tests_df = hf.concat_is_tests(new_result)
is_tests_df.head()

Unnamed: 0,alpha_id,limit,multiplier,name,pyramids,result,value
0,Q9R3emG,1.58,,LOW_SHARPE,,FAIL,-0.07
1,Q9R3emG,1.0,,LOW_FITNESS,,FAIL,-0.01
2,Q9R3emG,0.01,,LOW_TURNOVER,,PASS,0.0568
3,Q9R3emG,0.7,,HIGH_TURNOVER,,PASS,0.0568
4,Q9R3emG,,,CONCENTRATED_WEIGHT,,PASS,


In [273]:
#alpha ids that have at least one test with result FAIL
failed_alphas = is_tests_df.loc[is_tests_df['result'] == 'FAIL', 'alpha_id'].unique()

#alpha ids that have no failed test at all
passed_alphas = list(set(is_tests_df['alpha_id']) - set(failed_alphas))

print(f'Failed alphas:{failed_alphas}\nPassed alphas:{passed_alphas}')

Failed alphas:['Q9R3emG']
Passed alphas:[]


Once you have a list of submittable alphas, you can call the function submit_alpha()

In [274]:
#calling submit_alpha on all alphas that have passed the submission tests
#the function is reached through the ace alias: the bare name submit_alpha is never imported in this
#notebook, so it would raise NameError as soon as passed_alphas is not empty

submit_result = {alpha_id: ace.submit_alpha(s, alpha_id) for alpha_id in passed_alphas}

In [275]:
#submit_result maps each submitted alpha_id to the value returned by the submit_alpha call (empty when no alpha passed)

submit_result

{}

### Library Functions

The following are some other functions that you can use for your own analysis

**get_alpha_pnl(s, alpha_id)** - to get the pnl for an alpha

**get_alpha_yearly_stats(s, alpha_id)** - to get yearly statistics for an alpha

**get_self_corr(s, alpha_id)** - to get self correlation results for an alpha

**get_prod_corr(s, alpha_id)** - to get prod correlation results for an alpha

**get_check_submission(s, alpha_id)** - to get check submission result for an alpha

**check_self_corr_test(s, alpha_id)** - to check if alpha passes self correlation test (self_corr<0.7)

**check_prod_corr_test(s, alpha_id)** - to check if alpha passes prod correlation test (prod_corr<0.7)

**performance_comparison(s, alpha_id)** - to get the result of the performance comparison for an alpha's merged performance