In [1]:
import ace_lib as ace
import helpful_functions as hf
import pandas as pd
import glob
import requests
import plotly.express as px
import os
import pandas as pd
from multiprocessing import Pool
import json

### Start session
Enter credentials once - they will be saved to local folder and loaded each time

In [2]:
s = ace.start_session()

Complete biometrics authentication and press any key to continue: 
https://api.worldquantbrain.com/authentication/persona?inquiry=inq_Bw1hsjEhYxKY6BLyxfAmwjXRjQwS



### Create list of alpha expressions
#### Step 1. Download datsets

In [10]:
# 设置不同的directory放不同的运行结果
def create_directory(directory):
    if not os.path.exists(directory):
        os.makedirs(directory)

def get_directory(region, factor_expression):
    directory = f'progress/{region}/{factor_expression}'
    create_directory(directory)
    return directory

def get_progress_file(region, factor_expression):
    directory = get_directory(region, factor_expression)
    return os.path.join(directory, 'progress.json')

def get_result_file(region, factor_expression, dataset_id):
    directory = get_directory(region, factor_expression)
    return os.path.join(directory, f'results_{dataset_id}.csv')


In [11]:
# 保存和加载字典的函数
def save_progress(progress, filename):
    with open(filename, 'w') as f:
        json.dump(progress, f)

def load_progress(filename):
    if os.path.exists(filename):
        try:
            with open(filename, 'r') as f:
                return json.load(f)
        except json.JSONDecodeError:
            print(f"Error reading {filename}. File might be corrupted. Recreating an empty progress file.")
            return {}
    return {}

In [12]:
datasets_df = hf.get_datasets(s) # by default we load all datasets USA TOP3000 delay 1
# datasets_df.tail() # DataFrame.head() shows first 5 rows of the dataframe 
print(len(datasets_df))


# 筛选潜在数据集
potential_datasets_df = datasets_df[
    (datasets_df["delay"] == 1) &
    (datasets_df["coverage"] > 0.7) & (datasets_df["coverage"] <= 1) &
    (datasets_df["fieldCount"] > 0) & (datasets_df["fieldCount"] < 3000) &
    (datasets_df["region"] == 'GLB') &
    (datasets_df["universe"] == 'MINVOL1M')
    # (datasets_df["userCount"] > 0) & (datasets_df["userCount"] < 100) &
    # (datasets_df["valueScore"] > 1) & (datasets_df["valueScore"] < 10)
]

region = 'GLB'  # 可以根据需要更改区域
factor_expression = 'ts_zscore({},12)'  # 可以根据需要更改因子表达式

print(len(potential_datasets_df))

88
50


#### Step 2. 对每个潜在数据集挖掘因子

In [13]:
# 模拟因子的函数
def simulate_factors(dataset_id, progress, region, factor_expression):
    try:
        result_file = get_result_file(region, factor_expression, dataset_id)
        if os.path.exists(result_file):
            print(f'Results for dataset {dataset_id} already exist. Loading existing results...')
            result_df = pd.read_csv(result_file)
        else:
            result_df = pd.DataFrame()
        
        datafields_df = hf.get_datafields(s, dataset_id=dataset_id)
        
        expression_list = [factor_expression.format(x) for x in datafields_df.id.values.tolist()]
        
        print(f'Processing {dataset_id}, Length: {len(expression_list)}:')
        field_progress = progress.get(str(dataset_id), {}).get("expressions", [])
        
        batch_size = 20  # 每次模拟的alpha数量
        for i in range(0, len(expression_list), batch_size):
            batch = expression_list[i:i + batch_size]
            batch_to_simulate = [expr for expr in batch if expr not in field_progress]
            
            if not batch_to_simulate:
                continue
            
            try:
                alphas = [ace.generate_alpha(expr, region=region, universe="MINVOL1M") for expr in batch_to_simulate]
                
                result = ace.simulate_alpha_list_multi(s, alphas)
                
                result_st2 = hf.prettify_result(result, clickable_alpha_id=False)
                result_df = pd.concat([result_df, result_st2], ignore_index=True)
                
                result_df.to_csv(result_file, index=False)
                
                field_progress.extend(batch_to_simulate)
                progress[str(dataset_id)] = {"status": "in_progress", "expressions": field_progress}
                save_progress(progress, get_progress_file(region, factor_expression))
                
            except Exception as e:
                print(f'Error processing batch {batch_to_simulate}: {e}')
        
        progress[str(dataset_id)]["status"] = "completed"
        save_progress(progress, get_progress_file(region, factor_expression))
        print(f'Results for dataset {dataset_id} processed successfully.')
        
    except Exception as e:
        print(f'Error processing dataset {dataset_id}: {e}')
        progress[str(dataset_id)] = {"status": "error", "expressions": field_progress}
        save_progress(progress, get_progress_file(region, factor_expression))

In [14]:
potential_datasets_df

Unnamed: 0,id,name,description,category,subcategory,region,delay,universe,coverage,turnover,valueScore,userCount,alphaCount,fieldCount,themes,researchPapers
0,analyst11,ESG scores,Environmental Social Governance scores that ex...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-esg', 'name': 'ESG'}",GLB,1,MINVOL1M,0.7917,,4.0,56,241,197,[],"[{'title': 'Research Paper 19: ESG Preference,..."
2,analyst15,Earnings forecasts,This dataset provides bottom-up forecast data ...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",GLB,1,MINVOL1M,0.9929,,2.0,139,1305,288,[],[]
3,analyst16,Real Time Estimates,This dataset provides real-time access to the ...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-crowdsourced-estimates', 'name...",GLB,1,MINVOL1M,0.9013,,2.0,104,564,42,[],[]
5,analyst39,Analyst estimates & financial ratios,This dataset provides a lot of information on ...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-esg', 'name': 'ESG'}",GLB,1,MINVOL1M,1.0,,3.0,43,83,5,[],[]
6,analyst4,Analyst Estimate Data for Equity,This dataset provides details and aggregations...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",GLB,1,MINVOL1M,0.7976,,3.0,29,75,18,[],[{'title': 'Research Paper 46: Time-Series and...
8,analyst44,Broker Estimates,This dataset integrates the detail estimates f...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-crowdsourced-estimates', 'name...",GLB,1,MINVOL1M,0.8173,,4.0,43,120,115,[],[]
9,analyst46,Analyst Investment insight Data,The dataset incorporates the investment insigh...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-crowdsourced-estimates', 'name...",GLB,1,MINVOL1M,0.7703,,3.0,38,69,5,[],[{'title': 'Research Paper 68: An Augmented q-...
10,analyst47,Alternative Analyst Investment Insight Data,The dataset incorporates the investment insigh...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-crowdsourced-estimates', 'name...",GLB,1,MINVOL1M,0.8409,,2.0,55,160,5,[],[]
11,analyst48,Dividend estimation data,Dividend estimation/history on general equity ...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",GLB,1,MINVOL1M,0.8096,,3.0,51,90,38,[],[]
12,analyst69,Fundamental Analyst Estimates,This dataset is a type of analyst dataset that...,"{'id': 'analyst', 'name': 'Analyst data'}","{'id': 'analyst-analyst-estimates', 'name': 'A...",GLB,1,MINVOL1M,0.9369,,2.0,196,1227,324,[],[]


In [15]:
# 加载进度字典
progress_file = get_progress_file(region, factor_expression)
progress = load_progress(progress_file)


# 过滤掉已完成的数据集，并保留在进度文件中标记为"completed"的数据集
dataset_ids = [
    dataset_id for dataset_id in datasets_df.id.values.tolist() 
    if str(dataset_id) not in progress or (isinstance(progress[str(dataset_id)], dict) and progress[str(dataset_id)].get("status") == "in_progress")
]
print(len(dataset_ids))
batch_size = 15  # 每批处理的数据集数量
batches = [dataset_ids[i:i + batch_size] for i in range(0, len(dataset_ids), batch_size)]


def run_simulation():
    for batch in batches:
        for dataset_id in batch:
            simulate_factors(dataset_id, progress, region, factor_expression)

run_simulation()
 

0


#### Step 3. 合并和分析结果

In [9]:
import glob

def analyze_results(region, factor_expression):
    directory = get_directory(region, factor_expression)
    all_results = pd.concat([pd.read_csv(f) for f in glob.glob(os.path.join(directory, 'results_*.csv'))])

    top_factors = all_results.sort_values(by='sharpe_ratio', ascending=False).head(10)
    print(top_factors)
    
    return top_factors

top_factors = analyze_results(region, factor_expression)

# 读取所有结果文件
all_results = pd.concat([pd.read_csv(f) for f in glob.glob('GLB_results_*.csv')])

# 进一步分析合并后的结果
# 例如筛选出表现最好的因子
top_factors = all_results.sort_values(by='sharpe', ascending=False)
top_factors


KeyError: 'sharpe_ratio'

### Change the expression - what would you do to improve alpha's results?
<br>Use your own logic to improve the alphas.</br>
<br>Here we will apply **group_rank** to overcome poor weight distribution.</br>

In [None]:
prospect_alphas = top_factors.loc[lambda x: x.fitness>0.2]['expression'].values
prospect_alphas
new_expression_list = ['group_rank(' + a + ',sector)' for a in prospect_alphas]
new_expression_list

In [None]:
#generating new simulation data for new expressions

new_alpha_list = [ace.generate_alpha(x, region= "USA", universe = "TOP3000",) for x in new_expression_list]

new_result = ace.simulate_alpha_list_multi(s, new_alpha_list)

In [None]:
result_st2 = hf.prettify_result(new_result, clickable_alpha_id=False)
result_st2

In [None]:
## We will join before and after dataframes by field

top_factors['field'] = top_factors['expression'].apply(lambda st: st[st.find("(vec_avg(")+1:st.find(",")]).copy()
result_st2['field'] = result_st2['expression'].apply(lambda st: st[st.find("(vec_avg(")+1:st.find(",")]).copy()

#merging results before and after improvement

compare_results = pd.merge(top_factors, result_st2, on='field', suffixes=('_before', '_after'))

#selecting column list returned by merging the two alpha results

col_list = ['fitness_before', 'fitness_after','sharpe_before', 'sharpe_after', 'alpha_id_before', 'alpha_id_after',
                 'expression_before', 'expression_after']

compare_results[col_list]\
    .style.format({'alpha_id_before': hf.make_clickable_alpha_id, 'alpha_id_after': hf.make_clickable_alpha_id})

### Check merged alpha performance

run the following code to do a before and after comparison of your merged pool post alpha submission

In [None]:
performance_comparison = ace.performance_comparison(s, result_st2['alpha_id'][0])

#### Step 4. 提交和监控

In [None]:
#to take a look at the combined result of all new alphas

is_tests_df = hf.concat_is_tests(new_result)
#making a list of failed alphas
failed_alphas = is_tests_df.query('result=="FAIL"')['alpha_id'].unique()

#making a list of passed alphas
passed_alphas = list(set(is_tests_df['alpha_id']).difference(failed_alphas))

print(f'Failed alphas:{failed_alphas}\nPassed alphas:{passed_alphas}')

for alpha_id in passed_alphas:
    submit_result = submit_alpha(s, alpha_id)
    print(f'Alpha {alpha_id} submitted: {submit_result}')
