In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Aug  9 15:54:38 2019

@author: hurenjie
"""

#%%
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
import tushare as ts
import urllib,time
# import talib
# from talib import MA_Type

In [2]:

#%%
def MACD_MA_next_day_prediction(df, ma_period=5, MAtype = 0, fast_period=12, slow_period=26, signal_period=9, return_date=False, print_latest = False):
    '''
    函数说明:
    MACD的值今天相较于昨天上涨，同时MA斜率今天相较于昨天向上上涨，则下一天价格上涨的概率
    
    参数说明:
    ma_period:
    计算MA的时间窗口
    MAtype:
    MA的类型，参见talib.MA_Type中的各种类型
    fast_period:
    计算MACD快线的时间窗口长度
    slow_period:
    计算MACD慢线的时间窗口长度
    signal_period:
    计算MACD信号的时间窗口长度
    return_date:
    等于True时返回输入的数据中MACD和MA同时上升或者下降的时间点
    print_latest:
    等于True时返回输入的数据中最近的一个时间点MACD和MA是否同时上升或者下降
    '''
    ma = talib.MA(df['close'], timeperiod = ma_period, matype = MAtype)
    
    macd, macdsignal, macdhist = talib.MACD(df['close'], 
                                    fastperiod=fast_period, 
                                    slowperiod=slow_period, 
                                    signalperiod=signal_period)
    df['MACD_diff'] = macd.diff()
    df['MA_diff'] = ma.diff()
    df['close_diff_shift(-1)'] = df['close'].diff().shift(-2) # 明天的价格是否上涨
    
    MACD_up_next_day = df[df['MACD_diff'] > 0]
    MACD_down_next_day = df[df['MACD_diff'] < 0]
    
    MACD_MA_up_next_day = MACD_up_next_day[MACD_up_next_day['MA_diff'] > 0]
    MACD_MA_down_next_day = MACD_down_next_day[MACD_down_next_day['MA_diff'] < 0]
    
    MACD_MA_close_up_next_day = MACD_MA_up_next_day[MACD_MA_up_next_day['close_diff_shift(-1)'] > 0]
    MACD_MA_close_down_next_day = MACD_MA_down_next_day[MACD_MA_down_next_day['close_diff_shift(-1)'] < 0]
    
    if print_latest == True: # 当 print_latest == True 的时候，我们要打印出以下的内容
        if (df.index[-1] in MACD_MA_up_next_day.index):
            print('输入的数据最近一天在MACD与MA'+str(ma_period)+'上升的阶段')
        elif (df.index[-1] in MACD_MA_down_next_day.index):
            print('输入的数据最近一天在MACD与MA'+str(ma_period)+'下跌的阶段')
        else:
            print('输入的数据最近一天既不在MACD与MA'+str(ma_period)+'上升的阶段'+
                  '也不在MACD与MA'+str(ma_period)+'下跌的阶段')
    
    
    try:
        MACD_MA_close_up_next_day_prob = MACD_MA_close_up_next_day.shape[0]/MACD_MA_up_next_day.shape[0]
        MACD_MA_close_down_next_day_prob = MACD_MA_close_down_next_day.shape[0]/MACD_MA_down_next_day.shape[0]
    except:
        print('数据中不存在MACD和MA同时上涨或者下跌时，第二天价格上涨或者下跌的状况')
    else:
        if return_date == False:
            return {'输入的数据中MACD和MA'+ str(ma_period) + '同时上升时第二天价格上升的概率':MACD_MA_close_up_next_day_prob,
                    '输入的数据中MACD和MA'+ str(ma_period) + '同时下跌时第二天价格下跌的概率':MACD_MA_close_down_next_day_prob}
            
        elif return_date == True:
            return {'输入的数据中MACD和MA'+ str(ma_period) + '同时上升时第二天价格上升的概率':MACD_MA_close_up_next_day_prob,
                    '输入的数据中MACD和MA'+ str(ma_period) + '同时下跌时第二天价格下跌的概率':MACD_MA_close_down_next_day_prob,
                    '输入的数据中MACD和MA'+ str(ma_period) + '同时上升时第二天价格上升的时间点':list(MACD_MA_up_next_day.index),
                    '输入的数据中MACD和MA'+ str(ma_period) + '同时下跌时第二天价格下跌的时间点':list(MACD_MA_down_next_day.index) }
    
#%%
def MACD_diff_next_day_prediction(df, fast_period=12, slow_period=26, signal_period=9, return_date=False, print_latest = False):
    '''
    函数说明:
    MACD值当天的值小于前一天时，后一天的close小于当天close的概率/次数
    
    参数说明:
    fast_period:
    计算MACD快线的时间窗口长度
    slow_period:
    计算MACD慢线的时间窗口长度
    signal_period:
    计算MACD信号的时间窗口长度
    return_date:
    等于True时返回输入的数据中MACD上升或者下降的时间点
    print_latest:
    等于True时返回输入的数据中最近的一个时间点MACD是否上升或者下降
    '''
    df['close_diff'] = df['close'].diff().shift(-2)
    macd, macdsignal, macdhist = talib.MACD(df['close'], 
                                        fastperiod=fast_period, 
                                        slowperiod=slow_period, 
                                        signalperiod=signal_period)
    df['MACD'] = macd
    df['MACD_diff'] = macd.shift(-1)
    
    MACD_nextday_up = df[df['MACD_diff'] > 0]
    MACD_nextday_down = df[df['MACD_diff'] < 0]
    
    MACD_nextday_close_up = MACD_nextday_up[MACD_nextday_up['close_diff'] > 0]
    MACD_nextday_close_down = MACD_nextday_down[MACD_nextday_down['close_diff'] < 0]
    if print_latest == True:
        if (df['MACD_diff'][-2]>0):
            print(str(df.index[-1].date()) + 'MACD' + '相对于' + str(df.index[-2].date()) + '上升')
        elif (df['MACD_diff'][-2]<0):
            print(str(df.index[-1].date()) + 'MACD' + '相对于' + str(df.index[-2].date()) + '下降')
        
    try:
        MACD_nextday_close_up_prob = MACD_nextday_close_up.shape[0]/MACD_nextday_up.shape[0]
        MACD_nextday_close_down_prob = MACD_nextday_close_down.shape[0]/MACD_nextday_down.shape[0]   
    except:
        print('数据中出现金叉或者死叉的时间点并没有在第二天发生价格的下跌')
#        df.drop(columns = ['close_diff', 'MACD', 'MACD_diff'], inplace = True)
    else:
#        df.drop(columns = ['close_diff', 'MACD', 'MACD_diff'], inplace = True)
        if return_date == False:
            return {'输入的数据中MACD上升时第二天价格上涨的概率':MACD_nextday_close_up_prob,
                    '输入的数据中MACD上升时第二天价格下跌的概率':MACD_nextday_close_down_prob}
        elif return_date == True:
            return {'输入的数据中MACD上升时第二天价格上涨的概率':MACD_nextday_close_up_prob,
                    '输入的数据中MACD下降时第二天价格下跌的概率':MACD_nextday_close_down_prob,
                    '输入的数据中MACD上升的时间点':list(MACD_nextday_up.index),
                    '输入的数据中MACD下降的时间点':list(MACD_nextday_down.index)}
            
#%%
def MACD_golden_or_dead_next_day_prediction(df, fast_period=12, slow_period=26, signal_period=9, return_date=False, print_latest = False):
    '''
    函数说明:
    MACD出现金叉/死叉（的状态）时，下一天对应的上涨/下跌的概率是多少
    
    参数说明:
    fast_period:
    计算MACD快线的时间窗口长度
    slow_period:
    计算MACD慢线的时间窗口长度
    signal_period:
    计算MACD信号的时间窗口长度
    return_date:
    等于True时返回输入的数据中MACD出现金叉或者死叉的时间点
    print_latest:
    等于True时返回输入的数据中最近的一个时间点MACD是否出现金叉或者死叉
    '''
    df['close_diff'] = df['close'].diff().shift(-1)
    macd, macdsignal, macdhist = talib.MACD(df['close'], 
                                            fastperiod=fast_period, 
                                            slowperiod=slow_period, 
                                            signalperiod=signal_period)
    
    df['MACD'] = macd     
    df['MACD lag 1'] = macd.shift(1) # 把MACD的值滞后一天
    
    MACD_golden_cross = df[(df['MACD lag 1']<=0) & (df['MACD'] >= 0)]
    MACD_dead_cross = df[(df['MACD lag 1']>=0) & (df['MACD'] <= 0)]
    
    golden_cross_nextday_up_days = MACD_golden_cross[MACD_golden_cross['close_diff'] > 0]
    dead_cross_nextday_down_days = MACD_dead_cross[MACD_dead_cross['close_diff'] < 0]
    
    if print_latest == True:
        if (df.index[-1] not in MACD_golden_cross.index) & (df.index[-1] not in MACD_dead_cross.index):
            print(str(df.index[-1].date()) + '并没有发生金叉或者死叉的现象')
        elif (df.index[-1] in MACD_golden_cross.index):
            print(str(df.index[-1].date()) + '发生金叉')
        elif (df.index[-1] in MACD_dead_cross.index):
            print(str(df.index[-1].date()) + '发生死叉')
    
    try:
        golden_cross_nextday_up_prob = golden_cross_nextday_up_days.shape[0]/MACD_golden_cross.shape[0]
        dead_cross_nextday_down_prob = dead_cross_nextday_down_days.shape[0]/MACD_dead_cross.shape[0]
    except:
            print('数据不满足判断金叉死叉的条件，或者在数据中金叉死叉发生后并没有回归的现象发生')
            df.drop(columns = ['MACD', 'MACD lag 1', 'close_diff'], inplace = True)
    else:
            
        df.drop(columns = ['MACD', 'MACD lag 1', 'close_diff'], inplace = True)
        
        if return_date == False:
            return {'输入的数据中MACD出现金叉时下一天对应的上涨概率':golden_cross_nextday_up_prob, 
                    '输入的数据中MACD出现死叉时下一天对应的下跌概率':dead_cross_nextday_down_prob}
        elif return_date == True:
            return {'输入的数据中MACD出现金叉时下一天对应的上涨概率':golden_cross_nextday_up_prob, 
                    '输入的数据中MACD出现死叉时下一天对应的下跌概率':dead_cross_nextday_down_prob,
                    '输入的数据中MACD出现金叉的时间点':list(MACD_golden_cross.index),
                    '输入的数据中MACD出现死叉的时间点':list(MACD_dead_cross.index)}

In [2]:
#%%
def bias_next_day_prediction(df, zscorethreshold, bias_time_window=30, zscore_time_window=30, return_date = False, print_latest = False):
    '''
    函数说明:
    乖离率绝对值太大时，或者大到什么幅度时，大概率回归(通过zscorethreshold来确定)？
    
    参数说明:
    zscorethreshold:
    zscore的阈值，当超过这个阈值时认为乖离率偏离较大
    bias_time_window:
    计算bias的时间窗口长度
    zscore_time_window:
    计算zscore的时间窗口长度
    return_date:
    等于True时返回输入的数据中zscore超过阈值zscorethreshold的时间点
    print_latest:
    输入的数据中,最近的时间点处是否超过zscore阈值zscorethreshold
    '''
    df['close_diff'] = df['close'].diff().shift(-1)
    
#    compute bias
    ave = df['close'].rolling(bias_time_window).mean()
    bias = (df['close']-ave)/ave
    
#    compute zscore
    df['bias'] = bias
    df['zscore'] = (df['bias'] - df['bias'].rolling(zscore_time_window).mean())/df['bias'].rolling(zscore_time_window).apply(np.std)
    
    larger_than_positive_threshold = df[df['zscore'] >= zscorethreshold]
    smaller_than_negative_threshold = df[df['zscore'] <= zscorethreshold]
    
    larger_than_positive_threshold_nextday_down = larger_than_positive_threshold[larger_than_positive_threshold['close_diff']<0]
    smaller_than_negative_threshold_nextday_up = smaller_than_negative_threshold[smaller_than_negative_threshold['close_diff']>0]
    
    if print_latest == True:
        if (df.index[-1] not in larger_than_positive_threshold.index) & (df.index[-1] not in smaller_than_negative_threshold.index):
            print(str(df.index[-1].date()) + '并没有发生z-score超出阈值'+str(zscorethreshold)+'的现象')
        elif (df.index[-1] in larger_than_positive_threshold.index):
            print(str(df.index[-1].date()) + '向上超出了z-score设定的阈值'+str(zscorethreshold))
        elif (df.index[-1] in smaller_than_negative_threshold.index):
            print(str(df.index[-1].date()) + '向下超出了z-score设定的阈值'+str(zscorethreshold))
    
    try:
        larger_than_positive_threshold_nextday_down_prob = larger_than_positive_threshold_nextday_down.shape[0]/larger_than_positive_threshold.shape[0]
        smaller_than_negative_threshold_nextday_up_prob = smaller_than_negative_threshold_nextday_up.shape[0]/smaller_than_negative_threshold.shape[0]
#        df.drop(columns = ['close_diff', 'bias', 'zscore'], inplace =  True)
    except:
        print('阈值选取不合适，请选取小一点的阈值 zscorethreshold')
        df.drop(columns = ['close_diff', 'bias', 'zscore'], inplace =  True)
    else:
        df.drop(columns = ['close_diff', 'bias', 'zscore'], inplace =  True)
        if return_date == False:
            return {'输入的数据中乖离率zscore大于'+str(zscorethreshold)+'时第二天下跌的概率':larger_than_positive_threshold_nextday_down_prob,
                    '输入的数据中乖离率zscore小于'+str(zscorethreshold)+'时第二天上涨的概率':smaller_than_negative_threshold_nextday_up_prob}
        elif return_date == True :
            return {'输入的数据中乖离率zscore大于'+str(zscorethreshold)+'时第二天下跌的概率':larger_than_positive_threshold_nextday_down_prob,
                    '输入的数据中乖离率zscore小于'+str(zscorethreshold)+'时第二天上涨的概率':smaller_than_negative_threshold_nextday_up_prob,
                    '向上超出zscore阈值'+str(zscorethreshold)+'的时间点':list(larger_than_positive_threshold.index),
                    '向下超出zscore阈值'+str(zscorethreshold)+'的时间点':list(smaller_than_negative_threshold.index)}
            
#%%
def Boll_next_day_prediction(df, time_period=26, num_of_std=2, MAtype=0, return_date = False): #MAtype来源于talib中的MA_Type
    '''
    函数说明:
    向上或者向下击穿布林线，之后反弹概率
    
    参数说明:
    time_period:
    计算布林线的时间窗口长度
    num_of_std:
    计算布林线偏离均线的标准差数量
    MAtype:
    计算布林线中MA的类型，参见talib.MA_Type
    return_date:
    等于True时返回向上击穿布林线上端和向下击穿布林线下端的时间点
    '''
    upperband, middleband, lowerband = talib.BBANDS(df.close, timeperiod=time_period, nbdevup=num_of_std, nbdevdn=num_of_std, matype=MAtype)
    df['close_diff'] = df['close'].diff().shift(-1)
    df['upperband'] = upperband
    df['middleband'] = middleband
    df['lowerband'] = lowerband
    
    upper_hit = df[df['close'] > df['upperband']]
    lower_hit = df[df['close'] < df['lowerband']]
    
    return_after_upper_hit = upper_hit[upper_hit['close_diff'] < 0]
    return_after_lower_hit = lower_hit[lower_hit['close_diff'] > 0]
    
    if (df.index[-1] not in upper_hit.index) & (df.index[-1] not in lower_hit.index):
        print(str(df.index[-1].date()) + '并没有发生击穿布林线的现象')
    elif (df.index[-1] in upper_hit.index):
        print(str(df.index[-1].date()) + '布林线上穿')
    elif (df.index[-1] in lower_hit.index):
        print(str(df.index[-1].date()) + '布林线下穿')

    try:
        return_after_upper_hit_prob = return_after_upper_hit.shape[0]/upper_hit.shape[0]
        return_after_lower_hit_prob = return_after_lower_hit.shape[0]/lower_hit.shape[0]
    except:
        print('没有出现布林线被击穿的现象')
        df.drop(columns = ['close_diff', 'upperband', 'middleband', 'lowerband'], inplace = True)
    else:
        df.drop(columns = ['close_diff', 'upperband', 'middleband', 'lowerband'], inplace = True)
        if return_date == True:
            return {'输入的数据中发生向上击穿布林线后反弹的概率':return_after_upper_hit_prob,
                    '输入的数据中发生向下击穿布林线后反弹的概率':return_after_lower_hit_prob, 
                    '输入的数据中发生向上击穿的时间点':list(upper_hit.index),
                    '输入的数据中发生向下击穿的时间点':list(lower_hit.index)}
        elif return_date == False:
            return {'输入的数据中发生向上击穿布林线后反弹的概率':return_after_upper_hit_prob,
                    '输入的数据中发生向下击穿布林线后反弹的概率':return_after_lower_hit_prob}

#%%
def extreme_value(df, threshold): # threshold 在此应该定义为一个百分数,代表我们认为的涨跌幅度阈值
    '''
    函数说明:
    当大盘(或者其他股票)日涨幅/跌幅绝对值超过 threshold%，下一天上涨或者下跌的概率/历史上的次数有多少
    
    参数说明:
    threshold:日涨幅/跌幅绝对值超过 threshold%,为一个阈值
    '''
    
    df['close_pct_ch'] = df['close'].pct_change().shift(-1)
    df['close_diff'] = df['close'].diff().shift(-2)
    df_up = df[df['close_diff'] > 0]
    df_down = df[df['close_diff'] < 0]
    
    up = df_up[df_up['close_pct_ch'] > threshold]
    down = df_down[df_down['close_pct_ch'] < -threshold]
    
    try:
        prob_up = np.shape(up)[0]/np.shape(df_up)[0]
        prob_down = np.shape(down)[0]/np.shape(df_down)[0]
    except:
        print('检查数据，里面不存在使得价格变动超过阈值' + str(threshold) + '的情况发生')
        df.drop(columns = ['close_pct_ch', 'close_diff'], inplace = True)
    else:
        
        if df['close_diff'].iloc[-3] > 0:
            result = prob_up # 当天价格上涨，利用极端值因子估计法估计出第二天继续上涨的概率
            print('当天价格上涨,该函数计算的是第二天价格继续上涨的概率')
        else:
            result = prob_down # 当天价格下跌，利用极端值因子估计法估计出第二天继续下跌的概率
            print('当天价格下跌,该函数计算的事第二天价格继续下跌的概率')
        df.drop(columns = ['close_pct_ch', 'close_diff'], inplace = True)
        return result

#%%
def simple_consecutive_predict_up_or_down(df, N, predict_latest = False): # N代表我们认为的连续多少天持续的涨或者跌
    '''
    函数说明:
    连续涨/跌N（3、4、5、6）天之后，下一天涨/跌的概率
    
    参数说明:
    N:
    连续涨/跌的天数
    predict_latest:
    根据导入的数据中最近的时间点来确定N的取值
    '''
    df.reset_index(inplace = True)
    df['close_diff'] = df['close'].diff()
    df['signal'] = 0
    for i in df.index:
        if df['close_diff'][i] < 0:
            df['signal'][i] = -1
        elif df['close_diff'][i] > 0:
            df['signal'][i] = 1
    consecutive_days = N
    
    if predict_latest == True:
        print('此函数最终输出的结果由输入的数据末尾连续多少天价格上涨或者下跌多决定,和输入的参数N值没有关系\n')
        latest_consecutive_days = 0
        
        if df['close_diff'].iloc[-1] < 0:
            for i in df.index[::-1]:
                if df['close_diff'].iloc[i] < 0 :
                    latest_consecutive_days += 1
                elif df['close_diff'].iloc[i] > 0:
                    break
            print('输入的数据中连续' + str(latest_consecutive_days) + '下跌\n')
                
                
        elif df['close_diff'].iloc[-1] > 0:
            for i in df.index[::-1]:
                if df['close_diff'].iloc[i] > 0:
                    latest_consecutive_days += 1
                elif df['close_diff'].iloc[i] < 0:
                    break
            print('输入的数据中连续' + str(latest_consecutive_days) + '上涨\n')
        N = latest_consecutive_days
#        print('输入的数据中连续' + str(N) + '天上涨或者下跌')
        
    elif predict_latest == False:
        print('此函数最终输出的结果由输入的N值决定，最终计算输入的数据中存在多少次的连续N天上涨或者下跌后，继续上涨或者下跌的概率')
        N = consecutive_days
        
    df['signal_sum'] = df['signal'].rolling(N).sum()
    df_up_Ndays = df[df['signal_sum'] == N]
    next_day_is_up = 0
    
    df_down_Ndays = df[df['signal_sum'] == -N]
    next_day_is_down = 0
    
    for i in df_up_Ndays.index:
        if i!=df.index[-1]:
            if df['close_diff'][i+1] > 0:
                next_day_is_up += 1
    
    for i in df_down_Ndays.index:
        if i!=df.index[-1]:
            if df['close_diff'][i+1] < 0:
                next_day_is_down += 1
    df.drop(columns = ['close_diff', 'signal', 'signal_sum'], inplace = True)
    df.set_index('date', inplace = True)
    
    try:
        prob_up = next_day_is_up/np.shape(df_up_Ndays)[0]
        prob_down = next_day_is_down/np.shape(df_down_Ndays)[0]
    except:
        print('输入的数据中没有出现连续上涨或者下跌的情况')
    else:
        return {'连续'+ str(N) +'天上涨后第二天上涨的概率': prob_up, '连续'+ str(N) +'天下跌后第二天下跌的概率':prob_down}

#%%


In [14]:
def long_tail_cross_star_next_day_prediction(df, body_ratio, tail_ratio, return_date = False, print_latest = False):
    """
    函数说明：
    计算昨天下跌，今天长尾十字星(长尾在下) 或者昨天上涨，今天长尾十字星(长尾在上)，明天上涨或下跌的概率
    body_ratio反应的是十字星open和close的大小差异， tail_ratio 指定的是十字星尾巴和头部大小差异

    参数说明:
    body_ratio 的算法:
    昨日下跌，今日的十字星body_ratio = (open-close)/(high-low), 昨日上涨，今日的十字星body_ratio = (close-open)/(high-low)
    tail_ratio 的算法:
    昨日下跌，今日的十字星tail_ratio = (close-low)/(high-open)，昨日下跌，今日的十字星tail_ratio = (high-close)/(open-low)
    return_date:
    return_date = True时返回出现长尾十字星的时间点，return_date = False时不返回时间点
    """

    df['close_diff_shift(1)'] = df['close'].diff().shift(1) #  用来判断昨天是否下跌
    df['close_diff_shift(-1)'] = df['close'].diff().shift(-1) # 用来判断明天是否上涨
     
    down_one_day_before = df[df['close_diff_shift(1)'] < 0] # 前天下跌的数据
    up_one_day_before = df[df['close_diff_shift(1)'] > 0] # 前天上涨的数据

    down_one_day_before_longtailcross = down_one_day_before[ ((down_one_day_before['open'] - down_one_day_before['close'])/(down_one_day_before['high'] - down_one_day_before['low']) < body_ratio) &
                                                             ( (down_one_day_before['close'] - down_one_day_before['low'])/(down_one_day_before['high']-down_one_day_before['open']) > tail_ratio)]
     
    up_one_day_before_longtailcross = up_one_day_before[((up_one_day_before['close'] - up_one_day_before['open'])/(up_one_day_before['high'] - up_one_day_before['low']) < body_ratio) & 
                                                         ((up_one_day_before['high'] - up_one_day_before['close'])/(up_one_day_before['open'] - up_one_day_before['low']) > tail_ratio)]
     
    down_one_day_before_longtailcross_nextday_up = down_one_day_before_longtailcross[down_one_day_before_longtailcross['close_diff_shift(-1)'] > 0]
    up_one_day_before_longtailcross_nextday_down = up_one_day_before_longtailcross[up_one_day_before_longtailcross['close_diff_shift(-1)'] < 0]
     
     
    if print_latest == True:
        if df.index[-1] in down_one_day_before_longtailcross.index:
            print('输入的数据中最近的时间点'+ str(df.index[-1].date()) + '发生了昨天价格下跌，今天长尾十字星(长尾在下)的现象\n')
        elif df.index[-1] in up_one_day_before_longtailcross.index:
            print('输入的数据中最近的时间点'+ str(df.index[-1].date()) + '发生了昨天价格上升，今天长尾十字星(长尾在上)的现象\n')
        else:
            print('输入的数据中最近的时间点'+ str(df.index[-1].date()) + '并没有发生昨天价格下跌，今天长尾十字星(长尾在下),或者昨天价格上升，今天长尾十字星(长尾在上)的现象\n')
     
     
    try:
        down_one_day_before_longtailcross_nextday_up_prob = down_one_day_before_longtailcross_nextday_up.shape[0]/down_one_day_before_longtailcross.shape[0]
        up_one_day_before_longtailcross_nextday_down_prob = up_one_day_before_longtailcross_nextday_down.shape[0]/up_one_day_before_longtailcross.shape[0]
         
    except:
        print('导入的数据中没有出现昨天下跌，今天出现长尾十字星，明天上涨的情况，或者没有出现昨天上涨，今天出现长尾十字星，明天下跌的情况')
        df.drop(columns = ['close_diff_shift(1)', 'close_diff_shift(-1)'], inplace = True)
         
    else:
        df.drop(columns = ['close_diff_shift(1)', 'close_diff_shift(-1)'], inplace = True)
        if return_date == False:
            return {'导入的数据中昨天下跌，今天出现长尾十字星，明天上涨的概率是':down_one_day_before_longtailcross_nextday_up_prob,
                     '导入的数据中昨天上涨，今天出现长尾十字星，明天下跌的概率是':up_one_day_before_longtailcross_nextday_down_prob}
             
        elif return_date == True:
            return {'导入的数据中昨天下跌，今天出现长尾十字星，明天上涨的概率是':down_one_day_before_longtailcross_nextday_up_prob,
                     '导入的数据中昨天上涨，今天出现长尾十字星，明天下跌的概率是':up_one_day_before_longtailcross_nextday_down_prob,
                     '导入的数据中昨天下跌，今天出现长尾十字星的时间点是':list(down_one_day_before_longtailcross.index),
                     '导入的数据中昨天上涨，今天出现长尾十字星的时间点是':list(up_one_day_before_longtailcross.index)}

# 网易爬数据

In [3]:
def get_page(url):  #获取页面数据
    req=urllib.request.Request(url,headers={
        'Connection': 'Keep-Alive',
        'Accept': 'text/html, application/xhtml+xml, */*',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'
    })
    opener=urllib.request.urlopen(req)
    page=opener.read()
    return page

def get_index_history_byNetease(index_temp):
    """
    :param index_temp: for example, 'sh000001' 上证指数
    :return:
    """
    index_type=index_temp[0:2]
    index_id=index_temp[2:]
    if index_type=='sh':
        index_id='0'+index_id
    if index_type=="sz":
        index_id='1'+index_id
    url='http://quotes.money.163.com/service/chddata.html?code=%s&start=19900101&end=%s&fields=TCLOSE;HIGH;LOW;TOPEN;LCLOSE;CHG;PCHG;VOTURNOVER;VATURNOVER'%(index_id,time.strftime("%Y%m%d"))

    page=get_page(url).decode('gb2312') #该段获取原始数据
    page=page.split('\r\n')
    col_info=page[0].split(',')   #各列的含义
    index_data=page[1:]     #真正的数据

    #为了与现有的数据库对应，这里我还修改了列名，大家不改也没关系
    col_info[col_info.index('日期')]='交易日期'   #该段更改列名称
    col_info[col_info.index('股票代码')]='指数代码'
    col_info[col_info.index('名称')]='指数名称'
    col_info[col_info.index('成交金额')]='成交额'

    index_data=[x.replace("'",'') for x in index_data]  #去掉指数编号前的“'”
    index_data=[x.split(',') for x in index_data]

    index_data=index_data[0:index_data.__len__()-1]   #最后一行为空，需要去掉
    pos1=col_info.index('涨跌幅')
    pos2=col_info.index('涨跌额')
    posclose=col_info.index('收盘价')
    index_data[index_data.__len__()-1][pos1]=0      #最下面行涨跌额和涨跌幅为None改为0
    index_data[index_data.__len__()-1][pos2]=0
    for i in range(0,index_data.__len__()-1):       #这两列中有些值莫名其妙为None 现在补全
        if index_data[i][pos2]=='None':
            index_data[i][pos2]=float(index_data[i][posclose])-float(index_data[i+1][posclose])
        if index_data[i][pos1]=='None':
            index_data[i][pos1]=(float(index_data[i][posclose])-float(index_data[i+1][posclose]))/float(index_data[i+1][posclose])

    # print(col_info)
    return [index_data,col_info]
# --------------------- 
# 版权声明：本文为CSDN博主「multiangle」的原创文章，遵循CC 4.0 by-sa版权协议，转载请附上原文出处链接及本声明。
# 原文链接：https://blog.csdn.net/u014595019/article/details/48445223

In [4]:
sh = get_index_history_byNetease("sh000001") 
df_ne = pd.DataFrame()
df_ne['columns_to_split'] = pd.DataFrame(sh).T[0]
col_list = list(pd.DataFrame(sh).T[1])[:12]
df_ne['columns_to_split'] = df_ne['columns_to_split'].apply(lambda x: str(x))
df_ne['columns_to_split'] = df_ne['columns_to_split'].apply(lambda x: x.replace("[", ""))
df_ne['columns_to_split'] = df_ne['columns_to_split'].apply(lambda x: x.replace("]", ""))

df_ne = df_ne['columns_to_split'].str.split(",", 12, expand = True)
df_ne.columns = col_list
df_ne.sort_values("交易日期", inplace = True)
df_ne.tail()

Unnamed: 0,交易日期,指数代码,指数名称,收盘价,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,成交量,成交额
4,'2019-08-13','000001','上证指数','2797.2596','2802.0415','2790.608','2798.0512','2814.9944','-17.7348','-0.63','130916505','1.53957510634e+11'
3,'2019-08-14','000001','上证指数','2808.9146','2829.8906','2807.719','2824.4864','2797.2596','11.655','0.4167','144014641','1.7394488328e+11'
2,'2019-08-15','000001','上证指数','2815.7976','2815.8964','2756.8341','2762.339','2808.9146','6.883','0.245','152772152','1.73670539178e+11'
1,'2019-08-16','000001','上证指数','2823.8238','2840.3164','2811.8016','2817.5708','2815.7976','8.0262','0.285','148582344','1.86139370282e+11'
0,'2019-08-19','000001','上证指数','2883.096','2883.096','2829.8542','2835.5181','2823.8238','59.2722','2.099','214546668','2.47092216349e+11'


In [5]:
df_ne.columns = ['交易日期', '指数代码', '指数名称', 'close', '最高价', '最低价', '开盘价', '前收盘', '涨跌额', '涨跌幅',
       '成交量', '成交额']
df_ne['close'] = df_ne['close'].apply(lambda x: x[2:-1])
df_ne['close'] = df_ne['close'].apply(lambda x: float(x))

In [6]:
df_ne

Unnamed: 0,交易日期,指数代码,指数名称,close,最高价,最低价,开盘价,前收盘,涨跌额,涨跌幅,成交量,成交额
7009,'1990-12-19','000001','上证指数',99.9800,'99.98','95.79','96.05','None',0,0,'1260','494000.0'
7008,'1990-12-20','000001','上证指数',104.3900,'104.39','99.98','104.3','99.98','4.41','4.4109','197','84000.0'
7007,'1990-12-21','000001','上证指数',109.1300,'109.13','103.73','109.07','104.39','4.74','4.5407','28','16000.0'
7006,'1990-12-24','000001','上证指数',114.5500,'114.55','109.13','113.57','109.13','5.42','4.9666','32','31000.0'
7005,'1990-12-25','000001','上证指数',120.2500,'120.25','114.55','120.09','114.55','5.7','4.976','15','6000.0'
7004,'1990-12-26','000001','上证指数',125.2700,'125.27','120.25','125.27','120.25','5.02','4.1746','100','53000.0'
7003,'1990-12-27','000001','上证指数',125.2800,'125.28','125.27','125.27','125.27','0.01','0.008','66','104000.0'
7002,'1990-12-28','000001','上证指数',126.4500,'126.45','125.28','126.39','125.28','1.17','0.9339','108','88000.0'
7001,'1990-12-31','000001','上证指数',127.6100,'127.61','126.48','126.56','126.45','1.16','0.9174','78','60000.0'
7000,'1991-01-02','000001','上证指数',128.8400,'128.84','127.61','127.61','127.61','1.23','0.9639','91','59000.0'


In [8]:
extreme_value(df_ne, 0.021)

当天价格上涨,该函数计算的是第二天价格继续上涨的概率


0.09420682187330806