In [1]:
import pickle
import pandas as pd

In [7]:
def assign_portfolio_by_size(df, senti_type):
    '''
    Label every stock-month 'S' or 'B' by cumulative market size.

    For each month from July 2006 through December 2018, the rows with
    ``B >= 0`` are sorted ascending by ``senti_type``. A row is labelled 'S'
    while the market size accumulated by the rows *before* it is below 50%
    of that month's total market size, and 'B' from then on.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'year', 'month', 'B', 'market size' and
        ``senti_type``. The caller's frame is not modified.
    senti_type : str
        Name of the column to sort by. The label is written to the column
        ``senti_type + ' mark_in'``.

    Returns
    -------
    pandas.DataFrame
        The labelled rows (month by month, in sorted order) followed by the
        rows whose ``senti_type`` is null, which keep the placeholder mark 0.
        Rows with ``B < 0``, rows outside the date window and rows with a
        missing value in any other column are not part of the result.
    '''
    df = df.copy()
    senti_type_mark = senti_type + ' mark_in'
    df[senti_type_mark] = 0
    df_na = df[df[senti_type].isnull()]
    # NOTE(review): dropna() without `subset` removes rows that have a NaN in
    # ANY column, not only in `senti_type`; only the rows with a null
    # `senti_type` are re-attached through df_na. Confirm this is intended.
    df = df.dropna()

    labelled_frames = []
    for year in range(2006, 2019):
        start_month = 7 if year == 2006 else 1
        year_df = df[df['year'] == year]
        for month in range(start_month, 13):
            current_df = year_df[year_df['month'] == month]
            current_df = current_df[current_df['B'] >= 0]
            if current_df.empty:
                continue
            current_df = current_df.sort_values(senti_type)
            sizes = current_df['market size']
            cutoff = sizes.sum() * 0.5
            # Market size accumulated strictly before each row (0 for the first).
            size_before = sizes.cumsum().shift(1, fill_value=0)
            labels = ['S' if before < cutoff else 'B' for before in size_before]
            labelled_frames.append(current_df.assign(**{senti_type_mark: labels}))

    # Concatenate once, after all months are processed.
    return pd.concat(labelled_frames + [df_na])

def assign_portfolio_by_btm(df, senti_type):
    '''
    Label every stock-month 'L', 'M' or 'H' by cumulative market size.

    For each month from July 2006 through December 2018, the rows with
    ``B >= 0`` are sorted ascending by ``senti_type``. Walking the sorted
    rows, a row is labelled 'L' while the market size accumulated by the rows
    *before* it is below 30% of that month's total market size, 'M' while it
    is below 70%, and 'H' afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'year', 'month', 'B', 'market size' and
        ``senti_type``. The caller's frame is not modified.
    senti_type : str
        Name of the column to sort by (the notebook passes 'btm' and
        'vader p_neg_sentence'). The label is written to the column
        ``senti_type + ' mark_in'``.

    Returns
    -------
    pandas.DataFrame
        The labelled rows (month by month, in sorted order) followed by the
        rows whose ``senti_type`` is null, which keep the placeholder mark 0.
        Rows with ``B < 0``, rows outside the date window and rows with a
        missing value in any other column are not part of the result.
    '''
    df = df.copy()
    senti_type_mark = senti_type + ' mark_in'
    df[senti_type_mark] = 0
    df_na = df[df[senti_type].isnull()]
    # NOTE(review): dropna() without `subset` removes rows that have a NaN in
    # ANY column, not only in `senti_type`; only the rows with a null
    # `senti_type` are re-attached through df_na. Confirm this is intended.
    df = df.dropna()

    labelled_frames = []
    for year in range(2006, 2019):
        start_month = 7 if year == 2006 else 1
        year_df = df[df['year'] == year]
        for month in range(start_month, 13):
            current_df = year_df[year_df['month'] == month]
            current_df = current_df[current_df['B'] >= 0]
            if current_df.empty:
                continue
            current_df = current_df.sort_values(senti_type)
            sizes = current_df['market size']
            total_size = sizes.sum()
            low_cutoff = total_size * 0.3
            high_cutoff = total_size * 0.7
            # Market size accumulated strictly before each row (0 for the first).
            size_before = sizes.cumsum().shift(1, fill_value=0)
            labels = [
                'L' if before < low_cutoff else 'M' if before < high_cutoff else 'H'
                for before in size_before
            ]
            labelled_frames.append(current_df.assign(**{senti_type_mark: labels}))

    # Concatenate once, after all months are processed.
    return pd.concat(labelled_frames + [df_na])

def build_btm_size_dict(df):
    '''
    Build a nested ``{year: {month: {name: ratio}}}`` dictionary of returns.

    Years run from 2006 to 2018; months run from July to December in 2006 and
    from January to December otherwise. For every month the entry holds

    * 'Rm': sum of 'market*R' divided by sum of 'market size' over all rows
      of that month, and
    * one value per mark in 'SL', 'SM', 'SH', 'BL', 'BM', 'BH': the same
      ratio restricted to the rows whose 'mark_in' equals that mark.
    '''
    def _size_weighted_ratio(rows):
        # sum(market*R) / sum(market size) for the given slice of rows.
        return rows['market*R'].sum() / rows['market size'].sum()

    portfolio_marks = ('SL', 'SM', 'SH', 'BL', 'BM', 'BH')
    returns_by_year = {}
    for year in range(2006, 2019):
        first_month = 7 if year == 2006 else 1
        rows_of_year = df[df['year'] == year]
        returns_by_month = {}
        for month in range(first_month, 13):
            rows_of_month = rows_of_year[rows_of_year['month'] == month]
            entry = {'Rm': _size_weighted_ratio(rows_of_month)}
            for mark in portfolio_marks:
                in_portfolio = rows_of_month[rows_of_month['mark_in'] == mark]
                entry[mark] = _size_weighted_ratio(in_portfolio)
            returns_by_month[month] = entry
        returns_by_year[year] = returns_by_month
    return returns_by_year

def build_Rm_dict(port_dict):
    '''
    Extract the 'Rm' entry of every month into ``{year: {month: Rm}}``.

    ``port_dict`` is indexed as ``port_dict[year][month]['Rm']`` for the years
    2006-2018; 2006 is read from July onwards, every other year from January.
    '''
    return {
        year: {
            month: port_dict[year][month]['Rm']
            for month in range(7 if year == 2006 else 1, 13)
        }
        for year in range(2006, 2019)
    }

def build_dicts(port_dict):
    '''
    Derive three ``{year: {month: value}}`` dictionaries from ``port_dict``.

    For every month of 2006 (July onwards) to 2018 the function reads
    ``port_dict[year][month]`` and returns, in this order:

    * the 'Rm' value,
    * ``(BH + SH - BL - SL) / 2``  (high minus low), and
    * ``(SH + SM + SL - BH - BM - BL) / 3``  (small minus big).
    '''
    rm_by_year, hml_by_year, smb_by_year = {}, {}, {}
    for year in range(2006, 2019):
        first_month = 7 if year == 2006 else 1
        rm_by_year[year], hml_by_year[year], smb_by_year[year] = {}, {}, {}
        for month in range(first_month, 13):
            port = port_dict[year][month]
            rm_by_year[year][month] = port['Rm']
            high_minus_low = port['BH'] + port['SH'] - port['BL'] - port['SL']
            hml_by_year[year][month] = high_minus_low / 2
            small_minus_big = (port['SH'] + port['SM'] + port['SL']
                               - port['BH'] - port['BM'] - port['BL'])
            smb_by_year[year][month] = small_minus_big / 3
    return rm_by_year, hml_by_year, smb_by_year

In [4]:
# Load the prepared panel from a pickle stored under an absolute local path.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle('D:/Thesis_data/df_final200107.pickle')
# Label each row 'S'/'B' (rows sorted by 'market size') and 'L'/'M'/'H'
# (rows sorted by 'btm'); each call returns a new frame that replaces df.
df = assign_portfolio_by_size(df, 'market size')
df = assign_portfolio_by_btm(df, 'btm')
# Join the two labels into the two-letter code ('SL' ... 'BH') that
# build_btm_size_dict filters on.
df['mark_in'] = df['market size mark_in'] + df['btm mark_in']
portfolio_R_dict = build_btm_size_dict(df)

In [17]:
# Label the rows 'L'/'M'/'H' again, this time sorted by 'vader p_neg_sentence'.
# NOTE(review): this overwrites df, 'mark_in' and portfolio_R_dict from the
# previous cell, so re-running cells out of order changes what they hold.
df = assign_portfolio_by_btm(df, 'vader p_neg_sentence')
# 'mark_in' now combines the size label with the 'vader p_neg_sentence' label.
df['mark_in'] = df['market size mark_in'] + df['vader p_neg_sentence mark_in']
portfolio_R_dict = build_btm_size_dict(df)

In [18]:
# build_dicts returns (Rm, high-minus-low, small-minus-big); only the middle
# dictionary is kept. The unused results get ordinary names instead of `_` and
# `__`, which IPython uses for its output history.
_unused_Rm_dict, neg_PMN_dict, _unused_SMB_dict = build_dicts(portfolio_R_dict)

In [19]:
# Persist neg_PMN_dict as a pickle file; the target is an absolute local path.
with open('D:/Thesis_data/insample_factors/neg_tone_dict0109.pickle', 'wb') as f:
    pickle.dump(neg_PMN_dict, f)

In [12]:
# Show the column labels of df.
# NOTE(review): this cell's execution count (12) is lower than that of the
# cells above (17-19), so the saved output may predate their changes to df.
df.columns

Index(['Ticker', 'market size', 'R', 'price', 'year', 'month', 'B',
       'size mark', 'book mark', 'btm', 'book market mark', 'vader', 'li',
       'vader mark', 'li mark', 'market*R', 'vader avg', 'short',
       'vader p_neg', 'vader p_pos', 'vader n_words', 'vader p_neg mark',
       'vader p_pos mark', 'vader avg mark', 'vader n_words mark',
       'market size mark', 'btm mark', 'vader p_neg_sentence',
       'vader p_pos_sentence', 'vader p_neg_sentence mark',
       'vader p_pos_sentence mark', 'not nu', 'not nu mark', 'avg size mark',
       'p_pos size mark', 'p_pos_sentence size mark', 'p_neg size mark',
       'p_neg_sentence size mark', 'vader neg avg', 'vader pos avg',
       'vader pos avg mark', 'vader neg avg mark', 'market size mark_in',
       'btm mark_in', 'mark_in', 'vader mark_in'],
      dtype='object')