In [1]:
import os
import re
import time
import pickle
from collections import Counter
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
####################################
###            VADER            ####              
####################################
def clean_mda(mda):
    mda = re.sub(r'[(),&/:; ]+',' ', mda)
    return re.sub(r'[^A-Za-z!?. ]+', '', mda)
    return mda

def vader_score(mda):
    senti_dict = {}
    mda = clean_mda(mda)
    tic = time.time()
    result_list = []
    n_pos = 0
    n_neg = 0
    n_neu = 0
    n_total_words = 0
    for sentence in mda.split('.'):
        n_words = len(sentence.split())
        if n_words < 3:
            continue
        n_total_words = n_total_words + n_words
        sentence_score = sia.polarity_scores(sentence)
        if sentence_score['compound'] > 0.05:
            n_pos = n_pos+1
        elif sentence_score['compound'] < -0.05:
            n_neg = n_neg+1
        else:
            n_neu = n_neu+1
        result_list.append((sentence_score, n_words))

    p_pos = 0
    p_neg = 0
    p_neu = 0
    senti_avg = 0
    n_sentence = 0
    for result in result_list:
        p_pos = p_pos + (result[0]['pos']*result[1]/n_total_words)
        p_neg = p_neg + (result[0]['neg']*result[1]/n_total_words)
        p_neu = p_neu + (result[0]['neu']*result[1]/n_total_words)
        senti_avg = senti_avg + result[0]['compound']
        n_sentence = n_sentence + 1
    if n_sentence != 0:
        senti_avg=senti_avg/n_sentence
    else:
        senti_avg=np.nan
        
    senti_dict['avg'] = senti_avg
    senti_dict['p_pos'] = p_pos
    senti_dict['p_neg'] = p_neg
    senti_dict['p_neu'] = p_neu
    senti_dict['n_pos'] = n_pos
    senti_dict['n_neg'] = n_neg
    senti_dict['n_neu'] = n_neu
    senti_dict['n_sentence'] = n_sentence
    senti_dict['n_words'] = n_total_words
    senti_dict['full sentence scores'] = result_list
    '''
    toc = time.time()
    print('Time used: {}'.format(toc-tic))
    print('Amount of words: {}'.format(n_total_words))
    print('Proportion:')
    print('    pos: {}\n    neg: {}\n    neu: {}'.format(p_pos,p_neg,p_neu))
    print('Number of ___ sentence:')
    print('    pos: {}\n    neg: {}\n    neu: {}'.format(n_pos,n_neg,n_neu))
    '''
    return senti_dict

In [7]:
path = 'D:/Thesis_data/sec_edgar_filings/'
tic = time.time()

n_company = len(os.listdir(path))
n=0
print('Amount of company: {}'.format(n_company))
for code in os.listdir(path):
    n = n+1
    if n%10 == 0:
        print('n = {}, {:.2f}%, Time used: {:.2f} sec.'.format(n, n/n_company*100, time.time()-tic))
    com_path = path + code
    if os.path.isdir(com_path + '/10-K/'):
        senti_dict = {}
        df = pd.DataFrame(pd.read_pickle(com_path + '/processed_final.pickle'))
        for _, row in df.iterrows():
            if type(row['CONFORMED DATE']) is float:
                continue
            year = int(row['CONFORMED DATE'][:4]) - 1
            senti_dict[year] = vader_score(row['mda'])
        with open('D:/Thesis_data/vader_score_full/{}.pickle'.format(code), 'wb') as f:
            pickle.dump(senti_dict, f)
print('Done.\n', 'Total time used: {:.2f} sec.'.format(time.time()-tic))

Amount of company: 5549
n = 10, 0.18%, Time used: 42.92 sec.
n = 20, 0.36%, Time used: 141.96 sec.
n = 30, 0.54%, Time used: 170.04 sec.
n = 40, 0.72%, Time used: 205.75 sec.
n = 50, 0.90%, Time used: 277.33 sec.
n = 60, 1.08%, Time used: 340.05 sec.
n = 70, 1.26%, Time used: 377.97 sec.
n = 80, 1.44%, Time used: 413.93 sec.
n = 90, 1.62%, Time used: 447.70 sec.
n = 100, 1.80%, Time used: 475.11 sec.
n = 110, 1.98%, Time used: 505.53 sec.
n = 120, 2.16%, Time used: 540.03 sec.
n = 130, 2.34%, Time used: 587.18 sec.
n = 140, 2.52%, Time used: 613.74 sec.
n = 150, 2.70%, Time used: 660.98 sec.
n = 160, 2.88%, Time used: 697.75 sec.
n = 170, 3.06%, Time used: 749.72 sec.
n = 180, 3.24%, Time used: 806.73 sec.
n = 190, 3.42%, Time used: 826.47 sec.
n = 200, 3.60%, Time used: 907.45 sec.
n = 210, 3.78%, Time used: 942.85 sec.
n = 220, 3.96%, Time used: 979.03 sec.
n = 230, 4.14%, Time used: 992.79 sec.
n = 240, 4.33%, Time used: 1017.61 sec.
n = 250, 4.51%, Time used: 1122.10 sec.
n = 260, 

n = 2000, 36.04%, Time used: 7073.01 sec.
n = 2010, 36.22%, Time used: 7165.75 sec.
n = 2020, 36.40%, Time used: 7187.86 sec.
n = 2030, 36.58%, Time used: 7220.23 sec.
n = 2040, 36.76%, Time used: 7244.31 sec.
n = 2050, 36.94%, Time used: 7272.61 sec.
n = 2060, 37.12%, Time used: 7334.88 sec.
n = 2070, 37.30%, Time used: 7357.60 sec.
n = 2080, 37.48%, Time used: 7383.43 sec.
n = 2090, 37.66%, Time used: 7414.08 sec.
n = 2100, 37.84%, Time used: 7421.80 sec.
n = 2110, 38.02%, Time used: 7470.07 sec.
n = 2120, 38.21%, Time used: 7489.63 sec.
n = 2130, 38.39%, Time used: 7501.28 sec.
n = 2140, 38.57%, Time used: 7535.18 sec.
n = 2150, 38.75%, Time used: 7645.99 sec.
n = 2160, 38.93%, Time used: 7681.38 sec.
n = 2170, 39.11%, Time used: 7702.05 sec.
n = 2180, 39.29%, Time used: 7723.69 sec.
n = 2190, 39.47%, Time used: 7742.37 sec.
n = 2200, 39.65%, Time used: 7776.26 sec.
n = 2210, 39.83%, Time used: 7792.92 sec.
n = 2220, 40.01%, Time used: 7803.65 sec.
n = 2230, 40.19%, Time used: 7814.

n = 3940, 71.00%, Time used: 13541.68 sec.
n = 3950, 71.18%, Time used: 13564.10 sec.
n = 3960, 71.36%, Time used: 13602.52 sec.
n = 3970, 71.54%, Time used: 13630.39 sec.
n = 3980, 71.72%, Time used: 13677.09 sec.
n = 3990, 71.90%, Time used: 13699.62 sec.
n = 4000, 72.09%, Time used: 13716.39 sec.
n = 4010, 72.27%, Time used: 13736.66 sec.
n = 4020, 72.45%, Time used: 13793.44 sec.
n = 4030, 72.63%, Time used: 13867.19 sec.
n = 4040, 72.81%, Time used: 13927.02 sec.
n = 4050, 72.99%, Time used: 13957.30 sec.
n = 4060, 73.17%, Time used: 13973.18 sec.
n = 4070, 73.35%, Time used: 14020.79 sec.
n = 4080, 73.53%, Time used: 14085.54 sec.
n = 4090, 73.71%, Time used: 14116.18 sec.
n = 4100, 73.89%, Time used: 14146.31 sec.
n = 4110, 74.07%, Time used: 14166.44 sec.
n = 4120, 74.25%, Time used: 14225.54 sec.
n = 4130, 74.43%, Time used: 14307.50 sec.
n = 4140, 74.61%, Time used: 14422.18 sec.
n = 4150, 74.79%, Time used: 14440.26 sec.
n = 4160, 74.97%, Time used: 14478.75 sec.
n = 4170, 7

In [35]:
path = 'D:/Thesis_data/sec_edgar_filings/'
tic = time.time()
senti_dict = {}
n_company = len(os.listdir(path))
n=0
print('Amount of company: {}'.format(n_company))
for code in os.listdir(path):
    n = n+1
    if n%10 == 0:
        print('n = {}, {:.2f}%, Time used: {:.2f} sec.'.format(n, n/n_company*100, time.time()-tic))
    com_path = path + code
    if os.path.isdir(com_path + '/10-K/'):
        senti_dict[code] = {}
        df = pd.DataFrame(pd.read_pickle(com_path + '/processed_final.pickle'))
        for _, row in df.iterrows():
            if type(row['CONFORMED DATE']) is float:
                continue
            year = int(row['CONFORMED DATE'][:4]) - 1
            senti_dict[code][year] = li_score(row['mda'])
print('Done.\n', 'Total time used: {:.2f} sec.'.format(time.time()-tic))

Amount of company: 5549
n = 10, 0.18%, Time used: 2.96 sec.
n = 20, 0.36%, Time used: 9.70 sec.
n = 30, 0.54%, Time used: 11.64 sec.
n = 40, 0.72%, Time used: 14.47 sec.
n = 50, 0.90%, Time used: 19.74 sec.
n = 60, 1.08%, Time used: 24.29 sec.
n = 70, 1.26%, Time used: 27.24 sec.
n = 80, 1.44%, Time used: 30.32 sec.
n = 90, 1.62%, Time used: 32.95 sec.
n = 100, 1.80%, Time used: 35.25 sec.
n = 110, 1.98%, Time used: 37.66 sec.
n = 120, 2.16%, Time used: 40.91 sec.
n = 130, 2.34%, Time used: 44.68 sec.
n = 140, 2.52%, Time used: 47.04 sec.
n = 150, 2.70%, Time used: 50.82 sec.
n = 160, 2.88%, Time used: 53.43 sec.
n = 170, 3.06%, Time used: 57.38 sec.
n = 180, 3.24%, Time used: 61.65 sec.
n = 190, 3.42%, Time used: 63.42 sec.
n = 200, 3.60%, Time used: 69.32 sec.
n = 210, 3.78%, Time used: 71.96 sec.
n = 220, 3.96%, Time used: 74.94 sec.
n = 230, 4.14%, Time used: 76.18 sec.
n = 240, 4.33%, Time used: 78.30 sec.
n = 250, 4.51%, Time used: 86.03 sec.
n = 260, 4.69%, Time used: 87.71 sec.

n = 2050, 36.94%, Time used: 600.03 sec.
n = 2060, 37.12%, Time used: 605.48 sec.
n = 2070, 37.30%, Time used: 607.42 sec.
n = 2080, 37.48%, Time used: 609.75 sec.
n = 2090, 37.66%, Time used: 612.53 sec.
n = 2100, 37.84%, Time used: 613.31 sec.
n = 2110, 38.02%, Time used: 617.68 sec.
n = 2120, 38.21%, Time used: 619.52 sec.
n = 2130, 38.39%, Time used: 620.82 sec.
n = 2140, 38.57%, Time used: 623.86 sec.
n = 2150, 38.75%, Time used: 634.10 sec.
n = 2160, 38.93%, Time used: 636.99 sec.
n = 2170, 39.11%, Time used: 638.91 sec.
n = 2180, 39.29%, Time used: 640.94 sec.
n = 2190, 39.47%, Time used: 642.60 sec.
n = 2200, 39.65%, Time used: 645.47 sec.
n = 2210, 39.83%, Time used: 647.09 sec.
n = 2220, 40.01%, Time used: 648.27 sec.
n = 2230, 40.19%, Time used: 649.35 sec.
n = 2240, 40.37%, Time used: 650.22 sec.
n = 2250, 40.55%, Time used: 653.04 sec.
n = 2260, 40.73%, Time used: 655.45 sec.
n = 2270, 40.91%, Time used: 657.46 sec.
n = 2280, 41.09%, Time used: 660.21 sec.
n = 2290, 41.27%

n = 4040, 72.81%, Time used: 1128.34 sec.
n = 4050, 72.99%, Time used: 1130.68 sec.
n = 4060, 73.17%, Time used: 1131.79 sec.
n = 4070, 73.35%, Time used: 1134.84 sec.
n = 4080, 73.53%, Time used: 1138.83 sec.
n = 4090, 73.71%, Time used: 1140.78 sec.
n = 4100, 73.89%, Time used: 1142.69 sec.
n = 4110, 74.07%, Time used: 1144.13 sec.
n = 4120, 74.25%, Time used: 1147.79 sec.
n = 4130, 74.43%, Time used: 1153.29 sec.
n = 4140, 74.61%, Time used: 1160.13 sec.
n = 4150, 74.79%, Time used: 1161.50 sec.
n = 4160, 74.97%, Time used: 1163.99 sec.
n = 4170, 75.15%, Time used: 1168.43 sec.
n = 4180, 75.33%, Time used: 1170.87 sec.
n = 4190, 75.51%, Time used: 1175.26 sec.
n = 4200, 75.69%, Time used: 1180.93 sec.
n = 4210, 75.87%, Time used: 1182.69 sec.
n = 4220, 76.05%, Time used: 1193.05 sec.
n = 4230, 76.23%, Time used: 1194.28 sec.
n = 4240, 76.41%, Time used: 1197.19 sec.
n = 4250, 76.59%, Time used: 1199.62 sec.
n = 4260, 76.77%, Time used: 1201.67 sec.
n = 4270, 76.95%, Time used: 1207.

In [10]:
test_path = 'D:/Thesis_data/TEST2/66740/processed.pickle'
df = pd.DataFrame(pd.read_pickle(test_path))
test_mda = df.loc[0,'mda']
df.head()

Unnamed: 0,CONFORMED DATE,FILE DATE,cik,mda,name,sic,subtype,yearend
0,20041231,20050224,66740,Item 7. Managements Discussion and Analysis of...,3M CO,2670,10-K,1231
1,20051231,20060221,66740,Item 7. Managements Discussion and Analysis of...,3M CO,2670,10-K,1231
2,20061231,20070226,66740,Item 7. Managements Discussion and Analysis of...,3M CO,3841,10-K,1231
3,20071231,20080215,66740,Item 7. Managements Discussion and Analysis of...,3M CO,3841,10-K,1231
4,20081231,20090217,66740,"Item 7, Managements Discussion and Analysis of...",3M CO,3841,10-K,1231


In [24]:
def li_score(mda):
    mda = clean_mda(mda)
    counts = Counter(mda.split())
    return counts['risk'] + counts['risks'] + counts['risky'] + counts['uncertain'] + counts['uncertainty'] + counts['uncertainties']

In [13]:
test_mda = clean_mda(test_mda)

In [28]:
tic = time.time()
df['mda'].map(li_score)
print(time.time()-tic)

0.5377483367919922


In [29]:
tic = time.time()
result_list = []
for _, row in df.iterrows():
    row['li_score'] = li_score(row['mda'])
    result_list.append(row)
pd.DataFrame(result_list)
print(time.time()-tic)

0.5686945915222168


In [38]:
with open('D:/Thesis_data/li_n_words_dict.pickle', 'wb') as f:
    pickle.dump(senti_dict, f)

In [39]:
senti_dict

{'1000045': {2005: 4,
  2006: 4,
  2007: 4,
  2008: 15,
  2009: 12,
  2010: 16,
  2011: 12,
  2012: 15,
  2013: 16,
  2014: 12,
  2015: 8,
  2016: 9,
  2017: 9,
  2018: 15},
 '1000180': {2005: 5,
  2004: 36,
  2006: 6,
  2007: 10,
  2009: 9,
  2010: 9,
  2011: 11,
  2012: 11,
  2013: 13,
  2015: 0},
 '1000209': {2003: 32,
  2004: 20,
  2005: 0,
  2006: 19,
  2007: 17,
  2008: 0,
  2009: 20,
  2010: 18,
  2011: 18,
  2012: 21,
  2013: 22,
  2014: 21,
  2015: 21,
  2016: 18,
  2017: 13},
 '1000228': {2003: 31,
  2004: 19,
  2005: 18,
  2006: 20,
  2007: 21,
  2008: 22,
  2009: 22,
  2010: 21,
  2011: 21,
  2012: 22,
  2013: 21,
  2014: 23,
  2015: 28,
  2017: 31},
 '1000229': {2003: 12,
  2004: 12,
  2005: 15,
  2006: 18,
  2007: 18,
  2008: 20,
  2009: 19,
  2010: 32,
  2011: 32,
  2012: 32,
  2013: 32,
  2014: 32,
  2015: 36,
  2016: 34,
  2017: 34},
 '1000234': {2003: 110, 2004: 144, 2005: 103, 2006: 113, 2007: 199},
 '1000623': {2010: 76,
  2011: 80,
  2012: 96,
  2013: 103,
  2014: 

In [43]:
score_dict = {}
for code, year_dict in senti_dict.items():
    score_dict[code]={}
    for year, senti_score in year_dict.items():
        score_dict[code][year] = np.log(1+senti_score)
        

In [44]:
score_dict

{'1000045': {2005: 1.6094379124341003,
  2006: 1.6094379124341003,
  2007: 1.6094379124341003,
  2008: 2.772588722239781,
  2009: 2.5649493574615367,
  2010: 2.833213344056216,
  2011: 2.5649493574615367,
  2012: 2.772588722239781,
  2013: 2.833213344056216,
  2014: 2.5649493574615367,
  2015: 2.1972245773362196,
  2016: 2.302585092994046,
  2017: 2.302585092994046,
  2018: 2.772588722239781},
 '1000180': {2005: 1.791759469228055,
  2004: 3.6109179126442243,
  2006: 1.9459101490553132,
  2007: 2.3978952727983707,
  2009: 2.302585092994046,
  2010: 2.302585092994046,
  2011: 2.4849066497880004,
  2012: 2.4849066497880004,
  2013: 2.6390573296152584,
  2015: 0.0},
 '1000209': {2003: 3.4965075614664802,
  2004: 3.044522437723423,
  2005: 0.0,
  2006: 2.995732273553991,
  2007: 2.8903717578961645,
  2008: 0.0,
  2009: 3.044522437723423,
  2010: 2.9444389791664403,
  2011: 2.9444389791664403,
  2012: 3.091042453358316,
  2013: 3.1354942159291497,
  2014: 3.091042453358316,
  2015: 3.0910424

In [46]:
with open('D:/Thesis_data/li_score.pickle', 'wb') as f:
    pickle.dump(score_dict, f)

In [3]:
tic = time.time()
score_df_path = 'D:/Thesis_data/vader_score_full/'
senti_dict={}
n_company = len(os.listdir(score_df_path))
n = 0
for code in os.listdir(score_df_path):
    n += 1
    if n%100==0:
        print('n = {}, time used: {}sec.'.format(n, time.time()-tic))
    current_path = score_df_path+'{}'.format(code)
    company_dict = pd.read_pickle(current_path)
    code = code[:-7]
    for year in company_dict.keys():
        current_dict = company_dict[year]
        n_sentence = current_dict['n_sentence']
        current_full_score_list = current_dict.pop('full sentence scores')
        if n_sentence == 0: 
            continue        
        n_pos_sentence = 0
        n_neg_sentence = 0
        for score, _ in current_full_score_list:
            if score['compound'] > 0.05:
                n_pos_sentence += 1
            elif score['compound'] < -0.05:
                n_neg_sentence += 1
        current_dict['p_pos_sentence'] = n_pos_sentence / n_sentence
        current_dict['p_neg_sentence'] = n_neg_sentence / n_sentence
        
    senti_dict[code] = company_dict

n = 100, time used: 5.26952338218689sec.
n = 200, time used: 8.380579471588135sec.
n = 300, time used: 11.2501540184021sec.
n = 400, time used: 14.666954517364502sec.
n = 500, time used: 17.307631731033325sec.
n = 600, time used: 20.174554109573364sec.
n = 700, time used: 22.782769680023193sec.
n = 800, time used: 25.466068029403687sec.
n = 900, time used: 29.902340173721313sec.
n = 1000, time used: 31.980546474456787sec.
n = 1100, time used: 34.54826593399048sec.
n = 1200, time used: 37.50021934509277sec.
n = 1300, time used: 39.936105728149414sec.
n = 1400, time used: 42.677166223526sec.
n = 1500, time used: 45.36375713348389sec.
n = 1600, time used: 47.773616552352905sec.
n = 1700, time used: 50.53926110267639sec.
n = 1800, time used: 53.266326665878296sec.
n = 1900, time used: 55.97518491744995sec.
n = 2000, time used: 58.07228326797485sec.
n = 2100, time used: 60.279900550842285sec.
n = 2200, time used: 62.16568160057068sec.
n = 2300, time used: 63.867822885513306sec.
n = 2400, ti

In [5]:
with open('D:/Thesis_data/vader_score_dict.pickle', 'wb') as f:
    pickle.dump(senti_dict, f)

In [17]:
current_dict[2014]

{'avg': 0.076337037037037,
 'p_pos': 0.08587693752935652,
 'p_neg': 0.04830624706434944,
 'p_neu': 0.8658230781274462,
 'n_pos': 130,
 'n_neg': 67,
 'n_neu': 127,
 'n_sentence': 324,
 'n_words': 6387,
 'full sentence scores': [({'neg': 0.0,
    'neu': 1.0,
    'pos': 0.0,
    'compound': 0.0},
   26),
  ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, 16),
  ({'neg': 0.153, 'neu': 0.847, 'pos': 0.0, 'compound': -0.5423}, 27),
  ({'neg': 0.132, 'neu': 0.819, 'pos': 0.05, 'compound': -0.5108}, 37),
  ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, 38),
  ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, 31),
  ({'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}, 28),
  ({'neg': 0.091, 'neu': 0.909, 'pos': 0.0, 'compound': -0.296}, 24),
  ({'neg': 0.151, 'neu': 0.735, 'pos': 0.113, 'compound': -0.3034}, 29),
  ({'neg': 0.154, 'neu': 0.846, 'pos': 0.0, 'compound': -0.4939}, 25),
  ({'neg': 0.086, 'neu': 0.914, 'pos': 0.0, 'compound': -0.128}, 17),
  ({'neg': 0.0, 'n

In [4]:
senti_dict

{'1000045': {2005: {'avg': 0.11780033333333335,
   'p_pos': 0.10186682663467303,
   'p_neg': 0.051813837232553506,
   'p_neu': 0.8463301339732052,
   'n_pos': 114,
   'n_neg': 53,
   'n_neu': 133,
   'n_sentence': 300,
   'n_words': 5001,
   'p_pos_sentence': 0.38,
   'p_neg_sentence': 0.17666666666666667},
  2006: {'avg': 0.1430891228070175,
   'p_pos': 0.10886173826173823,
   'p_neg': 0.051494105894105906,
   'p_neu': 0.8396193806193807,
   'n_pos': 119,
   'n_neg': 49,
   'n_neu': 117,
   'n_sentence': 285,
   'n_words': 5005,
   'p_pos_sentence': 0.41754385964912283,
   'p_neg_sentence': 0.17192982456140352},
  2007: {'avg': 0.12322992424242418,
   'p_pos': 0.10236204942189976,
   'p_neg': 0.048875765132622984,
   'p_neu': 0.8487705735660851,
   'n_pos': 99,
   'n_neg': 42,
   'n_neu': 123,
   'n_sentence': 264,
   'n_words': 4411,
   'p_pos_sentence': 0.375,
   'p_neg_sentence': 0.1590909090909091},
  2008: {'avg': 0.16015831509846834,
   'p_pos': 0.11379079299079291,
   'p_neg': 

In [6]:
tic = time.time()
score_df_path = 'D:/Thesis_data/vader_score_full/'
senti_dict={}
n_company = len(os.listdir(score_df_path))
n = 0
for code in os.listdir(score_df_path):
    n += 1
    if n%100==0:
        print('n = {}, time used: {}sec.'.format(n, time.time()-tic))
    current_path = score_df_path+'{}'.format(code)
    company_dict = pd.read_pickle(current_path)
    code = code[:-7]
    for year in company_dict.keys():
        current_dict = company_dict[year]
        n_sentence = current_dict['n_sentence']
        current_full_score_list = current_dict.pop('full sentence scores')
        if n_sentence == 0: 
            continue        
        sum_pos = 0
        sum_neg = 0
        n_pos_sentence = 0
        n_neg_sentence = 0
        for score, _ in current_full_score_list:
            if score['compound'] > 0.05:
                n_pos_sentence += 1
                sum_pos += score['compound']
            elif score['compound'] < -0.05:
                n_neg_sentence += 1
                sum_neg += score['compound']
        current_dict['p_pos_sentence'] = n_pos_sentence / n_sentence
        current_dict['p_neg_sentence'] = n_neg_sentence / n_sentence
        if n_pos_sentence == 0:
            current_dict['avg_pos'] = 0
        else:
            current_dict['avg_pos'] = sum_pos / n_pos_sentence
        if n_neg_sentence == 0:
            current_dict['avg_neg'] = 0
        else:
            current_dict['avg_neg'] = sum_neg / n_neg_sentence
        
    senti_dict[code] = company_dict

n = 100, time used: 4.696682453155518sec.
n = 200, time used: 9.71454644203186sec.
n = 300, time used: 14.934211254119873sec.
n = 400, time used: 21.25156855583191sec.
n = 500, time used: 26.488974571228027sec.
n = 600, time used: 31.486905813217163sec.
n = 700, time used: 36.4833242893219sec.
n = 800, time used: 41.50570464134216sec.
n = 900, time used: 48.96477556228638sec.
n = 1000, time used: 53.00117826461792sec.
n = 1100, time used: 58.257466554641724sec.
n = 1200, time used: 64.24027752876282sec.
n = 1300, time used: 69.12930488586426sec.
n = 1400, time used: 74.85631084442139sec.
n = 1500, time used: 80.39062857627869sec.
n = 1600, time used: 85.10663318634033sec.
n = 1700, time used: 90.6379988193512sec.
n = 1800, time used: 96.0866105556488sec.
n = 1900, time used: 101.28082609176636sec.
n = 2000, time used: 105.37534928321838sec.
n = 2100, time used: 110.06254553794861sec.
n = 2200, time used: 113.84807300567627sec.
n = 2300, time used: 117.44666957855225sec.
n = 2400, time 

In [3]:
senti_dict = pd.read_pickle('D:/Thesis_data/vader_score_dict.pickle')

In [8]:
with open('D:/Thesis_data/vader_score_dict_200106.pickle', 'wb') as f:
    pickle.dump(senti_dict, f)