### 1. Environment Setup

In [1]:
# load libraries
#import html_search_keyword
import text_search
import pandas as pd

In [2]:
# Keyword vocabularies, one list per valuation method.
# NOTE(review): casing is mixed ('WACC' vs 'npv'); confirm how text_search
# treats case before relying on either spelling.

## DCF method keywords
dcf = [
    'dcf',
    'discounted cash flow',
    'free cash flow',
    'net present value',
    'npv',
    'cost of capital',
    'cost of equity',
    'WACC',
    'weighted average cost of capital',
    'CAPM',
    'capital asset pricing model',
    'revenue forecast',
    'forecasted revenue',
    'forecasted revenues',
    'expense forecast',
    'forecasted expense',
    'forecasted expenses',
    'capital asset',
    'capital assets',
]

## Comparable method keywords
# generic phrases that point to a comparables / multiples based valuation
comps = [
    'relative valuation',
    'trading multiple',
    'trading multiples',
    'market multiple',
    'market multiples',
    'equity comparables',
    'equity comps',
    'comparable companies',
    'comparable company',
    'peer company',
    'peer companies',
    'peer group',
]

# spelling variants of each valuation ratio, one list per ratio
ev_rev = [
    'EV/Revenue',
    'EV / Revenue',
    'Enterprise Value/Revenue',
    'Enterprise Value / Revenue',
    'ev to revenue',
    'enterprise value to revenue',
]
ev_gp = [
    'EV/Gross Profit',
    'EV / Gross Profit',
    'Enterprise Value/Gross Profit',
    'Enterprise Value / Gross Profit',
    'ev to gross profit',
    'enterprise value to gross profit',
]
ev_ebitda = [
    'EV/EBITDA',
    'EV / EBITDA',
    'Enterprise Value/EBITDA',
    'Enterprise Value / EBITDA',
    'ev to EBITDA',
    'enterprise value to EBITDA',
    'enterprise value to earning before interest',
]
p_e = ['P/E', 'P / E', 'price to earnings']
p_nav = ['P/NAV', 'P / NAV', 'price to net asset value']
p_b = ['P/B', 'P / B', 'price to book']

# all ratio variants flattened into one list, in the order the ratios are listed above
comps_ratios = [
    keyword
    for ratio_variants in (ev_rev, ev_gp, ev_ebitda, p_e, p_nav, p_b)
    for keyword in ratio_variants
]
# complete comparables vocabulary: generic phrases first, then the ratio variants
cmp = comps + comps_ratios

### 2. Sample Search

In [3]:
# get list of files with keywords
# Searches with the DCF keyword list, the "data" directory argument and the
# date bounds 2008-12-01 / 2008-12-31. The cells below index the result by
# file path and call .keys() on each entry to list the keywords found.
# NOTE(review): whether the date bounds are inclusive is decided inside
# text_search.find_keyword_in_files -- confirm there.
example = text_search.find_keyword_in_files(dcf, "data", "2008-12-01", "2008-12-31")

Searching file: data/2008-12-02_1060755.html
Searching file: data/2008-12-02_930364.html
Searching file: data/2008-12-04_1335293.html
Searching file: data/2008-12-05_895993.html
Searching file: data/2008-12-08_1025134.html
Searching file: data/2008-12-08_1050031.html
Searching file: data/2008-12-08_21212.html
Searching file: data/2008-12-10_1165460.html
Searching file: data/2008-12-11_1098074.html
Searching file: data/2008-12-15_927829.html
Searching file: data/2008-12-17_877902.html
Searching file: data/2008-12-22_1013706.html
Searching file: data/2008-12-22_811830.html
Searching file: data/2008-12-23_1017137.html
Searching file: data/2008-12-24_1086774.html
Searching file: data/2008-12-24_842640.html


In [4]:
# look into keywords hit in a file
# `example` is indexed by file path; the keys of the selected entry are the
# keywords recorded for that one file.
example['data/2008-12-04_1335293.html'].keys()

dict_keys(['discounted cash flow', 'cost of capital', 'cost of equity', 'WACC', 'weighted average cost of capital', 'CAPM', 'capital asset pricing model', 'revenue forecast', 'forecasted revenue', 'forecasted revenues', 'expense forecast', 'capital asset'])

In [5]:
# look into detailed text of a keyword hit
#example['data/2008-12-04_1335293.html']['cost of capital'][0]

In [6]:
# count number of key words per file
# Prints one line per file in `example`: the file path followed by how many
# different keywords have an entry for that file.
for path, hits in example.items():
    # len() of the per-file mapping is its number of keys, i.e. matched keywords
    print(path, len(hits))

data/2008-12-02_1060755.html 4
data/2008-12-02_930364.html 4
data/2008-12-04_1335293.html 12
data/2008-12-05_895993.html 2
data/2008-12-08_1025134.html 2
data/2008-12-08_1050031.html 6
data/2008-12-08_21212.html 3
data/2008-12-10_1165460.html 4
data/2008-12-11_1098074.html 7
data/2008-12-15_927829.html 4
data/2008-12-17_877902.html 11
data/2008-12-22_1013706.html 2
data/2008-12-22_811830.html 3
data/2008-12-23_1017137.html 5
data/2008-12-24_1086774.html 5
data/2008-12-24_842640.html 2


In [7]:
# make summary stats to a dataframe
# One row per file in `example`:
#   year, month : taken from the YYYY-MM-DD prefix of the file's base name
#   filename    : the path exactly as it appears as a key in `example`
#   dcf_freq    : number of different DCF keywords with an entry for the file
#                 (a count of keywords, not of individual occurrences)
#   dcf_hit     : True when at least one keyword was found
cols = ['year','month','filename', 'dcf_freq','dcf_hit']
lst = []
for path, hits in example.items():
    # Read the date from the base name instead of from fixed offsets into the
    # full path, so the result does not depend on the directory prefix being
    # exactly "data/". Backslashes are normalised so either separator works.
    base = path.replace("\\", "/").rsplit("/", 1)[-1]
    n_keywords = len(hits)
    lst.append([base[0:4], base[5:7], path, n_keywords, n_keywords > 0])
df = pd.DataFrame(lst, columns=cols)
df

Unnamed: 0,year,month,filename,dcf_freq,dcf_hit
0,2008,12,data/2008-12-02_1060755.html,4,True
1,2008,12,data/2008-12-02_930364.html,4,True
2,2008,12,data/2008-12-04_1335293.html,12,True
3,2008,12,data/2008-12-05_895993.html,2,True
4,2008,12,data/2008-12-08_1025134.html,2,True
5,2008,12,data/2008-12-08_1050031.html,6,True
6,2008,12,data/2008-12-08_21212.html,3,True
7,2008,12,data/2008-12-10_1165460.html,4,True
8,2008,12,data/2008-12-11_1098074.html,7,True
9,2008,12,data/2008-12-15_927829.html,4,True


### 3. Analysis on valuation methods over the years

In [8]:
# Summarizing DCF stats year by year for 1995 through 2018
# (range() stops before 2019). Each year's search result is stored under
# the key "year<YYYY>_dcf".
dcfstat = {}
for year in range(1995, 2019):
    first_day = str(year) + "-01-01"
    last_day = str(year) + "-12-31"
    yearly_hits = text_search.find_keyword_in_files(dcf, "data", first_day, last_day)
    dcfstat["year{0}_dcf".format(year)] = yearly_hits

Searching file: data/1995-02-10_60026.txt
Searching file: data/1995-02-10_897599.txt
Searching file: data/1995-03-13_63506.txt
Searching file: data/1995-03-16_106015.txt
Searching file: data/1995-03-16_864328.txt
Searching file: data/1995-03-29_79920.txt
Searching file: data/1995-03-30_920148.txt
Searching file: data/1995-04-04_4960.txt
Searching file: data/1995-04-10_351547.txt
Searching file: data/1995-04-18_726957.txt
Searching file: data/1995-04-25_858339.txt
Searching file: data/1995-04-26_847322.txt
Searching file: data/1995-04-28_814577.txt
Searching file: data/1995-05-04_65660.txt
Searching file: data/1995-05-09_724176.txt
Searching file: data/1995-05-15_319767.txt
Searching file: data/1995-06-02_65771.txt
Searching file: data/1995-06-02_850134.txt
Searching file: data/1995-06-05_701382.txt
Searching file: data/1995-06-14_13006.txt
Searching file: data/1995-06-14_351993.txt
Searching file: data/1995-06-14_775272.txt
Searching file: data/1995-06-28_726606.txt
Searching file: dat

Searching file: data/1996-11-12_914122.txt
Searching file: data/1996-11-12_933136.txt
Searching file: data/1996-11-12_949058.txt
Searching file: data/1996-11-14_811714.txt
Searching file: data/1996-11-15_840016.txt
Searching file: data/1996-11-18_846931.txt
Searching file: data/1996-11-19_898623.txt
Searching file: data/1996-11-20_36537.txt
Searching file: data/1996-11-20_37481.txt
Searching file: data/1996-11-20_66895.txt
Searching file: data/1996-11-20_891103.txt
Searching file: data/1996-11-21_837942.txt
Searching file: data/1996-11-22_755659.txt
Searching file: data/1996-11-25_897732.txt
Searching file: data/1996-11-27_32198.txt
Searching file: data/1996-11-27_944740.txt
Searching file: data/1996-11-29_910648.txt
Searching file: data/1996-11-29_929989.txt
Searching file: data/1996-12-03_5513.txt
Searching file: data/1996-12-06_848865.txt
Searching file: data/1996-12-23_929989.txt
Searching file: data/1997-01-10_795879.txt
Searching file: data/1997-01-10_949008.txt
Searching file: d

Searching file: data/1997-12-29_892833.txt
Searching file: data/1997-12-29_936888.txt
Searching file: data/1997-12-31_790966.txt
Searching file: data/1997-12-31_829117.txt
Searching file: data/1998-01-05_1007997.txt
Searching file: data/1998-01-05_945429.txt
Searching file: data/1998-01-06_64247.txt
Searching file: data/1998-01-06_879465.txt
Searching file: data/1998-01-06_890096.txt
Searching file: data/1998-01-07_86135.txt
Searching file: data/1998-01-13_828808.txt
Searching file: data/1998-01-13_891103.txt
Searching file: data/1998-01-14_1003110.txt
Searching file: data/1998-01-14_67494.txt
Searching file: data/1998-01-14_841074.txt
Searching file: data/1998-01-14_899647.txt
Searching file: data/1998-01-20_1019254.txt
Searching file: data/1998-01-20_784681.txt
Searching file: data/1998-01-20_79259.txt
Searching file: data/1998-01-21_860731.txt
Searching file: data/1998-01-27_72945.txt
Searching file: data/1998-01-27_886171.txt
Searching file: data/1998-01-27_928451.txt
Searching fil

Searching file: data/1998-10-26_351601.txt
Searching file: data/1998-10-30_1046676.txt
Searching file: data/1998-11-04_1007518.txt
Searching file: data/1998-11-06_837330.txt
Searching file: data/1998-11-09_795090.txt
Searching file: data/1998-11-10_1017362.txt
Searching file: data/1998-11-10_855932.txt
Searching file: data/1998-11-12_1017829.txt
Searching file: data/1998-11-12_1071645.txt
Searching file: data/1998-11-12_311174.txt
Searching file: data/1998-11-12_316554.txt
Searching file: data/1998-11-12_317859.txt
Searching file: data/1998-11-12_700664.txt
Searching file: data/1998-11-12_737876.txt
Searching file: data/1998-11-13_818155.txt
Searching file: data/1998-11-13_925052.txt
Searching file: data/1998-11-17_785557.txt
Searching file: data/1998-11-18_1003007.txt
Searching file: data/1998-11-19_774557.txt
Searching file: data/1998-11-19_782975.txt
Searching file: data/1998-11-19_789292.txt
Searching file: data/1998-11-20_81764.txt
Searching file: data/1998-11-24_775298.txt
Search

Searching file: data/1999-08-26_1025557.txt
Searching file: data/1999-08-27_842915.txt
Searching file: data/1999-08-31_1061399.txt
Searching file: data/1999-08-31_4310.txt
Searching file: data/1999-09-01_1059259.txt
Searching file: data/1999-09-01_39547.txt
Searching file: data/1999-09-01_890568.txt
Searching file: data/1999-09-03_869446.txt
Searching file: data/1999-09-07_798539.txt
Searching file: data/1999-09-09_851729.txt
Searching file: data/1999-09-10_804073.txt
Searching file: data/1999-09-14_1009532.txt
Searching file: data/1999-09-14_716133.txt
Searching file: data/1999-09-14_78066.txt
Searching file: data/1999-09-16_1006370.txt
Searching file: data/1999-09-16_101911.txt
Searching file: data/1999-09-17_1010856.txt
Searching file: data/1999-09-17_78536.txt
Searching file: data/1999-09-20_1046389.txt
Searching file: data/1999-09-21_1092395.txt
Searching file: data/1999-09-22_949881.txt
Searching file: data/1999-09-24_832320.txt
Searching file: data/1999-09-27_776867.txt
Searchin

Searching file: data/2000-04-14_764403.txt
Searching file: data/2000-04-14_889427.txt
Searching file: data/2000-04-17_1010138.txt
Searching file: data/2000-04-21_861388.txt
Searching file: data/2000-04-25_771729.txt
Searching file: data/2000-04-25_818968.txt
Searching file: data/2000-04-26_102993.txt
Searching file: data/2000-04-26_1030615.txt
Searching file: data/2000-04-26_838171.txt
Searching file: data/2000-04-28_745469.txt
Searching file: data/2000-05-01_1044738.txt
Searching file: data/2000-05-01_793933.txt
Searching file: data/2000-05-02_1038078.txt
Searching file: data/2000-05-02_923284.txt
Searching file: data/2000-05-03_1096841.txt
Searching file: data/2000-05-03_845434.txt
Searching file: data/2000-05-04_316709.txt
Searching file: data/2000-05-05_42246.txt
Searching file: data/2000-05-05_727347.txt
Searching file: data/2000-05-05_824740.txt
Searching file: data/2000-05-05_913771.txt
Searching file: data/2000-05-05_925260.txt
Searching file: data/2000-05-08_1030341.txt
Search

Searching file: data/2000-12-21_1019265.txt
Searching file: data/2000-12-22_776848.txt
Searching file: data/2000-12-26_709335.txt
Searching file: data/2000-12-28_1016169.txt
Searching file: data/2000-12-28_827795.txt
Searching file: data/2000-12-28_904080.txt
Searching file: data/2000-12-29_1085818.txt
Searching file: data/2001-01-02_1094231.txt
Searching file: data/2001-01-02_352683.txt
Searching file: data/2001-01-02_926861.txt
Searching file: data/2001-01-03_1025742.txt
Searching file: data/2001-01-03_947220.txt
Searching file: data/2001-01-04_841939.txt
Searching file: data/2001-01-05_1034898.txt
Searching file: data/2001-01-09_869554.txt
Searching file: data/2001-01-10_1070321.txt
Searching file: data/2001-01-10_846729.txt
Searching file: data/2001-01-12_1070518.txt
Searching file: data/2001-01-17_1013564.txt
Searching file: data/2001-01-23_1039311.txt
Searching file: data/2001-01-23_355627.txt
Searching file: data/2001-01-23_919006.txt
Searching file: data/2001-01-25_96638.txt
Se

Searching file: data/2001-10-15_889409.txt
Searching file: data/2001-10-17_1063257.txt
Searching file: data/2001-10-17_41850.txt
Searching file: data/2001-10-18_700612.txt
Searching file: data/2001-10-18_835405.txt
Searching file: data/2001-10-19_729365.txt
Searching file: data/2001-10-19_796226.txt
Searching file: data/2001-10-23_1019731.txt
Searching file: data/2001-10-25_1097070.txt
Searching file: data/2001-10-25_919568.txt
Searching file: data/2001-10-26_1012369.txt
Searching file: data/2001-10-30_1002119.txt
Searching file: data/2001-10-31_1050893.txt
Searching file: data/2001-11-01_1035426.txt
Searching file: data/2001-11-02_1000301.txt
Searching file: data/2001-11-02_902789.txt
Searching file: data/2001-11-05_915803.txt
Searching file: data/2001-11-07_1077771.html
Searching file: data/2001-11-07_879116.html
Searching file: data/2001-11-08_1012123.txt
Searching file: data/2001-11-08_1064122.html
Searching file: data/2001-11-08_313749.html
Searching file: data/2001-11-08_889409.t

Searching file: data/2002-11-06_949301.html
Searching file: data/2002-11-07_745113.txt
Searching file: data/2002-11-08_75049.txt
Searching file: data/2002-11-12_855711.html
Searching file: data/2002-11-15_1005126.html
Searching file: data/2002-11-15_888245.txt
Searching file: data/2002-11-18_1014955.txt
Searching file: data/2002-11-18_1029784.html
Searching file: data/2002-11-20_818999.txt
Searching file: data/2002-11-20_905897.txt
Searching file: data/2002-11-21_1002044.txt
Searching file: data/2002-11-22_942615.html
Searching file: data/2002-11-26_1029688.html
Searching file: data/2002-11-26_897743.txt
Searching file: data/2002-11-27_318259.html
Searching file: data/2002-12-02_1052196.txt
Searching file: data/2002-12-06_1110903.html
Searching file: data/2002-12-09_1101239.html
Searching file: data/2002-12-16_95676.html
Searching file: data/2002-12-20_1094335.html
Searching file: data/2002-12-20_779282.html
Searching file: data/2002-12-23_722056.html
Searching file: data/2002-12-23_75

Searching file: data/2003-11-18_1047262.html
Searching file: data/2003-11-19_1051825.txt
Searching file: data/2003-11-19_1102752.html
Searching file: data/2003-11-20_1105503.html
Searching file: data/2003-11-20_752692.txt
Searching file: data/2003-11-24_732439.txt
Searching file: data/2003-11-24_889899.txt
Searching file: data/2003-11-26_875354.txt
Searching file: data/2003-11-28_1108520.txt
Searching file: data/2003-12-01_720026.html
Searching file: data/2003-12-02_65270.html
Searching file: data/2003-12-03_1078425.html
Searching file: data/2003-12-04_863441.txt
Searching file: data/2003-12-10_930454.txt
Searching file: data/2003-12-16_906473.txt
Searching file: data/2003-12-19_830157.txt
Searching file: data/2003-12-19_892832.txt
Searching file: data/2003-12-23_1020359.txt
Searching file: data/2003-12-23_1043236.html
Searching file: data/2003-12-29_796502.txt
Searching file: data/2003-12-29_924645.html
Searching file: data/2003-12-30_1018332.html
Searching file: data/2003-12-31_84581

Searching file: data/2004-11-12_1023398.txt
Searching file: data/2004-11-12_1095099.html
Searching file: data/2004-11-12_835412.txt
Searching file: data/2004-11-15_1020905.html
Searching file: data/2004-11-16_1016937.html
Searching file: data/2004-11-16_1023771.html
Searching file: data/2004-11-16_1023772.html
Searching file: data/2004-11-16_1037897.html
Searching file: data/2004-11-16_1217286.html
Searching file: data/2004-11-16_763852.html
Searching file: data/2004-11-16_803747.html
Searching file: data/2004-11-16_814078.html
Searching file: data/2004-11-16_894088.html
Searching file: data/2004-11-16_894089.html
Searching file: data/2004-11-16_913829.html
Searching file: data/2004-11-16_919034.html
Searching file: data/2004-11-19_1026816.html
Searching file: data/2004-11-22_929647.html
Searching file: data/2004-11-23_1006614.html
Searching file: data/2004-11-24_722077.html
Searching file: data/2004-11-26_1269036.txt
Searching file: data/2004-11-30_929037.html
Searching file: data/200

Searching file: data/2005-08-22_870760.html
Searching file: data/2005-08-24_930796.txt
Searching file: data/2005-08-25_1088244.html
Searching file: data/2005-08-29_750901.html
Searching file: data/2005-08-29_913782.html
Searching file: data/2005-08-30_1061881.html
Searching file: data/2005-08-30_1063537.html
Searching file: data/2005-08-30_1089613.html
Searching file: data/2005-08-31_1132327.html
Searching file: data/2005-09-01_723527.html
Searching file: data/2005-09-02_1110009.html
Searching file: data/2005-09-02_912960.txt
Searching file: data/2005-09-06_1027258.html
Searching file: data/2005-09-07_1141363.html
Searching file: data/2005-09-07_860451.html
Searching file: data/2005-09-08_916823.html
Searching file: data/2005-09-09_1133862.html
Searching file: data/2005-09-13_933972.html
Searching file: data/2005-09-14_1037975.html
Searching file: data/2005-09-14_1038363.html
Searching file: data/2005-09-14_1082337.html
Searching file: data/2005-09-14_1089932.html
Searching file: data/

Searching file: data/2006-05-26_812152.txt
Searching file: data/2006-05-31_1040660.html
Searching file: data/2006-05-31_1114868.txt
Searching file: data/2006-06-01_874992.html
Searching file: data/2006-06-01_893577.html
Searching file: data/2006-06-01_908440.html
Searching file: data/2006-06-02_732713.html
Searching file: data/2006-06-05_1013467.html
Searching file: data/2006-06-06_1287808.html
Searching file: data/2006-06-07_1097641.html
Searching file: data/2006-06-08_1099674.html
Searching file: data/2006-06-09_1205431.html
Searching file: data/2006-06-12_767673.txt
Searching file: data/2006-06-12_867490.html
Searching file: data/2006-06-14_884498.html
Searching file: data/2006-06-16_787784.html
Searching file: data/2006-06-19_355787.html
Searching file: data/2006-06-19_85812.html
Searching file: data/2006-06-21_702259.html
Searching file: data/2006-06-22_878903.html
Searching file: data/2006-06-26_1184818.html
Searching file: data/2006-06-27_215403.html
Searching file: data/2006-06

Searching file: data/2006-12-22_858558.html
Searching file: data/2006-12-26_801873.html
Searching file: data/2006-12-29_1017440.html
Searching file: data/2006-12-29_1038339.html
Searching file: data/2006-12-29_913293.html
Searching file: data/2007-01-03_1012482.html
Searching file: data/2007-01-03_1070764.html
Searching file: data/2007-01-04_803027.html
Searching file: data/2007-01-04_820081.html
Searching file: data/2007-01-05_1032462.html
Searching file: data/2007-01-05_1037275.html
Searching file: data/2007-01-05_1056874.html
Searching file: data/2007-01-05_23019.html
Searching file: data/2007-01-09_750813.html
Searching file: data/2007-01-10_1125011.html
Searching file: data/2007-01-11_1033864.html
Searching file: data/2007-01-12_1113107.html
Searching file: data/2007-01-12_1327098.html
Searching file: data/2007-01-16_887637.html
Searching file: data/2007-01-18_64908.html
Searching file: data/2007-01-22_1060939.txt
Searching file: data/2007-01-22_1099728.txt
Searching file: data/20

Searching file: data/2007-07-12_1060409.txt
Searching file: data/2007-07-13_1156826.html
Searching file: data/2007-07-13_1332585.html
Searching file: data/2007-07-13_726513.html
Searching file: data/2007-07-13_899171.html
Searching file: data/2007-07-13_94887.html
Searching file: data/2007-07-16_1048611.html
Searching file: data/2007-07-16_1091312.html
Searching file: data/2007-07-16_12239.html
Searching file: data/2007-07-17_37651.html
Searching file: data/2007-07-17_894738.html
Searching file: data/2007-07-18_1032033.html
Searching file: data/2007-07-20_1137047.html
Searching file: data/2007-07-24_65873.html
Searching file: data/2007-07-24_723928.html
Searching file: data/2007-07-24_830141.html
Searching file: data/2007-07-25_1001604.html
Searching file: data/2007-07-25_1023291.html
Searching file: data/2007-07-27_39135.html
Searching file: data/2007-07-27_893430.html
Searching file: data/2007-07-31_1124887.html
Searching file: data/2007-08-01_1158269.html
Searching file: data/2007-0

Searching file: data/2008-02-25_66025.html
Searching file: data/2008-02-29_907250.html
Searching file: data/2008-03-03_1031798.html
Searching file: data/2008-03-05_875623.html
Searching file: data/2008-03-07_1171218.html
Searching file: data/2008-03-10_1325098.html
Searching file: data/2008-03-13_743532.html
Searching file: data/2008-03-17_1040596.html
Searching file: data/2008-03-17_1043639.html
Searching file: data/2008-03-21_1060801.html
Searching file: data/2008-03-24_1331474.html
Searching file: data/2008-03-24_276889.html
Searching file: data/2008-03-25_1096689.html
Searching file: data/2008-03-26_860331.txt
Searching file: data/2008-03-31_725058.html
Searching file: data/2008-04-02_1042351.html
Searching file: data/2008-04-03_1125845.html
Searching file: data/2008-04-04_1060559.html
Searching file: data/2008-04-10_105006.html
Searching file: data/2008-04-10_1256540.html
Searching file: data/2008-04-10_947577.html
Searching file: data/2008-04-14_908254.html
Searching file: data/2

Searching file: data/2009-02-23_931911.html
Searching file: data/2009-03-13_927829.html
Searching file: data/2009-03-24_355637.html
Searching file: data/2009-04-01_1063939.html
Searching file: data/2009-04-06_877645.html
Searching file: data/2009-04-15_1035185.html
Searching file: data/2009-04-17_1351509.html
Searching file: data/2009-05-07_1141719.html
Searching file: data/2009-05-12_1031283.html
Searching file: data/2009-05-14_1361872.html
Searching file: data/2009-05-14_880431.html
Searching file: data/2009-05-26_1348259.html
Searching file: data/2009-05-27_1104855.html
Searching file: data/2009-06-02_1104017.html
Searching file: data/2009-06-02_1163943.html
Searching file: data/2009-06-08_709519.html
Searching file: data/2009-06-15_1300128.html
Searching file: data/2009-06-17_5187.html
Searching file: data/2009-06-19_853273.html
Searching file: data/2009-06-22_1233426.html
Searching file: data/2009-06-23_1269132.html
Searching file: data/2009-06-25_1310243.html
Searching file: data

Searching file: data/2010-05-26_938113.html
Searching file: data/2010-05-28_919628.html
Searching file: data/2010-06-01_821995.html
Searching file: data/2010-06-02_1253710.html
Searching file: data/2010-06-03_900708.html
Searching file: data/2010-06-04_1097136.html
Searching file: data/2010-06-08_737207.html
Searching file: data/2010-06-11_1093434.html
Searching file: data/2010-06-11_934280.html
Searching file: data/2010-06-17_1089473.html
Searching file: data/2010-06-17_1294649.html
Searching file: data/2010-06-18_1339729.html
Searching file: data/2010-06-18_1361579.html
Searching file: data/2010-06-21_1370314.html
Searching file: data/2010-06-22_1093885.html
Searching file: data/2010-06-23_1222497.html
Searching file: data/2010-06-28_1075206.html
Searching file: data/2010-06-28_1344705.html
Searching file: data/2010-06-29_1362120.html
Searching file: data/2010-06-29_823130.html
Searching file: data/2010-06-30_913616.html
Searching file: data/2010-07-01_1100682.html
Searching file: da

Searching file: data/2011-04-15_1280396.html
Searching file: data/2011-04-18_1033032.html
Searching file: data/2011-04-18_1355096.html
Searching file: data/2011-04-18_731131.html
Searching file: data/2011-04-18_742550.html
Searching file: data/2011-04-18_928911.html
Searching file: data/2011-04-22_1344154.html
Searching file: data/2011-04-22_788885.html
Searching file: data/2011-04-26_833081.html
Searching file: data/2011-04-27_94610.html
Searching file: data/2011-04-28_877476.html
Searching file: data/2011-04-29_1086467.html
Searching file: data/2011-04-29_37748.html
Searching file: data/2011-05-02_1003640.html
Searching file: data/2011-05-02_899881.html
Searching file: data/2011-05-02_919722.html
Searching file: data/2011-05-03_944947.html
Searching file: data/2011-05-04_1099932.html
Searching file: data/2011-05-04_92769.html
Searching file: data/2011-05-05_1089044.html
Searching file: data/2011-05-05_1174872.html
Searching file: data/2011-05-05_60751.html
Searching file: data/2011-0

Searching file: data/2012-05-01_77449.html
Searching file: data/2012-05-03_1043533.html
Searching file: data/2012-05-03_1313024.html
Searching file: data/2012-05-04_1059763.html
Searching file: data/2012-05-04_930553.html
Searching file: data/2012-05-08_810829.html
Searching file: data/2012-05-09_1061393.html
Searching file: data/2012-05-16_778946.html
Searching file: data/2012-05-21_1026650.html
Searching file: data/2012-05-21_1103390.html
Searching file: data/2012-05-24_1096788.html
Searching file: data/2012-05-24_1404636.html
Searching file: data/2012-05-25_1043382.html
Searching file: data/2012-05-30_922204.html
Searching file: data/2012-05-31_93384.html
Searching file: data/2012-06-01_1354309.html
Searching file: data/2012-06-06_894738.html
Searching file: data/2012-06-12_793983.html
Searching file: data/2012-06-21_1016470.html
Searching file: data/2012-06-27_41023.html
Searching file: data/2012-06-27_739944.html
Searching file: data/2012-06-29_820237.html
Searching file: data/201

Searching file: data/2013-05-03_64279.html
Searching file: data/2013-05-06_1264707.html
Searching file: data/2013-05-09_1230355.html
Searching file: data/2013-05-09_851249.html
Searching file: data/2013-05-10_1017712.html
Searching file: data/2013-05-10_1293283.html
Searching file: data/2013-05-13_1404296.html
Searching file: data/2013-05-16_1323648.html
Searching file: data/2013-05-17_1183186.html
Searching file: data/2013-05-23_881695.html
Searching file: data/2013-05-24_320387.html
Searching file: data/2013-05-28_1060749.html
Searching file: data/2013-05-28_1277092.html
Searching file: data/2013-05-30_60849.html
Searching file: data/2013-05-31_1514732.html
Searching file: data/2013-05-31_826083.html
Searching file: data/2013-06-04_724267.html
Searching file: data/2013-06-07_1113784.html
Searching file: data/2013-06-10_1274644.html
Searching file: data/2013-06-10_12978.html
Searching file: data/2013-06-10_701345.html
Searching file: data/2013-06-12_1006837.html
Searching file: data/2

Searching file: data/2014-06-18_834162.html
Searching file: data/2014-06-19_1302177.html
Searching file: data/2014-06-19_86144.html
Searching file: data/2014-06-26_37008.html
Searching file: data/2014-06-30_353191.html
Searching file: data/2014-06-30_704159.html
Searching file: data/2014-07-01_44689.html
Searching file: data/2014-07-03_1001907.html
Searching file: data/2014-07-03_1289592.html
Searching file: data/2014-07-03_1326364.html
Searching file: data/2014-07-03_891417.html
Searching file: data/2014-07-07_1510326.html
Searching file: data/2014-07-08_1083712.html
Searching file: data/2014-07-14_891288.html
Searching file: data/2014-07-16_886475.html
Searching file: data/2014-07-24_1575571.html
Searching file: data/2014-07-24_216539.html
Searching file: data/2014-07-24_778734.html
Searching file: data/2014-07-29_1415277.html
Searching file: data/2014-07-29_833444.html
Searching file: data/2014-07-30_1361709.html
Searching file: data/2014-07-31_1379075.html
Searching file: data/2014

Searching file: data/2015-05-12_1485001.html
Searching file: data/2015-05-13_1500711.html
Searching file: data/2015-05-14_1543418.html
Searching file: data/2015-05-14_737300.html
Searching file: data/2015-05-18_1080099.html
Searching file: data/2015-05-18_800240.html
Searching file: data/2015-05-19_1001279.html
Searching file: data/2015-05-21_1159297.html
Searching file: data/2015-05-21_1492633.html
Searching file: data/2015-05-21_230498.html
Searching file: data/2015-05-26_1165880.html
Searching file: data/2015-05-26_1408287.html
Searching file: data/2015-05-28_1304740.html
Searching file: data/2015-05-29_1166220.html
Searching file: data/2015-06-01_1392522.html
Searching file: data/2015-06-01_1478950.html
Searching file: data/2015-06-02_1545158.html
Searching file: data/2015-06-04_1365997.html
Searching file: data/2015-06-05_1123735.html
Searching file: data/2015-06-08_1363851.html
Searching file: data/2015-06-10_1106851.html
Searching file: data/2015-06-10_1576044.html
Searching fil

Searching file: data/2016-03-23_929887.html
Searching file: data/2016-03-25_1546640.html
Searching file: data/2016-03-25_1606657.html
Searching file: data/2016-03-29_225263.html
Searching file: data/2016-03-30_879555.html
Searching file: data/2016-03-31_1603969.html
Searching file: data/2016-03-31_885322.html
Searching file: data/2016-04-01_1622620.html
Searching file: data/2016-04-05_751652.html
Searching file: data/2016-04-07_1501756.html
Searching file: data/2016-04-07_881790.html
Searching file: data/2016-04-07_882154.html
Searching file: data/2016-04-08_215419.html
Searching file: data/2016-04-11_1491675.html
Searching file: data/2016-04-18_1620546.html
Searching file: data/2016-04-20_1507986.html
Searching file: data/2016-04-28_830916.html
Searching file: data/2016-04-29_354869.html
Searching file: data/2016-05-03_32689.html
Searching file: data/2016-05-09_1331745.html
Searching file: data/2016-05-09_1364856.html
Searching file: data/2016-05-09_1485176.html
Searching file: data/2

Searching file: data/2016-12-23_1009976.html
Searching file: data/2016-12-23_1091862.html
Searching file: data/2016-12-23_1383871.html
Searching file: data/2016-12-28_1091171.html
Searching file: data/2016-12-29_1003344.html
Searching file: data/2016-12-29_1179755.html
Searching file: data/2016-12-29_1401521.html
Searching file: data/2016-12-30_863015.html
Searching file: data/2017-01-03_1420783.html
Searching file: data/2017-01-03_1542299.html
Searching file: data/2017-01-04_701811.html
Searching file: data/2017-01-09_1105705.html
Searching file: data/2017-01-12_1574532.html
Searching file: data/2017-01-13_61398.html
Searching file: data/2017-01-17_1514128.html
Searching file: data/2017-01-18_1030206.html
Searching file: data/2017-01-20_20740.html
Searching file: data/2017-01-20_800459.html
Searching file: data/2017-01-24_1042810.html
Searching file: data/2017-01-24_1102541.html
Searching file: data/2017-01-24_845289.html
Searching file: data/2017-01-25_1276520.html
Searching file: da

Searching file: data/2017-10-11_1157377.html
Searching file: data/2017-10-12_1551986.html
Searching file: data/2017-10-12_1588238.html
Searching file: data/2017-10-16_1067837.html
Searching file: data/2017-10-16_1579471.html
Searching file: data/2017-10-17_1599891.html
Searching file: data/2017-10-19_1331875.html
Searching file: data/2017-10-19_1430602.html
Searching file: data/2017-10-19_720002.html
Searching file: data/2017-10-23_1609234.html
Searching file: data/2017-10-25_866121.html
Searching file: data/2017-10-26_1022705.html
Searching file: data/2017-10-30_1583744.html
Searching file: data/2017-10-31_1498415.html
Searching file: data/2017-10-31_870753.html
Searching file: data/2017-11-06_749038.html
Searching file: data/2017-11-07_1085621.html
Searching file: data/2017-11-07_1179500.html
Searching file: data/2017-11-09_777491.html
Searching file: data/2017-11-14_916457.html
Searching file: data/2017-11-16_1180079.html
Searching file: data/2017-11-17_33769.html
Searching file: da

Searching file: data/2018-08-03_1358831.html
Searching file: data/2018-08-03_1404973.html
Searching file: data/2018-08-03_1642081.html
Searching file: data/2018-08-03_1703785.html
Searching file: data/2018-08-03_1735828.html
Searching file: data/2018-08-03_50104.html
Searching file: data/2018-08-06_1374535.html
Searching file: data/2018-08-07_1714973.html
Searching file: data/2018-08-10_356028.html
Searching file: data/2018-08-10_873044.html
Searching file: data/2018-08-13_1302343.html
Searching file: data/2018-08-13_1678531.html
Searching file: data/2018-08-15_933036.html
Searching file: data/2018-08-16_1324410.html
Searching file: data/2018-08-16_1564618.html
Searching file: data/2018-08-17_1028734.html
Searching file: data/2018-08-21_1582966.html
Searching file: data/2018-08-22_1511198.html
Searching file: data/2018-08-23_757011.html
Searching file: data/2018-08-24_733269.html
Searching file: data/2018-08-27_39677.html
Searching file: data/2018-08-28_1040426.html
Searching file: dat

In [9]:
# make summary stats to a dataframe dcfstatsummary
# One row per file across every yearly result held in `dcfstat` (1995-2018):
#   year, month : taken from the YYYY-MM-DD prefix of the file's base name
#   filename    : the path exactly as it appears as a key in the yearly result
#   dcf_freq    : number of different DCF keywords with an entry for the file
#   dcf_hit     : True when at least one keyword was found
#
# All rows are collected first and the frame is built once. Growing a frame
# with pd.concat inside the loop re-copies the accumulated rows on every pass,
# and concatenating onto an initially empty frame is deprecated behaviour in
# recent pandas releases (verify against the pandas version in use).
cols = ['year','month','filename', 'dcf_freq','dcf_hit']
rows = []
for y in range(1995, 2019):
    for path, hits in dcfstat["year{0}_dcf".format(y)].items():
        # Read the date from the base name rather than from fixed offsets into
        # the full path, so the directory prefix need not be exactly "data/".
        base = path.replace("\\", "/").rsplit("/", 1)[-1]
        n_keywords = len(hits)
        rows.append([base[0:4], base[5:7], path, n_keywords, n_keywords > 0])
dcfstatsummary = pd.DataFrame(rows, columns=cols)

In [10]:
# Renumber the rows 0..n-1. drop=True discards the previous index instead of
# keeping it as an extra column. The bare name on the last line displays the frame.
dcfstatsummary = dcfstatsummary.reset_index(drop=True)
dcfstatsummary

Unnamed: 0,year,month,filename,dcf_freq,dcf_hit
0,1995,02,data/1995-02-10_60026.txt,4,True
1,1995,02,data/1995-02-10_897599.txt,4,True
2,1995,03,data/1995-03-13_63506.txt,3,True
3,1995,03,data/1995-03-16_106015.txt,4,True
4,1995,03,data/1995-03-16_864328.txt,4,True
...,...,...,...,...,...
5028,2018,12,data/2018-12-21_1131096.html,9,True
5029,2018,12,data/2018-12-21_1490349.html,13,True
5030,2018,12,data/2018-12-21_1611110.html,12,True
5031,2018,12,data/2018-12-21_1644963.html,6,True


In [11]:
# Summarize comparable-method (cmp) keyword matches for every calendar year
# from 1995 through 2018.
# cmpstat maps "year<YYYY>_cmp" to whatever text_search.find_keyword_in_files
# returns for the files in "data" dated within that year.
cmpstat = {}
for y in range(1995, 2019):
    first_day = "{0}-01-01".format(y)
    last_day = "{0}-12-31".format(y)
    cmpstat["year{0}_cmp".format(y)] = text_search.find_keyword_in_files(
        cmp, "data", first_day, last_day
    )

Searching file: data/1995-02-10_60026.txt
Searching file: data/1995-02-10_897599.txt
Searching file: data/1995-03-13_63506.txt
Searching file: data/1995-03-16_106015.txt
Searching file: data/1995-03-16_864328.txt
Searching file: data/1995-03-29_79920.txt
Searching file: data/1995-03-30_920148.txt
Searching file: data/1995-04-04_4960.txt
Searching file: data/1995-04-10_351547.txt
Searching file: data/1995-04-18_726957.txt
Searching file: data/1995-04-25_858339.txt
Searching file: data/1995-04-26_847322.txt
Searching file: data/1995-04-28_814577.txt
Searching file: data/1995-05-04_65660.txt
Searching file: data/1995-05-09_724176.txt
Searching file: data/1995-05-15_319767.txt
Searching file: data/1995-06-02_65771.txt
Searching file: data/1995-06-02_850134.txt
Searching file: data/1995-06-05_701382.txt
Searching file: data/1995-06-14_13006.txt
Searching file: data/1995-06-14_351993.txt
Searching file: data/1995-06-14_775272.txt
Searching file: data/1995-06-28_726606.txt
Searching file: dat

Searching file: data/1996-11-12_949058.txt
Searching file: data/1996-11-14_811714.txt
Searching file: data/1996-11-15_840016.txt
Searching file: data/1996-11-18_846931.txt
Searching file: data/1996-11-19_898623.txt
Searching file: data/1996-11-20_36537.txt
Searching file: data/1996-11-20_37481.txt
Searching file: data/1996-11-20_66895.txt
Searching file: data/1996-11-20_891103.txt
Searching file: data/1996-11-21_837942.txt
Searching file: data/1996-11-22_755659.txt
Searching file: data/1996-11-25_897732.txt
Searching file: data/1996-11-27_32198.txt
Searching file: data/1996-11-27_944740.txt
Searching file: data/1996-11-29_910648.txt
Searching file: data/1996-11-29_929989.txt
Searching file: data/1996-12-03_5513.txt
Searching file: data/1996-12-06_848865.txt
Searching file: data/1996-12-23_929989.txt
Searching file: data/1997-01-10_795879.txt
Searching file: data/1997-01-10_949008.txt
Searching file: data/1997-01-16_885376.txt
Searching file: data/1997-01-21_702147.txt
Searching file: d

Searching file: data/1998-01-05_1007997.txt
Searching file: data/1998-01-05_945429.txt
Searching file: data/1998-01-06_64247.txt
Searching file: data/1998-01-06_879465.txt
Searching file: data/1998-01-06_890096.txt
Searching file: data/1998-01-07_86135.txt
Searching file: data/1998-01-13_828808.txt
Searching file: data/1998-01-13_891103.txt
Searching file: data/1998-01-14_1003110.txt
Searching file: data/1998-01-14_67494.txt
Searching file: data/1998-01-14_841074.txt
Searching file: data/1998-01-14_899647.txt
Searching file: data/1998-01-20_1019254.txt
Searching file: data/1998-01-20_784681.txt
Searching file: data/1998-01-20_79259.txt
Searching file: data/1998-01-21_860731.txt
Searching file: data/1998-01-27_72945.txt
Searching file: data/1998-01-27_886171.txt
Searching file: data/1998-01-27_928451.txt
Searching file: data/1998-01-27_936468.txt
Searching file: data/1998-01-28_107263.txt
Searching file: data/1998-01-28_917062.txt
Searching file: data/1998-01-29_62142.txt
Searching file

Searching file: data/1998-11-09_795090.txt
Searching file: data/1998-11-10_1017362.txt
Searching file: data/1998-11-10_855932.txt
Searching file: data/1998-11-12_1017829.txt
Searching file: data/1998-11-12_1071645.txt
Searching file: data/1998-11-12_311174.txt
Searching file: data/1998-11-12_316554.txt
Searching file: data/1998-11-12_317859.txt
Searching file: data/1998-11-12_700664.txt
Searching file: data/1998-11-12_737876.txt
Searching file: data/1998-11-13_818155.txt
Searching file: data/1998-11-13_925052.txt
Searching file: data/1998-11-17_785557.txt
Searching file: data/1998-11-18_1003007.txt
Searching file: data/1998-11-19_774557.txt
Searching file: data/1998-11-19_782975.txt
Searching file: data/1998-11-19_789292.txt
Searching file: data/1998-11-20_81764.txt
Searching file: data/1998-11-24_775298.txt
Searching file: data/1998-11-25_76696.txt
Searching file: data/1998-11-30_1074404.txt
Searching file: data/1998-12-01_818268.txt
Searching file: data/1998-12-03_861184.txt
Searchin

Searching file: data/1999-08-25_1016718.txt
Searching file: data/1999-08-25_771950.txt
Searching file: data/1999-08-26_1025557.txt
Searching file: data/1999-08-27_842915.txt
Searching file: data/1999-08-31_1061399.txt
Searching file: data/1999-08-31_4310.txt
Searching file: data/1999-09-01_1059259.txt
Searching file: data/1999-09-01_39547.txt
Searching file: data/1999-09-01_890568.txt
Searching file: data/1999-09-03_869446.txt
Searching file: data/1999-09-07_798539.txt
Searching file: data/1999-09-09_851729.txt
Searching file: data/1999-09-10_804073.txt
Searching file: data/1999-09-14_1009532.txt
Searching file: data/1999-09-14_716133.txt
Searching file: data/1999-09-14_78066.txt
Searching file: data/1999-09-16_1006370.txt
Searching file: data/1999-09-16_101911.txt
Searching file: data/1999-09-17_1010856.txt
Searching file: data/1999-09-17_78536.txt
Searching file: data/1999-09-20_1046389.txt
Searching file: data/1999-09-21_1092395.txt
Searching file: data/1999-09-22_949881.txt
Searchi

Searching file: data/2000-04-10_1001802.txt
Searching file: data/2000-04-10_1029456.txt
Searching file: data/2000-04-10_319651.txt
Searching file: data/2000-04-12_785970.txt
Searching file: data/2000-04-12_891285.txt
Searching file: data/2000-04-13_830104.txt
Searching file: data/2000-04-14_18934.txt
Searching file: data/2000-04-14_764403.txt
Searching file: data/2000-04-14_889427.txt
Searching file: data/2000-04-17_1010138.txt
Searching file: data/2000-04-21_861388.txt
Searching file: data/2000-04-25_771729.txt
Searching file: data/2000-04-25_818968.txt
Searching file: data/2000-04-26_102993.txt
Searching file: data/2000-04-26_1030615.txt
Searching file: data/2000-04-26_838171.txt
Searching file: data/2000-04-28_745469.txt
Searching file: data/2000-05-01_1044738.txt
Searching file: data/2000-05-01_793933.txt
Searching file: data/2000-05-02_1038078.txt
Searching file: data/2000-05-02_923284.txt
Searching file: data/2000-05-03_1096841.txt
Searching file: data/2000-05-03_845434.txt
Searc

Searching file: data/2000-12-14_1039242.txt
Searching file: data/2000-12-15_1020898.txt
Searching file: data/2000-12-15_844789.txt
Searching file: data/2000-12-19_706864.txt
Searching file: data/2000-12-19_934884.txt
Searching file: data/2000-12-20_897546.txt
Searching file: data/2000-12-21_1009912.txt
Searching file: data/2000-12-21_1019265.txt
Searching file: data/2000-12-22_776848.txt
Searching file: data/2000-12-26_709335.txt
Searching file: data/2000-12-28_1016169.txt
Searching file: data/2000-12-28_827795.txt
Searching file: data/2000-12-28_904080.txt
Searching file: data/2000-12-29_1085818.txt
Searching file: data/2001-01-02_1094231.txt
Searching file: data/2001-01-02_352683.txt
Searching file: data/2001-01-02_926861.txt
Searching file: data/2001-01-03_1025742.txt
Searching file: data/2001-01-03_947220.txt
Searching file: data/2001-01-04_841939.txt
Searching file: data/2001-01-05_1034898.txt
Searching file: data/2001-01-09_869554.txt
Searching file: data/2001-01-10_1070321.txt
S

Searching file: data/2001-09-25_914642.txt
Searching file: data/2001-09-26_1029093.txt
Searching file: data/2001-09-26_1079270.txt
Searching file: data/2001-09-26_899755.txt
Searching file: data/2001-09-26_902055.txt
Searching file: data/2001-09-27_889409.txt
Searching file: data/2001-09-28_1070042.txt
Searching file: data/2001-10-01_912240.txt
Searching file: data/2001-10-03_1086403.txt
Searching file: data/2001-10-04_1041712.txt
Searching file: data/2001-10-09_895921.txt
Searching file: data/2001-10-09_936372.txt
Searching file: data/2001-10-12_946945.txt
Searching file: data/2001-10-15_889409.txt
Searching file: data/2001-10-17_1063257.txt
Searching file: data/2001-10-17_41850.txt
Searching file: data/2001-10-18_700612.txt
Searching file: data/2001-10-18_835405.txt
Searching file: data/2001-10-19_729365.txt
Searching file: data/2001-10-19_796226.txt
Searching file: data/2001-10-23_1019731.txt
Searching file: data/2001-10-25_1097070.txt
Searching file: data/2001-10-25_919568.txt
Sear

Searching file: data/2002-10-23_9779.html
Searching file: data/2002-10-24_1005758.txt
Searching file: data/2002-10-25_716399.html
Searching file: data/2002-10-28_1113529.txt
Searching file: data/2002-11-01_944950.html
Searching file: data/2002-11-04_75049.txt
Searching file: data/2002-11-05_36270.html
Searching file: data/2002-11-05_943897.html
Searching file: data/2002-11-06_913277.html
Searching file: data/2002-11-06_949301.html
Searching file: data/2002-11-07_745113.txt
Searching file: data/2002-11-08_75049.txt
Searching file: data/2002-11-12_855711.html
Searching file: data/2002-11-15_1005126.html
Searching file: data/2002-11-15_888245.txt
Searching file: data/2002-11-18_1014955.txt
Searching file: data/2002-11-18_1029784.html
Searching file: data/2002-11-20_818999.txt
Searching file: data/2002-11-20_905897.txt
Searching file: data/2002-11-21_1002044.txt
Searching file: data/2002-11-22_942615.html
Searching file: data/2002-11-26_1029688.html
Searching file: data/2002-11-26_897743.t

Searching file: data/2003-11-06_1114617.html
Searching file: data/2003-11-07_1005507.txt
Searching file: data/2003-11-10_1053924.html
Searching file: data/2003-11-10_891504.html
Searching file: data/2003-11-10_914373.html
Searching file: data/2003-11-12_1058539.txt
Searching file: data/2003-11-12_722571.html
Searching file: data/2003-11-13_714603.html
Searching file: data/2003-11-17_1022820.html
Searching file: data/2003-11-18_1047262.html
Searching file: data/2003-11-19_1051825.txt
Searching file: data/2003-11-19_1102752.html
Searching file: data/2003-11-20_1105503.html
Searching file: data/2003-11-20_752692.txt
Searching file: data/2003-11-24_732439.txt
Searching file: data/2003-11-24_889899.txt
Searching file: data/2003-11-26_875354.txt
Searching file: data/2003-11-28_1108520.txt
Searching file: data/2003-12-01_720026.html
Searching file: data/2003-12-02_65270.html
Searching file: data/2003-12-03_1078425.html
Searching file: data/2003-12-04_863441.txt
Searching file: data/2003-12-10

Searching file: data/2004-10-12_921878.html
Searching file: data/2004-10-15_882484.html
Searching file: data/2004-10-21_1071342.html
Searching file: data/2004-10-22_882287.html
Searching file: data/2004-10-26_840824.html
Searching file: data/2004-10-27_898805.html
Searching file: data/2004-10-29_1070764.txt
Searching file: data/2004-11-01_878897.html
Searching file: data/2004-11-08_1095478.html
Searching file: data/2004-11-08_356141.html
Searching file: data/2004-11-10_725549.html
Searching file: data/2004-11-12_1023398.txt
Searching file: data/2004-11-12_1095099.html
Searching file: data/2004-11-12_835412.txt
Searching file: data/2004-11-15_1020905.html
Searching file: data/2004-11-16_1016937.html
Searching file: data/2004-11-16_1023771.html
Searching file: data/2004-11-16_1023772.html
Searching file: data/2004-11-16_1037897.html
Searching file: data/2004-11-16_1217286.html
Searching file: data/2004-11-16_763852.html
Searching file: data/2004-11-16_803747.html
Searching file: data/200

Searching file: data/2005-07-26_896861.html
Searching file: data/2005-07-27_1106842.html
Searching file: data/2005-07-27_94673.html
Searching file: data/2005-08-02_1102556.html
Searching file: data/2005-08-02_723888.html
Searching file: data/2005-08-04_1012131.html
Searching file: data/2005-08-04_1178132.html
Searching file: data/2005-08-08_835664.html
Searching file: data/2005-08-09_878314.html
Searching file: data/2005-08-11_49401.html
Searching file: data/2005-08-12_874689.html
Searching file: data/2005-08-19_779390.html
Searching file: data/2005-08-22_1088005.html
Searching file: data/2005-08-22_830134.html
Searching file: data/2005-08-22_870760.html
Searching file: data/2005-08-24_930796.txt
Searching file: data/2005-08-25_1088244.html
Searching file: data/2005-08-29_750901.html
Searching file: data/2005-08-29_913782.html
Searching file: data/2005-08-30_1061881.html
Searching file: data/2005-08-30_1063537.html
Searching file: data/2005-08-30_1089613.html
Searching file: data/2005-

Searching file: data/2006-05-03_936931.html
Searching file: data/2006-05-05_814929.html
Searching file: data/2006-05-08_790818.txt
Searching file: data/2006-05-08_857323.html
Searching file: data/2006-05-08_879554.html
Searching file: data/2006-05-10_1064236.txt
Searching file: data/2006-05-10_836435.html
Searching file: data/2006-05-11_1171159.html
Searching file: data/2006-05-12_1017022.html
Searching file: data/2006-05-15_1108487.html
Searching file: data/2006-05-16_1014507.html
Searching file: data/2006-05-16_868984.html
Searching file: data/2006-05-18_1123337.html
Searching file: data/2006-05-23_1084827.html
Searching file: data/2006-05-25_1109354.html
Searching file: data/2006-05-26_812152.txt
Searching file: data/2006-05-31_1040660.html
Searching file: data/2006-05-31_1114868.txt
Searching file: data/2006-06-01_874992.html
Searching file: data/2006-06-01_893577.html
Searching file: data/2006-06-01_908440.html
Searching file: data/2006-06-02_732713.html
Searching file: data/2006-

Searching file: data/2006-12-07_731502.html
Searching file: data/2006-12-08_355199.html
Searching file: data/2006-12-08_835910.html
Searching file: data/2006-12-11_1056218.html
Searching file: data/2006-12-11_1325460.html
Searching file: data/2006-12-11_943820.html
Searching file: data/2006-12-13_1145124.html
Searching file: data/2006-12-15_1022329.html
Searching file: data/2006-12-15_1062441.html
Searching file: data/2006-12-15_1325281.html
Searching file: data/2006-12-19_1265131.html
Searching file: data/2006-12-20_1084242.html
Searching file: data/2006-12-20_310433.html
Searching file: data/2006-12-20_873538.html
Searching file: data/2006-12-21_1016152.html
Searching file: data/2006-12-21_745448.html
Searching file: data/2006-12-21_878556.html
Searching file: data/2006-12-22_1015856.html
Searching file: data/2006-12-22_1108271.html
Searching file: data/2006-12-22_1345968.html
Searching file: data/2006-12-22_858558.html
Searching file: data/2006-12-26_801873.html
Searching file: data

Searching file: data/2007-06-22_919869.html
Searching file: data/2007-06-25_1085653.html
Searching file: data/2007-06-26_1001718.html
Searching file: data/2007-06-26_845752.html
Searching file: data/2007-06-26_883587.html
Searching file: data/2007-06-26_883980.html
Searching file: data/2007-06-27_918958.html
Searching file: data/2007-06-28_1087843.txt
Searching file: data/2007-06-29_1295230.html
Searching file: data/2007-06-29_73952.html
Searching file: data/2007-07-02_18568.html
Searching file: data/2007-07-03_1071806.html
Searching file: data/2007-07-03_1289169.html
Searching file: data/2007-07-05_1101215.html
Searching file: data/2007-07-06_23259.html
Searching file: data/2007-07-09_1041652.html
Searching file: data/2007-07-09_883977.html
Searching file: data/2007-07-09_898660.html
Searching file: data/2007-07-09_901495.html
Searching file: data/2007-07-10_1057234.html
Searching file: data/2007-07-10_1325823.html
Searching file: data/2007-07-10_909413.html
Searching file: data/2007-

Searching file: data/2008-01-15_1056904.html
Searching file: data/2008-01-15_1166463.html
Searching file: data/2008-01-16_1174771.html
Searching file: data/2008-01-17_1109354.html
Searching file: data/2008-01-17_864268.html
Searching file: data/2008-01-18_1018074.html
Searching file: data/2008-01-23_1391390.html
Searching file: data/2008-01-23_816949.html
Searching file: data/2008-01-25_1357694.html
Searching file: data/2008-01-25_738076.html
Searching file: data/2008-01-28_721773.html
Searching file: data/2008-01-29_1341769.html
Searching file: data/2008-01-30_1172243.html
Searching file: data/2008-01-31_1336249.html
Searching file: data/2008-02-05_1016831.html
Searching file: data/2008-02-08_1010398.html
Searching file: data/2008-02-08_1112477.txt
Searching file: data/2008-02-11_1094831.html
Searching file: data/2008-02-11_1277998.html
Searching file: data/2008-02-12_795665.html
Searching file: data/2008-02-14_727008.html
Searching file: data/2008-02-15_1085392.txt
Searching file: da

Searching file: data/2008-12-05_895993.html
Searching file: data/2008-12-08_1025134.html
Searching file: data/2008-12-08_1050031.html
Searching file: data/2008-12-08_21212.html
Searching file: data/2008-12-10_1165460.html
Searching file: data/2008-12-11_1098074.html
Searching file: data/2008-12-15_927829.html
Searching file: data/2008-12-17_877902.html
Searching file: data/2008-12-22_1013706.html
Searching file: data/2008-12-22_811830.html
Searching file: data/2008-12-23_1017137.html
Searching file: data/2008-12-24_1086774.html
Searching file: data/2008-12-24_842640.html
Searching file: data/2009-01-05_1001193.html
Searching file: data/2009-01-12_879573.html
Searching file: data/2009-01-15_216324.html
Searching file: data/2009-01-16_1083712.html
Searching file: data/2009-01-16_1302176.html
Searching file: data/2009-01-28_1121225.html
Searching file: data/2009-01-28_1367660.html
Searching file: data/2009-02-03_889949.html
Searching file: data/2009-02-06_944136.html
Searching file: data/

Searching file: data/2010-03-12_1038459.txt
Searching file: data/2010-03-12_940181.html
Searching file: data/2010-03-15_1100969.html
Searching file: data/2010-03-18_845072.html
Searching file: data/2010-03-31_109261.html
Searching file: data/2010-03-31_353567.html
Searching file: data/2010-04-08_1436040.html
Searching file: data/2010-04-16_813634.txt
Searching file: data/2010-04-20_893965.html
Searching file: data/2010-04-21_1041858.html
Searching file: data/2010-04-26_13610.html
Searching file: data/2010-04-27_1042825.html
Searching file: data/2010-04-27_702644.html
Searching file: data/2010-04-28_1295172.html
Searching file: data/2010-04-28_924383.txt
Searching file: data/2010-04-29_1021162.html
Searching file: data/2010-04-29_1358697.html
Searching file: data/2010-04-30_1377720.html
Searching file: data/2010-04-30_66479.html
Searching file: data/2010-05-03_1288750.txt
Searching file: data/2010-05-07_1123871.html
Searching file: data/2010-05-07_1392179.html
Searching file: data/2010-

Searching file: data/2011-02-10_1050690.html
Searching file: data/2011-02-11_1140486.html
Searching file: data/2011-02-14_872821.html
Searching file: data/2011-02-17_34151.html
Searching file: data/2011-02-18_726845.html
Searching file: data/2011-02-23_709942.html
Searching file: data/2011-02-28_1283073.html
Searching file: data/2011-03-01_726845.html
Searching file: data/2011-03-09_1084876.html
Searching file: data/2011-03-09_746834.html
Searching file: data/2011-03-11_1041326.html
Searching file: data/2011-03-15_1313918.html
Searching file: data/2011-03-16_1069353.html
Searching file: data/2011-03-17_1082368.html
Searching file: data/2011-03-17_1082564.html
Searching file: data/2011-03-18_1172852.html
Searching file: data/2011-03-18_857957.html
Searching file: data/2011-03-23_1084750.html
Searching file: data/2011-03-25_1397533.html
Searching file: data/2011-04-01_1002531.html
Searching file: data/2011-04-01_1073149.html
Searching file: data/2011-04-01_1335190.html
Searching file: da

Searching file: data/2012-01-17_847384.html
Searching file: data/2012-01-17_859905.html
Searching file: data/2012-01-17_859906.html
Searching file: data/2012-01-17_859911.html
Searching file: data/2012-01-25_1088787.html
Searching file: data/2012-01-31_1066107.html
Searching file: data/2012-02-03_107681.html
Searching file: data/2012-02-06_1332896.html
Searching file: data/2012-02-07_42542.html
Searching file: data/2012-02-08_1282398.html
Searching file: data/2012-02-08_315293.html
Searching file: data/2012-02-16_805792.html
Searching file: data/2012-02-16_813658.html
Searching file: data/2012-02-21_859139.html
Searching file: data/2012-03-06_1134203.html
Searching file: data/2012-03-06_802356.html
Searching file: data/2012-03-09_85408.html
Searching file: data/2012-03-13_1156295.html
Searching file: data/2012-03-22_792013.html
Searching file: data/2012-03-23_1299704.html
Searching file: data/2012-03-23_885542.html
Searching file: data/2012-03-30_97854.html
Searching file: data/2012-04

Searching file: data/2013-03-15_1393744.html
Searching file: data/2013-03-15_1425597.html
Searching file: data/2013-03-18_1518749.html
Searching file: data/2013-03-21_1397821.html
Searching file: data/2013-03-21_1487999.html
Searching file: data/2013-03-22_1036478.html
Searching file: data/2013-03-25_912145.html
Searching file: data/2013-03-26_1042821.html
Searching file: data/2013-03-27_1336691.html
Searching file: data/2013-03-27_46640.html
Searching file: data/2013-03-29_1297067.html
Searching file: data/2013-04-01_351017.html
Searching file: data/2013-04-02_1277151.html
Searching file: data/2013-04-02_1277406.html
Searching file: data/2013-04-08_929994.html
Searching file: data/2013-04-09_350077.html
Searching file: data/2013-04-10_1078383.html
Searching file: data/2013-04-11_1419178.html
Searching file: data/2013-04-11_760326.html
Searching file: data/2013-04-17_20232.html
Searching file: data/2013-04-17_828678.html
Searching file: data/2013-04-18_891456.html
Searching file: data/

Searching file: data/2014-04-11_1413440.html
Searching file: data/2014-04-15_1029023.html
Searching file: data/2014-04-16_68145.html
Searching file: data/2014-04-21_1343719.html
Searching file: data/2014-04-21_750199.html
Searching file: data/2014-04-24_934538.html
Searching file: data/2014-05-01_109156.html
Searching file: data/2014-05-01_1271625.html
Searching file: data/2014-05-02_1398551.html
Searching file: data/2014-05-02_93631.html
Searching file: data/2014-05-06_1578845.html
Searching file: data/2014-05-06_38074.html
Searching file: data/2014-05-09_1310157.html
Searching file: data/2014-05-14_1143921.html
Searching file: data/2014-05-14_6951.html
Searching file: data/2014-05-15_1001871.html
Searching file: data/2014-05-20_1561680.html
Searching file: data/2014-05-21_730716.html
Searching file: data/2014-05-23_775345.html
Searching file: data/2014-05-27_892482.html
Searching file: data/2014-05-28_82473.html
Searching file: data/2014-05-30_97472.html
Searching file: data/2014-06-

Searching file: data/2015-03-20_1575988.html
Searching file: data/2015-03-24_1311596.html
Searching file: data/2015-03-24_1338613.html
Searching file: data/2015-03-25_869986.html
Searching file: data/2015-03-25_949156.html
Searching file: data/2015-03-26_354707.html
Searching file: data/2015-03-27_1588869.html
Searching file: data/2015-04-01_1070154.html
Searching file: data/2015-04-01_722256.html
Searching file: data/2015-04-03_1173752.html
Searching file: data/2015-04-08_1292556.html
Searching file: data/2015-04-08_1590584.html
Searching file: data/2015-04-10_1274057.html
Searching file: data/2015-04-10_1394159.html
Searching file: data/2015-04-16_921590.html
Searching file: data/2015-04-17_1111335.html
Searching file: data/2015-04-22_201461.html
Searching file: data/2015-04-24_1524471.html
Searching file: data/2015-04-27_854701.html
Searching file: data/2015-04-30_1076195.html
Searching file: data/2015-05-01_1256069.html
Searching file: data/2015-05-01_1493491.html
Searching file: d

Searching file: data/2016-01-15_1327471.html
Searching file: data/2016-01-19_54441.html
Searching file: data/2016-01-22_1548981.html
Searching file: data/2016-01-22_1622893.html
Searching file: data/2016-01-22_804212.html
Searching file: data/2016-01-25_1607716.html
Searching file: data/2016-01-27_1121793.html
Searching file: data/2016-01-28_1320947.html
Searching file: data/2016-02-01_1609132.html
Searching file: data/2016-02-04_1051741.html
Searching file: data/2016-02-04_1062438.html
Searching file: data/2016-02-04_1071264.html
Searching file: data/2016-02-05_1000180.html
Searching file: data/2016-02-16_356213.html
Searching file: data/2016-02-17_928421.html
Searching file: data/2016-02-18_316206.html
Searching file: data/2016-02-18_923284.html
Searching file: data/2016-02-19_843368.html
Searching file: data/2016-02-24_913077.html
Searching file: data/2016-02-26_872448.html
Searching file: data/2016-03-08_1486800.html
Searching file: data/2016-03-09_1001606.html
Searching file: data

Searching file: data/2016-10-31_889936.html
Searching file: data/2016-11-02_1308711.html
Searching file: data/2016-11-07_1507934.html
Searching file: data/2016-11-09_1609865.html
Searching file: data/2016-11-09_921768.html
Searching file: data/2016-11-10_1110783.html
Searching file: data/2016-11-10_1330622.html
Searching file: data/2016-11-14_1373835.html
Searching file: data/2016-11-14_1499268.html
Searching file: data/2016-11-16_1578776.html
Searching file: data/2016-11-18_1273801.html
Searching file: data/2016-11-18_1467076.html
Searching file: data/2016-11-18_1597503.html
Searching file: data/2016-11-23_1608298.html
Searching file: data/2016-11-23_1615063.html
Searching file: data/2016-11-23_43350.html
Searching file: data/2016-11-28_1622577.html
Searching file: data/2016-11-29_1056923.html
Searching file: data/2016-11-29_805993.html
Searching file: data/2016-12-05_1022652.html
Searching file: data/2016-12-08_1034054.html
Searching file: data/2016-12-09_1560385.html
Searching file:

Searching file: data/2017-08-31_1530804.html
Searching file: data/2017-09-01_1500836.html
Searching file: data/2017-09-05_357020.html
Searching file: data/2017-09-06_1027183.html
Searching file: data/2017-09-06_1599947.html
Searching file: data/2017-09-06_726513.html
Searching file: data/2017-09-11_1476264.html
Searching file: data/2017-09-11_1610793.html
Searching file: data/2017-09-12_1507277.html
Searching file: data/2017-09-12_764038.html
Searching file: data/2017-09-12_914374.html
Searching file: data/2017-09-13_1408100.html
Searching file: data/2017-09-14_1545654.html
Searching file: data/2017-09-15_1123270.html
Searching file: data/2017-09-15_1143155.html
Searching file: data/2017-09-18_1611988.html
Searching file: data/2017-09-20_1017793.html
Searching file: data/2017-09-22_1105472.html
Searching file: data/2017-09-22_1333170.html
Searching file: data/2017-09-22_1469822.html
Searching file: data/2017-09-28_1581908.html
Searching file: data/2017-10-02_104918.html
Searching file:

Searching file: data/2018-06-27_1297587.html
Searching file: data/2018-06-27_1496048.html
Searching file: data/2018-06-27_881890.html
Searching file: data/2018-06-28_1001039.html
Searching file: data/2018-06-28_1308161.html
Searching file: data/2018-07-02_1698990.html
Searching file: data/2018-07-05_1653558.html
Searching file: data/2018-07-11_1170991.html
Searching file: data/2018-07-11_1404079.html
Searching file: data/2018-07-12_1483096.html
Searching file: data/2018-07-16_1096385.html
Searching file: data/2018-07-16_1532063.html
Searching file: data/2018-07-16_701221.html
Searching file: data/2018-07-18_1478726.html
Searching file: data/2018-07-23_1657197.html
Searching file: data/2018-07-24_109380.html
Searching file: data/2018-07-24_1434620.html
Searching file: data/2018-07-25_1497275.html
Searching file: data/2018-07-25_1617898.html
Searching file: data/2018-07-26_1676479.html
Searching file: data/2018-07-27_1036848.html
Searching file: data/2018-07-27_1431766.html
Searching fil

In [12]:
# Flatten the per-year comparable-method search results (cmpstat) into one row
# per filing.
# Columns:
#   year, month - sliced out of the file path
#   filename    - the path used as the key in cmpstat
#   cmp_freq    - number of keys in that filing's result dict
#                 (presumably one key per matched keyword; confirm in text_search)
#   cmp_hit     - True when the result dict is non-empty
cols = ['year','month','filename', 'cmp_freq','cmp_hit']

# Collect every row first and build the DataFrame once: concatenating inside
# the loop re-copies all earlier rows on each pass, and seeding the result with
# an empty frame degrades the numeric/bool columns to object dtype.
records = []
for y in range(1995,2019):
    yearly_results = cmpstat["year{0}_cmp".format(y)]
    for path, found in yearly_results.items():
        n_found = len(found)
        # Paths look like "data/YYYY-MM-DD_<id>.<ext>", so characters 5-8 hold
        # the year and 10-11 the month. The slices rely on the 5-character
        # "data/" prefix.
        records.append([path[5:9], path[10:12], path, n_found, n_found > 0])

cmpstatsummary = pd.DataFrame(records, columns=cols)

In [13]:
# Renumber the rows 0..n-1. drop=True discards the previous index instead of
# keeping it as an extra column. The bare name on the last line displays the frame.
cmpstatsummary = cmpstatsummary.reset_index(drop=True)
cmpstatsummary

Unnamed: 0,year,month,filename,cmp_freq,cmp_hit
0,1995,02,data/1995-02-10_60026.txt,3,True
1,1995,02,data/1995-02-10_897599.txt,3,True
2,1995,03,data/1995-03-13_63506.txt,4,True
3,1995,03,data/1995-03-16_106015.txt,3,True
4,1995,03,data/1995-03-16_864328.txt,3,True
...,...,...,...,...,...
5028,2018,12,data/2018-12-21_1131096.html,5,True
5029,2018,12,data/2018-12-21_1490349.html,20,True
5030,2018,12,data/2018-12-21_1611110.html,20,True
5031,2018,12,data/2018-12-21_1644963.html,1,True


In [14]:
# Display the DCF summary again; nothing is recomputed or modified in this cell.
dcfstatsummary

Unnamed: 0,year,month,filename,dcf_freq,dcf_hit
0,1995,02,data/1995-02-10_60026.txt,4,True
1,1995,02,data/1995-02-10_897599.txt,4,True
2,1995,03,data/1995-03-13_63506.txt,3,True
3,1995,03,data/1995-03-16_106015.txt,4,True
4,1995,03,data/1995-03-16_864328.txt,4,True
...,...,...,...,...,...
5028,2018,12,data/2018-12-21_1131096.html,9,True
5029,2018,12,data/2018-12-21_1490349.html,13,True
5030,2018,12,data/2018-12-21_1611110.html,12,True
5031,2018,12,data/2018-12-21_1644963.html,6,True


In [15]:
# combine dcfstatsummary and cmpstatsummary into one row per filing
# Join on the columns that identify a filing rather than on row position, so a
# filing's comps counts cannot end up attached to a different filing if the two
# frames ever differ in order, length or index.
key_cols = ['year', 'month', 'filename']
diff_cols = cmpstatsummary[key_cols + ['cmp_freq','cmp_hit']]
statsummary = pd.merge(
    dcfstatsummary, diff_cols,
    on=key_cols, how='inner',
    validate='one_to_one',  # raises MergeError if a filing appears twice on either side
)
# An inner join silently drops filings present in only one frame; fail loudly
# instead of exporting a truncated summary.
assert len(statsummary) == len(dcfstatsummary) == len(cmpstatsummary), \
    "dcfstatsummary and cmpstatsummary do not cover the same filings"
statsummary

Unnamed: 0,year,month,filename,dcf_freq,dcf_hit,cmp_freq,cmp_hit
0,1995,02,data/1995-02-10_60026.txt,4,True,3,True
1,1995,02,data/1995-02-10_897599.txt,4,True,3,True
2,1995,03,data/1995-03-13_63506.txt,3,True,4,True
3,1995,03,data/1995-03-16_106015.txt,4,True,3,True
4,1995,03,data/1995-03-16_864328.txt,4,True,3,True
...,...,...,...,...,...,...,...
5028,2018,12,data/2018-12-21_1131096.html,9,True,5,True
5029,2018,12,data/2018-12-21_1490349.html,13,True,20,True
5030,2018,12,data/2018-12-21_1611110.html,12,True,20,True
5031,2018,12,data/2018-12-21_1644963.html,6,True,1,True


In [16]:
# export statsummary to csv
# The file is written to 'statsummary.csv' relative to the current working
# directory. to_csv's index argument is left at its pandas default, so the row
# index is written as the first, unnamed column.
statsummary.to_csv('statsummary.csv')