In [33]:
# Standard libraries
import pandas as pd
import numpy as np
from scipy import stats
# Visualization
import matplotlib.pyplot as plt
import datetime
import os 

# os.system("pip install wrds") # TODO: Probably put this in utils.py
import wrds
# os.system("pip install pandas-datareader")
import pandas_datareader.data as web

# os.system("pip install seaborn")
import seaborn as sns

# Note we don't actually need pandas_datareader. Could import/use YahooFinanceAPI which gives same info

<b>TODO LIST:</b>
1. ~~Parse audit analytics data for year and month of date of disclosure (possibly include date of breach) - Might have to select a couple of firms that we want to analyze?~~
    * ~~Use year, month, and ticker to find stock price of company for that month (in WorldScope database)~~
    * ~~Find stock price 1 to 12 months from that date (WorldScope). Calculate gain/loss in stock price from original. Potentially even take stock price the month before the breach?~~
2. Perform statistical tests + data visualization/analysis. Some examples:
    * Regression on multiple variables to predict stock price gain/loss over x months 
    * Scatter plot over time (ex: 2000-2021) of stock price gains/losses
    * Scatter plot over time after date of disclosure/breach of stock price gains/losses
    * Use Regression to see if type of breach matters for stock price
    * Regression to predict records lost?
    * Create decision tree to see type of breach

<b>Problems to maybe deal with:</b> <br>
Should we combine the Audit Analytics dataset with another one (e.g. PRC)? Gordon said the PRC data is more error-prone than Audit Analytics, which is the more trusted source. If we do combine them, make sure no breach is counted twice.

<h2>Data Collection and Processing</h2> 

In [2]:
# Open a WRDS connection. This prompts for a WRDS username and password
# (see the recorded output below); `db` is later used to run raw SQL queries.
db = wrds.Connection()

Enter your WRDS username [gabri]:gnaval
Enter your password:········
WRDS recommends setting up a .pgpass file.
You can find more info here:
https://www.postgresql.org/docs/9.5/static/libpq-pgpass.html.
Loading library list...
Done


In [66]:
# PRC dataset
PRC_df = pd.read_csv("../data/prc.csv")

# Data cleaning: discard the columns at positions 13-15, then keep only the
# rows that report a record count.
PRC_df = PRC_df.drop(columns=PRC_df.columns[[13, 14, 15]])
# .copy() detaches the filtered rows from the original frame, so the column
# assignment further down writes to a real frame instead of a slice
# (avoids pandas' SettingWithCopyWarning).
PRC_df = PRC_df[PRC_df['Total Records'].notna()].copy()

def records_to_int(record_str):
    """
    Converts a record count such as "1,234" into an int.

    Thousands separators are removed before parsing. Values that arrive as
    floats (e.g. 1000.0 when pandas parsed the column as numeric) are accepted
    as long as they are whole numbers.

    Parameters:
    record_str : str, int or float
        Raw value of a 'Total Records' cell

    Returns:
    int
        The record count

    Raises:
    ValueError
        If the value cannot be parsed or is not a whole number
    """
    text = str(record_str).replace(",", "").strip()
    try:
        return int(text)
    except ValueError:
        # Not a plain integer literal, e.g. "1000.0" coming from a float cell
        number = float(text)
        if not number.is_integer():
            raise ValueError("Record count is not a whole number: %r" % (record_str,))
        return int(number)

# Parse every record count into an integer
record_counts = PRC_df['Total Records'].map(records_to_int)
PRC_df['Total Records'] = record_counts

# Keep only breaches with a non-zero record count
# (maybe toggle this off if we later want to fill in the missing data)
has_records = PRC_df['Total Records'].ne(0)
PRC_df = PRC_df[has_records]

# Audit Analytics Dataset
# The context manager closes the workbook's file handle once both sheets are read
with pd.ExcelFile('../data/audit_analytics.xlsx') as xls:
    aa_records_df = pd.read_excel(xls, sheet_name='PublicCyber')
    aa_ddos_df = pd.read_excel(xls, sheet_name='DDoS')

# Get rid of rows with no tickers (these might be non-publicly traded)
aa_records_df = aa_records_df[aa_records_df['Ticker'].notna()].reset_index(drop=True)
table_columns = ['Company name', 'Ticker', 'Date of Breach', 'Date Became Aware of Breach', 'Date of Disclosure',
                 'Number of Records', 'Type of Info', 'Information', 'Attack']
# Keep only the columns of interest that are actually present in the sheet
aa_records_df = aa_records_df[aa_records_df.columns.intersection(table_columns)]
display(aa_records_df)

# Ticker - Don't think we need this
ticker_df = pd.read_csv("../data/ticker.csv")

Unnamed: 0,Company name,Ticker,Date of Breach,Date Became Aware of Breach,Date of Disclosure,Number of Records,Type of Info,Information,Attack
0,HUMANA INC,HUM,10/12/2020 - 12/16/2020,2020-12-22 00:00:00,2021-02-25,,Personal,SSN | Name | DoB | Address | Phone Number | Em...,Unauthorized Access
1,HERBALIFE NUTRITION LTD.,HLF,2020-10-01 00:00:00,,2021-02-23,,Personal | Other,Name | Phone Number | Address | DOB | Membersh...,Unauthorized Access
2,"Ultra Clean Holdings, Inc.",UCTT,,,2021-02-23,,Personal,SSN | Driver's License,ND
3,KROGER CO,KR,,2021-01-23 00:00:00,2021-02-19,,Personal,Name | Email | Phone Number | Address | DOB | ...,Unauthorized Access
4,CINTAS CORP,CTAS,9/16/2020 - 9/20/2020,2020-09-20 00:00:00,2021-02-15,,Financial | Personal,Name | Bank Account,Unauthorized Access
...,...,...,...,...,...,...,...,...,...
732,COSTCO WHOLESALE CORP /NEW,COST,6/19/2014 - 7/15/2015,,2015-09-23,815,Financial,Credit Card | Name | Address | Phone Number | ...,ND
733,RITE AID CORP,RAD,8/20/2014 - 7/14/2015,2015-08-17 00:00:00,2015-09-22,,Financial,Credit Card | Name | Address | Phone Number | ...,ND
734,CVS HEALTH Corp,CVS,6/19/2014 - 7/14/2015,2015-08-13 00:00:00,2015-09-11,,Financial,Credit Card | Name | Address | Phone Number | ...,ND
735,WEYERHAEUSER CO,WY,May-2014 - 1/29/2015,2015-01-29 00:00:00,2015-03-17,11000000,Personal,Name | DoB | Address | Email | SSN | Other,Phishing


In [4]:
# Calendar date (no time-of-day component) at the moment this cell is executed
today = datetime.datetime.today().date()

def nearest(items, pivot):
    """
    Gets closest day in a set (used to obtain stock price y months after disclosure)

    Distances are compared as full time differences rather than whole-day counts:
    a negative timedelta rounds its .days towards minus infinity (-12 hours has
    .days == -1), which would make an earlier date look farther away than it is.
    On ties the first of the closest items is returned.

    Parameters:
    items : iterable of datetime-like
        Candidate dates (e.g. the DatetimeIndex of a price dataframe)
    pivot : datetime-like
        Date to get close to

    Raises:
    ValueError
        If items is empty
    """
    return min(items, key=lambda candidate: abs(candidate - pivot))

def stock_after_disclosure(row, num_months):
    """
    Returns a list containing the monthly closing stock price of a firm after the date of disclosure
    (0 - num_months months after disclosure, so num_months + 1 entries).
    Month 0 is priced one day before the disclosure date.
    As soon as a month's target date is today or later, that month and all following months are np.nan.
    If the price download or lookup fails for any reason (e.g. ticker unknown to the data source),
    the error is printed and a list of np.nan's is returned.

    Parameters:
    row : Dataframe row
        Input dataframe's row (used along with df.apply); must provide 'Date of Disclosure' and 'Ticker'
    num_months : int
        Month limit

    Returns:
    list of length num_months + 1
    """
    start = pd.to_datetime(row['Date of Disclosure'])
    end = start + pd.DateOffset(months = num_months)
    # Don't know if i should include this, check stock day before disclosure to control for large stock dip when breach is disclosed
    start -= datetime.timedelta(days=1)
    # Evaluated on every call so that a long-lived kernel never compares against a stale date
    current_day = datetime.datetime.today().date()
    try:
        df = web.DataReader(row['Ticker'], 'yahoo', start, end)
        lst = []
        for month in range(0, num_months + 1):
            target = start + pd.DateOffset(months = month)
            date = nearest(df.index, target)
            # The target date itself decides whether a month lies in the future: the nearest
            # available quote is always a past (or current) date, so testing only that quote
            # would silently reuse the last known price for months that have not happened yet.
            # A quote dated today is still treated as unavailable.
            if current_day <= target.date() or current_day <= date.date():
                lst.extend([np.nan] * (num_months + 1 - month))
                break
            lst.append(df.loc[date]["Close"])
        return lst
    except Exception as e:
        # Best effort: report the failing ticker and keep going with an all-NaN row
        print("Error at %s" % row['Ticker'])
        print(repr(e))
        return [np.nan] * (num_months + 1)

In [67]:
# Number of months after the date of disclosure (DoD) to track -- toggle this value
months_after = 12
# One column label per monthly offset, from 0 up to and including months_after
col = ["Stock Price (%s months DoD)" % offset for offset in range(months_after + 1)]
# Accumulator for the per-breach price lists
lst = []

In [6]:
# Build one list of monthly stock prices per breach, in the row order of aa_records_df.
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot leave duplicated or stale rows behind.
lst = [
    stock_after_disclosure(row, months_after)
    for _, row in aa_records_df.iterrows()
]

0: HUM
1: HLF
2: UCTT
3: KR
4: CTAS
5: BKEP
6: SJI
7: GSAT
8: WRK
9: CVNA
10: INDB
11: FWRD
12: TSLA
13: INTC
14: PRTS
15: USM
16: JNJ
17: WOOF
18: NWLI
19: TD
20: WMT
21: INTC
22: CLAR
23: BNTX
24: PFE
25: UPS
26: NSC
27: SFT
28: UI
29: MIME
30: VBTX
31: ACB
32: MATX
33: TMUS
34: WHR
35: CFX
36: VMW
37: NVDA
38: INTC
39: CSCO
40: MSFT
41: MD
42: CLH
43: SWI
44: SPOT
45: FEYE
46: IIVI
47: BRK.A
Error at BRK.A
KeyError('Date')
48: MTX
49: BDC
50: CAJ
51: MANU
52: SPLP
53: VIAC
54: SEAC
55: FWRD
56: SEGR
Error at SEGR
KeyError('Date')
57: GEO
58: SITE
59: HD
60: UCTT
61: RDY
62: BRID
63: PFE
64: TMX
65: TMUS
66: DXPE
67: UHS
68: SBGI
69: UNM
70: UHS
71: VOXX
72: CLUBQ
73: TYL
74: RDNT
75: VCNX
76: EQIX
77: FLNT
78: WMG
79: AMPH
80: RCM
81: MAR
82: CCL
83: BF.B
Error at BF.B
KeyError('Date')
84: SRCL
85: INTC
86: CAJ
87: DVAX
88: GRMN
89: BLKB
90: TWTR
91: FORM
92: SPLP
93: TWTR
94: ULH
95: MXL
96: HMC
97: CNDT
98: CZZ
99: IEP
100: GNW
101: MGLN
102: KR
103: LOV
104: GDDY
105: CHGG
106: P

In [7]:
# Turn the per-breach price lists into a dataframe (one column per month offset)
# and attach it column-wise to the breach records; rows are matched on the index
# and only indices present in both frames are kept.
stock_prices = pd.DataFrame(data=lst, columns=col)
stock_price_aa_records = pd.concat(
    objs=[aa_records_df, stock_prices],
    axis="columns",
    join="inner",
)
display(stock_price_aa_records)

Unnamed: 0,Company name,Related party,Target relationship to Parent,CIK,Ticker,Market,IRS Number,Parent CIK,Parent Name,Bus Street 1,...,Stock Price (3 months DoD),Stock Price (4 months DoD),Stock Price (5 months DoD),Stock Price (6 months DoD),Stock Price (7 months DoD),Stock Price (8 months DoD),Stock Price (9 months DoD),Stock Price (10 months DoD),Stock Price (11 months DoD),Stock Price (12 months DoD)
0,HUMANA INC,HUMANA INC,Parent,49071,HUM,NYSE,61-0647538,,,500 W MAIN ST,...,,,,,,,,,,
1,HERBALIFE NUTRITION LTD.,Herbalife Nutrition,Parent,1180262,HLF,NYSE,98-0377871,,,P.O. BOX 309GT,...,,,,,,,,,,
2,"Ultra Clean Holdings, Inc.","Ultra Clean Holdings, Inc.",Parent,1275014,UCTT,Nasdaq Global Market,61-1430858,,,150 INDEPENDENCE DRIVE,...,,,,,,,,,,
3,KROGER CO,Kroger Pharmacy,Parent,56873,KR,NYSE,31-0345740,,,1014 VINE ST,...,,,,,,,,,,
4,CINTAS CORP,Cintas Corporation,Parent,723254,CTAS,Nasdaq Global Market,31-1188630,,,6800 CINTAS BLVD,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,COSTCO WHOLESALE CORP /NEW,Costco Photo Center; PNI Digital Media,Third-party,909832,COST,Nasdaq Global Market,91-1223280,,,999 LAKE DRIVE,...,161.229996,153.690002,150.059998,152.229996,149.919998,142.009995,158.119995,167.470001,167.940002,153.149994
733,RITE AID CORP,Rite Aid Pharmacy; PNI Digital Media,Third-party,84129,RAD,NYSE,23-1614034,,,30 HUNTER LANE,...,157.399994,154.199997,159.000000,159.800003,160.800003,153.199997,154.800003,137.199997,154.600006,162.000000
734,CVS HEALTH Corp,CVSphoto.com; PNI Digital Media,Third-party,64803,CVS,NYSE,05-0494040,,,ONE CVS DR.,...,94.739998,94.919998,93.110001,99.339996,100.769997,106.099998,96.669998,96.839996,97.110001,91.519997
735,WEYERHAEUSER CO,Premera Blue Cross,Third-party,106535,WY,NYSE,91-0470860,,,220 OCCIDENTAL AVENUE SOUTH,...,32.959999,30.959999,31.080000,28.450001,29.420000,30.139999,30.870001,25.820000,23.770000,29.450001


In [15]:
def analyst_stock_price(row):
    """
    Returns the median and mean of analyst stock price forecasts for a firm, where the forecasts
    were announced within a month after the date of disclosure of the breach.
    These forecasts predict the stock price 12 months into the future.

    Parameters
        row - Dataframe row
        Input dataframe's row (used along with df.apply); must provide 'Date of Disclosure' and 'Ticker'
    Returns
        List of length 2. [median, mean]
        Both entries are np.nan if the date or ticker is missing, or if no forecast was found.
    """
    date = pd.to_datetime(row['Date of Disclosure'])
    ticker = row['Ticker']

    # Without a usable date or ticker the query cannot match anything (and a
    # missing date would be sent to the database as an invalid date literal)
    if pd.isna(date) or pd.isna(ticker):
        return [np.nan] * 2

    # Values are bound as query parameters instead of being pasted into the SQL
    # text, so a ticker containing a quote cannot break (or alter) the statement
    sql_query = """
    SELECT VALUE as stock_price
    FROM ibes.ptgdet
    WHERE OFTIC = %(ticker)s AND CAST(HORIZON as int) = 12 AND ANNDATS BETWEEN %(start)s AND %(end)s
    """
    params = {
        'ticker': str(ticker),
        'start': str(date),
        'end': str(date + pd.DateOffset(months = 1)),
    }

    df = db.raw_sql(sql_query, params=params)

    if len(df.index) == 0:
        return [np.nan] * 2
    return [df['stock_price'].median(), df['stock_price'].mean()]
    

In [21]:
# Query the analyst forecasts ([median, mean]) for every breach, in row order
forecast_columns = ['median stock forecast', 'mean stock forecast']
lst = [analyst_stock_price(row) for _, row in stock_price_aa_records.iterrows()]

# Merge the forecasts with the stock price dataframe. Forecast columns left over
# from a previous run of this cell are dropped first, so re-running the cell
# replaces them instead of adding duplicate columns.
median_mean_df = pd.DataFrame(lst, columns = forecast_columns)
stock_price_aa_records = pd.concat(
    [stock_price_aa_records.drop(columns = forecast_columns, errors = 'ignore'), median_mean_df],
    axis=1, join='inner')
display(stock_price_aa_records)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Company name,Related party,Target relationship to Parent,CIK,Ticker,Market,IRS Number,Parent CIK,...,Stock Price (5 months DoD),Stock Price (6 months DoD),Stock Price (7 months DoD),Stock Price (8 months DoD),Stock Price (9 months DoD),Stock Price (10 months DoD),Stock Price (11 months DoD),Stock Price (12 months DoD),median stock forecast,mean stock forecast
0,0,0,HUMANA INC,HUMANA INC,Parent,49071,HUM,NYSE,61-0647538,,...,,,,,,,,,,
1,1,1,HERBALIFE NUTRITION LTD.,Herbalife Nutrition,Parent,1180262,HLF,NYSE,98-0377871,,...,,,,,,,,,,
2,2,2,"Ultra Clean Holdings, Inc.","Ultra Clean Holdings, Inc.",Parent,1275014,UCTT,Nasdaq Global Market,61-1430858,,...,,,,,,,,,,
3,3,3,KROGER CO,Kroger Pharmacy,Parent,56873,KR,NYSE,31-0345740,,...,,,,,,,,,,
4,4,4,CINTAS CORP,Cintas Corporation,Parent,723254,CTAS,Nasdaq Global Market,31-1188630,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,732,732,COSTCO WHOLESALE CORP /NEW,Costco Photo Center; PNI Digital Media,Third-party,909832,COST,Nasdaq Global Market,91-1223280,,...,150.059998,152.229996,149.919998,142.009995,158.119995,167.470001,167.940002,153.149994,157.5,158.416667
733,733,733,RITE AID CORP,Rite Aid Pharmacy; PNI Digital Media,Third-party,84129,RAD,NYSE,23-1614034,,...,159.000000,159.800003,160.800003,153.199997,154.800003,137.199997,154.600006,162.000000,170.0,170.000000
734,734,734,CVS HEALTH Corp,CVSphoto.com; PNI Digital Media,Third-party,64803,CVS,NYSE,05-0494040,,...,93.110001,99.339996,100.769997,106.099998,96.669998,96.839996,97.110001,91.519997,113.5,113.500000
735,735,735,WEYERHAEUSER CO,Premera Blue Cross,Third-party,106535,WY,NYSE,91-0470860,,...,31.080000,28.450001,29.420000,30.139999,30.870001,25.820000,23.770000,29.450001,38.0,38.250000


In [77]:
# Store new dataframe as csv for easy load-in/testing
# (index = False: the row index is not written out as an extra column)
stock_price_aa_records.to_csv("../data/stock_price_aa_records.csv", index = False)

In [88]:
#### CURRENT WORK FRAME
# Load in dataframe from the csv path written by the cell above
# (presumably so the analysis below can start here without repeating the
# price and forecast downloads -- confirm)
stock_price_aa_records = pd.read_csv("../data/stock_price_aa_records.csv")

In [89]:
# Cleaning and Dummy encoding
def _with_dummies(frame, column):
    """
    Returns a new frame in which `column` is replaced by one 0/1 indicator column per
    "|"-separated value found in it, each named "<value> (<column>)".
    """
    indicators = frame[column].str.get_dummies(sep="|").add_suffix(" (%s)" % column)
    # axis / columns are passed by keyword: the positional forms drop(label, 1) and
    # concat(objs, 1) are deprecated since pandas 1.3 and rejected by pandas 2.0
    return pd.concat([frame.drop(columns=column), indicators], axis=1)

# Normalise the separators so that get_dummies sees clean "|"-delimited values
# (regex=False: both patterns are literal text)
stock_price_aa_records['Type of Info'] = stock_price_aa_records['Type of Info'].str.replace(" ", "", regex=False)
stock_price_aa_records['Attack'] = stock_price_aa_records['Attack'].str.replace("; ", "|", regex=False)
stock_price_aa_records = _with_dummies(stock_price_aa_records, 'Type of Info')
stock_price_aa_records = _with_dummies(stock_price_aa_records, 'Attack')

Unnamed: 0,Company name,Ticker,Date of Breach,Date Became Aware of Breach,Date of Disclosure,Number of Records,Information,Stock Price (0 months DoD),Stock Price (1 months DoD),Stock Price (2 months DoD),Stock Price (3 months DoD),Stock Price (4 months DoD),Stock Price (5 months DoD),Stock Price (6 months DoD),Stock Price (7 months DoD),Stock Price (8 months DoD),Stock Price (9 months DoD),Stock Price (10 months DoD),Stock Price (11 months DoD),Stock Price (12 months DoD),median stock forecast,mean stock forecast,Financial (Type of Info),Intrusion (Type of Info),ND (Type of Info),Other (Type of Info),Personal (Type of Info),Exploit (Attack),Malware (Attack),Misconfiguration (Attack),ND (Attack),Phishing (Attack),Ransomware (Attack),SQLi (Attack),Spoofing (Attack),Unauthorized Access (Attack),Virus (Attack)
0,HUMANA INC,HUM,10/12/2020 - 12/16/2020,2020-12-22 00:00:00,2021-02-25,,SSN | Name | DoB | Address | Phone Number | Em...,381.399994,414.660004,,,,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,HERBALIFE NUTRITION LTD.,HLF,2020-10-01 00:00:00,,2021-02-23,,Name | Phone Number | Address | DOB | Membersh...,45.279999,46.299999,,,,,,,,,,,,,,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0
2,"Ultra Clean Holdings, Inc.",UCTT,,,2021-02-23,,SSN | Driver's License,48.369999,55.090000,,,,,,,,,,,,,,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0
3,KROGER CO,KR,,2021-01-23 00:00:00,2021-02-19,,Name | Email | Phone Number | Address | DOB | ...,33.840000,34.689999,,,,,,,,,,,,,,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
4,CINTAS CORP,CTAS,9/16/2020 - 9/20/2020,2020-09-20 00:00:00,2021-02-15,,Name | Bank Account,345.100006,351.600006,,,,,,,,,,,,,,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,COSTCO WHOLESALE CORP /NEW,COST,6/19/2014 - 7/15/2015,,2015-09-23,815,Credit Card | Name | Address | Phone Number | ...,142.649994,158.149994,163.449997,161.229996,153.690002,150.059998,152.229996,149.919998,142.009995,158.119995,167.470001,167.940002,153.149994,157.5,158.416667,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
733,RITE AID CORP,RAD,8/20/2014 - 7/14/2015,2015-08-17 00:00:00,2015-09-22,,Credit Card | Name | Address | Phone Number | ...,146.600006,124.400002,154.199997,157.399994,154.199997,159.000000,159.800003,160.800003,153.199997,154.800003,137.199997,154.600006,162.000000,170.0,170.000000,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
734,CVS HEALTH Corp,CVS,6/19/2014 - 7/14/2015,2015-08-13 00:00:00,2015-09-11,,Credit Card | Name | Address | Phone Number | ...,100.800003,102.000000,97.839996,94.739998,94.919998,93.110001,99.339996,100.769997,106.099998,96.669998,96.839996,97.110001,91.519997,113.5,113.500000,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
735,WEYERHAEUSER CO,WY,May-2014 - 1/29/2015,2015-01-29 00:00:00,2015-03-17,11000000,Name | DoB | Address | Email | SSN | Other,33.680000,32.150002,32.340000,32.959999,30.959999,31.080000,28.450001,29.420000,30.139999,30.870001,25.820000,23.770000,29.450001,38.0,38.250000,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0


In [None]:
"""
It turns out there are A LOT of different types of information released (around ~70) 
and this will probably be correlated to the type of info, so let's just not include this for now
"""
# stock_price_aa_records['Information'] = stock_price_aa_records['Information'].str.replace(" ", "")
# stock_price_aa_records = pd.concat([stock_price_aa_records.drop('Information', 1), stock_price_aa_records['Information'].str.get_dummies(sep="|").add_suffix(" (Information)")], 1)

In [90]:
# Use "ND" (Not Disclosed) as the left-out category of both the 'Attack' and the
# 'Type of Info' encodings, to avoid the dummy variable trap
# [https://www.wikiwand.com/en/Dummy_variable_(statistics)]
not_disclosed_columns = ['ND (Type of Info)', 'ND (Attack)']
stock_price_aa_records = stock_price_aa_records.drop(columns = not_disclosed_columns)
display(stock_price_aa_records)

Unnamed: 0,Company name,Ticker,Date of Breach,Date Became Aware of Breach,Date of Disclosure,Number of Records,Information,Stock Price (0 months DoD),Stock Price (1 months DoD),Stock Price (2 months DoD),Stock Price (3 months DoD),Stock Price (4 months DoD),Stock Price (5 months DoD),Stock Price (6 months DoD),Stock Price (7 months DoD),Stock Price (8 months DoD),Stock Price (9 months DoD),Stock Price (10 months DoD),Stock Price (11 months DoD),Stock Price (12 months DoD),median stock forecast,mean stock forecast,Financial (Type of Info),Intrusion (Type of Info),Other (Type of Info),Personal (Type of Info),Exploit (Attack),Malware (Attack),Misconfiguration (Attack),Phishing (Attack),Ransomware (Attack),SQLi (Attack),Spoofing (Attack),Unauthorized Access (Attack),Virus (Attack)
0,HUMANA INC,HUM,10/12/2020 - 12/16/2020,2020-12-22 00:00:00,2021-02-25,,SSN | Name | DoB | Address | Phone Number | Em...,381.399994,414.660004,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,1,0
1,HERBALIFE NUTRITION LTD.,HLF,2020-10-01 00:00:00,,2021-02-23,,Name | Phone Number | Address | DOB | Membersh...,45.279999,46.299999,,,,,,,,,,,,,,0,0,1,1,0,0,0,0,0,0,0,1,0
2,"Ultra Clean Holdings, Inc.",UCTT,,,2021-02-23,,SSN | Driver's License,48.369999,55.090000,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,0,0
3,KROGER CO,KR,,2021-01-23 00:00:00,2021-02-19,,Name | Email | Phone Number | Address | DOB | ...,33.840000,34.689999,,,,,,,,,,,,,,0,0,0,1,0,0,0,0,0,0,0,1,0
4,CINTAS CORP,CTAS,9/16/2020 - 9/20/2020,2020-09-20 00:00:00,2021-02-15,,Name | Bank Account,345.100006,351.600006,,,,,,,,,,,,,,1,0,0,1,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
732,COSTCO WHOLESALE CORP /NEW,COST,6/19/2014 - 7/15/2015,,2015-09-23,815,Credit Card | Name | Address | Phone Number | ...,142.649994,158.149994,163.449997,161.229996,153.690002,150.059998,152.229996,149.919998,142.009995,158.119995,167.470001,167.940002,153.149994,157.5,158.416667,1,0,0,0,0,0,0,0,0,0,0,0,0
733,RITE AID CORP,RAD,8/20/2014 - 7/14/2015,2015-08-17 00:00:00,2015-09-22,,Credit Card | Name | Address | Phone Number | ...,146.600006,124.400002,154.199997,157.399994,154.199997,159.000000,159.800003,160.800003,153.199997,154.800003,137.199997,154.600006,162.000000,170.0,170.000000,1,0,0,0,0,0,0,0,0,0,0,0,0
734,CVS HEALTH Corp,CVS,6/19/2014 - 7/14/2015,2015-08-13 00:00:00,2015-09-11,,Credit Card | Name | Address | Phone Number | ...,100.800003,102.000000,97.839996,94.739998,94.919998,93.110001,99.339996,100.769997,106.099998,96.669998,96.839996,97.110001,91.519997,113.5,113.500000,1,0,0,0,0,0,0,0,0,0,0,0,0
735,WEYERHAEUSER CO,WY,May-2014 - 1/29/2015,2015-01-29 00:00:00,2015-03-17,11000000,Name | DoB | Address | Email | SSN | Other,33.680000,32.150002,32.340000,32.959999,30.959999,31.080000,28.450001,29.420000,30.139999,30.870001,25.820000,23.770000,29.450001,38.0,38.250000,0,0,0,1,0,0,0,1,0,0,0,0,0


In [44]:
# TODO: Calculate difference between actual and predicted stock price (usually 12 months into the future). 
# Need to figure out how to standardize this -> maybe divide difference by actual: (actual - predicted)/actual. 

# figure out how to do multiple linear regression

In [None]:
# Relative change of every monthly price versus the month-0 price,
# (price_n - price_0) / price_0, stored under the integer month offset n
baseline = stock_price_aa_records[col[0]]
stock_prices = pd.DataFrame({
    offset: (stock_price_aa_records[label] - baseline) / baseline
    for offset, label in enumerate(col[1:], start=1)
})

# Long format (one row per breach and month offset), without the missing prices
long_changes = pd.melt(stock_prices).dropna()
boxplot = sns.boxplot(x="variable", y="value", data=long_changes)
# Each box shows the change of month x relative to the month-0 price
boxplot.set(xlabel="Months after Disclosure", ylabel='Percent Stock Price Change')
plt.show()

In [None]:
# It appears that, on average, stock price goes up
# (mean of the relative price change, one value per month-offset column of stock_prices)
display(stock_prices.mean())

In [None]:
# Let's use median because there are some outliers. Same trend seems to occur
# (median of the relative price change, one value per month-offset column of stock_prices)
display(stock_prices.median())

In [None]:
# 5% trimmed mean per month offset: cut 5% of the observations from each tail
# before averaging, to get rid of some outliers
TRIM_FRACTION = 0.05
for month in range(1, months_after + 1):
    observed = stock_prices[month].dropna()
    trimmed = stats.trim_mean(observed, TRIM_FRACTION)
    print("%s: %s" % (month, trimmed))

In [None]:
# USE ALPHA VANTAGE to get missing entries?
# web.DataReader("BRK.A", "av-daily", start=datetime.datetime(2017, 2, 9), end=datetime.datetime(2017, 5, 24), api_key="")