## Packages

In [1]:
import os
import glob
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from time import time
#os.chdir('C:\\Users\\A Sua\\Documents\\FIU\\CAP2020')
os.chdir('C:\\Users\\Daniel\\Python and R Notebooks\\CAP2020\\CAP2020')

## Define Functions

In [2]:
# change to directory of raw 765 quarterly files
folder = "C:\\Users\\Daniel\\Python and R Notebooks\\CAP2020\\CAP2020\\data\\raw\\stock_quarterly_data"
data_subdir = "data\\interim\\compiled_quarterly_2.csv"
final_data = "data\\processed\\compiled_quarterly_final.csv"

In [3]:
#print computation time script
def timing(f):
    def wrap(*args):
        time1 = time()
        ret = f(*args)
        time2 = time()
        print('{:s} function took {:.3f} ms'.format(f.__name__, (time2-time1)*1000.0))

        return ret
    return wrap

## Import Data

In [4]:
data = []
for file in Path(folder).glob('*.csv'):
        df = pd.read_csv(file)
        stock_name = os.path.basename(file)
        stock_name_noext = stock_name.replace("_quarterly_financial_data.csv", "")
        df['Symbol'] = stock_name_noext
        data.append(df)

In [5]:
bigframe = pd.concat(data)
print('initial shape {}'.format(bigframe.shape))

initial shape (62253, 42)


## Begin Cleaning

### Drop Duplicates

In [6]:
bigframe = bigframe.drop_duplicates()
print(print('new shape: {}'.format(bigframe.shape)))

new shape: (62253, 42)
None


### Replace 'None" with NaN

In [7]:
bigframe.replace('None', np.nan, inplace=True)

### Clean ticker names

### ABI_0 acquired by LIFE in Q4 2008, LIFE merged into TMO in Q4 2013
### CCE becomes CCE_1 smoothly in Q3 2010, becomes CCEP after Q2 2016 (no data)
### DG_1 is just DG (Dollar general)
### GAS(Nicor) acquired by GAS_0(AGL resources) in Q4 2011; GAS_0 acquired by SO in Q2 2016
### GCI_0 (Tegna) split from GCI in Q2 2015
### GGP_0 becomes GGP in 2010 (Q1-Q3 data not available), then GGP acquired by BPY in Q3 2018 (no BPY data)
### GM_0 becomes GM in Q3 2009 (Q2 2009 is skipped)
### LEH_0 acquired by BCS and NMH in Q3 2008 (both no data)
### MNST_0 should be MWW, acquired by Ranstad Holding in Q4 2016 (no data)
### PLD acquired by PLD_0 in Q2 2011
### Q acquired by CTL in Q1 2011
### Q_1 should be IQV
### WM_0 dissolved in Q3 2008

In [8]:
# replace ABI_0 with ABI
bigframe.replace({'Stock':r'^ABI_.$'}, {'Stock' : 'ABI'}, inplace = True, regex=True)

# replace CCE_1 with CCE
bigframe.replace({'Stock':r'^CCE_.$'}, {'Stock' : 'CCE'}, inplace = True, regex=True)

# DG_1 with DG
bigframe.replace({'Stock':r'^DG_.$'}, {'Stock' : 'DG'}, inplace = True, regex=True)

# replace gas_0 with GAS_new
bigframe.replace({'Stock':r'^GAS_.$'}, {'Stock' : 'GAS_new'}, inplace = True, regex=True)

# replace GAS with GAS_old
bigframe.replace({'Stock':r'^GAS$'}, {'Stock' : 'GAS_old'}, inplace = True, regex=True)

# replace GCI_0 with TEGNA
bigframe.replace({'Stock':r'^GCI_.$'}, {'Stock' : 'TEGNA'}, inplace = True, regex=True)

#replace GGP_0 with GGP
bigframe.replace({'Stock':r'^GGP_.$'}, {'Stock' : 'GGP'}, inplace = True, regex=True)

#replace GM_0 with GM
bigframe.replace({'Stock':r'^GM_.$'}, {'Stock' : 'GM'}, inplace = True, regex=True)

#replace LEH_0 with LEH
bigframe.replace({'Stock':r'^LEH_.$'}, {'Stock' : 'LEH'}, inplace = True, regex=True)

#replace MNST_0 with MWW
bigframe.replace({'Stock':r'^MNST_.$'}, {'Stock' : 'MWW'}, inplace = True, regex=True)

# replace Q_1 with IQV
bigframe.replace({'Stock':r'^Q_.$'}, {'Stock' : 'IQV'}, inplace = True, regex=True)

# replace WM_0 with WM
bigframe.replace({'Stock':r'^WM_.$'}, {'Stock' : 'WM'}, inplace = True, regex=True)

#replace PLD with PLD_old
bigframe.replace({'Stock':r'^PLD$'}, {'Stock' : 'PLD_old'}, inplace = True, regex=True)

#replace PLD_0 with PLD_new
bigframe.replace({'Stock':r'^PLD_.$'}, {'Stock' : 'PLD_new'}, inplace = True, regex=True)

In [9]:
bigframe.to_csv(data_subdir)

### Transform Columns into Proper Datatypes

#### P/E Ratio - The ratio of Price to EPS diluted TTM as of the previous quarter.
#### EPS basic - Basic earnings per share for a given quarter
#### EPS diluted - Diluted earnings per share.
#### Dividend per share - Common stock dividends paid during a quarter per share, including all regular and special dividends and distributions to common shareholders.
#### Price - The medium price per share of the company common stock during a given quarter. The prices are as reported, and are not adjusted for subsequent dividends.
#### Long-term debt to equity ratio - The ratio of Long-term debt to common shareholders' equity (Shareholders equity minus Preferred equity).
#### Dividend payout ratio - The ratio of Dividends TTM to Earnings (available to common stockholders) TTM.
#### Shares split adjusted - The number of shares the company had at the end of a given quarter, adjusted for splits to be comparable to today's shares.

In [10]:
bigframe['P/E ratio'] = bigframe['P/E ratio'].astype(float)
bigframe['EPS basic'] = bigframe['EPS basic'].astype(float)
bigframe['EPS diluted'] = bigframe['EPS diluted'].astype(float)
bigframe['Dividend per share'] = bigframe['Dividend per share'].astype(float)
bigframe['Price'] = bigframe['Price'].astype(float)
bigframe['Long-term debt to equity ratio'] = bigframe['Long-term debt to equity ratio'].astype(float)
bigframe['Dividend payout ratio'] = bigframe['Dividend payout ratio'].astype(float)
bigframe['Shares split adjusted'] = bigframe['Shares split adjusted'].astype(float)

#### Quarter end

In [11]:
bigframe['Quarter end'] = pd.to_datetime(bigframe['Quarter end'], errors='ignore')
bigframe.head()

Unnamed: 0,Quarter end,Shares,Shares split adjusted,Split factor,Assets,Current Assets,Liabilities,Current Liabilities,Shareholders equity,Non-controlling interest,...,P/E ratio,Cumulative dividends per share,Dividend payout ratio,Long-term debt to equity ratio,Equity to assets ratio,Net margin,Asset turnover,Free cash flow per share,Current ratio,Symbol
0,2019-09-30,67151778,67151778.0,1.0,3219620000,,1351965000,,1867655000,0,...,21.69,1.04,0.0488,0.1859,0.5801,0.0509,1.26,1.32,,AAN
1,2019-06-30,67547497,67547497.0,1.0,3180198000,,1330640000,,1849558000,0,...,19.94,1.01,0.0464,0.188,0.5816,0.052,1.31,0.84,,AAN
2,2019-03-31,67677449,67677449.0,1.0,3237966000,,1424458000,,1813508000,0,...,17.49,0.97,0.0459,0.2251,0.5601,0.0515,1.37,2.09,,AAN
3,2018-12-31,67202919,67202919.0,1.0,2826692000,,1065984000,,1760708000,0,...,10.83,0.94,0.0455,0.1937,0.6229,0.0512,1.41,-0.43,,AAN
4,2018-09-30,68594784,68594784.0,1.0,2698898000,,935392000,,1763506000,0,...,12.0,0.9,0.0267,0.1686,0.6534,0.0839,1.39,1.13,,AAN


#### Symbol

In [12]:
bigframe['Symbol'] = bigframe['Symbol'].astype(str)

In [13]:
finalframe = bigframe[['Quarter end', 'Shares split adjusted', 'Price', 'EPS basic', 'EPS diluted', 'Dividend per share', 
                       'Dividend payout ratio', 'P/E ratio', 'Long-term debt to equity ratio', 'Symbol']]
finalframe.head()

Unnamed: 0,Quarter end,Shares split adjusted,Price,EPS basic,EPS diluted,Dividend per share,Dividend payout ratio,P/E ratio,Long-term debt to equity ratio,Symbol
0,2019-09-30,67151778.0,63.99,0.59,0.58,0.035,0.0488,21.69,0.1859,AAN
1,2019-06-30,67547497.0,57.23,0.63,0.62,0.035,0.0464,19.94,0.188,AAN
2,2019-03-31,67677449.0,48.63,0.83,0.82,0.035,0.0459,17.49,0.2251,AAN
3,2018-12-31,67202919.0,47.0,0.9,0.89,0.04,0.0455,10.83,0.1937,AAN
4,2018-09-30,68594784.0,48.86,0.64,0.62,0.03,0.0267,12.0,0.1686,AAN


### Convert Dates to Quarters

In [14]:
#finalframe = pd.read_csv(final_data)
#finalframe.drop(columns = ['Unnamed: 0'], inplace = True)
finalframe.columns

Index(['Quarter end', 'Shares split adjusted', 'Price', 'EPS basic',
       'EPS diluted', 'Dividend per share', 'Dividend payout ratio',
       'P/E ratio', 'Long-term debt to equity ratio', 'Symbol'],
      dtype='object')

In [15]:
finalframe['Quarter'] = pd.PeriodIndex(finalframe['Quarter end'], freq = 'Q')
finalframe.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Quarter end,Shares split adjusted,Price,EPS basic,EPS diluted,Dividend per share,Dividend payout ratio,P/E ratio,Long-term debt to equity ratio,Symbol,Quarter
0,2019-09-30,67151778.0,63.99,0.59,0.58,0.035,0.0488,21.69,0.1859,AAN,2019Q3
1,2019-06-30,67547497.0,57.23,0.63,0.62,0.035,0.0464,19.94,0.188,AAN,2019Q2
2,2019-03-31,67677449.0,48.63,0.83,0.82,0.035,0.0459,17.49,0.2251,AAN,2019Q1
3,2018-12-31,67202919.0,47.0,0.9,0.89,0.04,0.0455,10.83,0.1937,AAN,2018Q4
4,2018-09-30,68594784.0,48.86,0.64,0.62,0.03,0.0267,12.0,0.1686,AAN,2018Q3


## Compile and Export

In [16]:
finalframe.to_csv(final_data)