# Cleaning File
Get the basic company information here.

In [1]:
import pickle
import pandas as pd
import matplotlib
import os
import re

import collections
import datetime
import time

import geopandas as gpd

import numpy as np

from difflib import get_close_matches

from fuzzywuzzy import process
from fuzzywuzzy import fuzz

# Stock price data

In [None]:
# Read the raw CRSP extract into `raw`; the 'unicode_escape' codec is handed
# to read_csv to decode the file.
file = "../../data/crsp_raw.csv"
raw = pd.read_csv(filepath_or_buffer=file, encoding="unicode_escape")

In [None]:
# Preview the first rows of the freshly loaded frame.
raw.head()

Let's follow Evan's CRSP cleaning and linking process.

1. Drop firms not traded on NYSE or NASDAQ.

In [None]:
# Rows whose primary-exchange code is 'Q' or 'N' are kept; every other row
# (including rows with a missing code) is removed from `raw` in place.
keep_mask = raw['PRIMEXCH'].isin(['Q', 'N'])
other_exch = raw.index[~keep_mask.to_numpy()]
raw.drop(other_exch, inplace=True)
raw.shape

2. Drop entries with missing return or volume information.

In [None]:
# Remove rows that lack volume, return, or date, then fix the column dtypes.
raw.dropna(subset=['VOL', 'RET', 'date'], inplace=True)
raw['VOL'] = raw['VOL'].astype('int')

# RET can hold non-numeric marker codes. The original cell removed only the
# 'C' code, so any other code made the float cast raise ValueError. Coerce
# every non-numeric entry to NaN and drop those rows, which covers 'C' and
# any other code alike.
raw['RET'] = pd.to_numeric(raw['RET'], errors='coerce').astype('float')
raw.dropna(subset=['RET'], inplace=True)

# Dates stay as integers here; they are parsed with '%Y%m%d' further down.
raw['date'] = raw['date'].astype('int')

3. Drop tickers with average daily trading volume less than 100K shares.

In [None]:
# Mean daily volume per ticker (one row per TICKER, single column 'VOL').
adv = raw.groupby('TICKER')[['VOL']].mean()

# Tickers whose average is not at least 100,000 shares.
low_vol = adv[~(adv['VOL'] >= 100000)]

# Remove every observation belonging to those tickers, in place.
in_low_vol = raw['TICKER'].isin(low_vol.index)
low_vol_index = raw.index[in_low_vol.to_numpy()]
raw.drop(low_vol_index, inplace=True)
raw.shape

4. Drop tickers observed on fewer than 20% of the distinct trading dates in the sample.

In [None]:
# Number of distinct dates left in the sample.
ndays = raw['date'].nunique(dropna=False)

# Observation count per ticker, and the tickers falling below 20% of `ndays`.
obs = raw['TICKER'].value_counts()
min_obs = 0.2 * ndays
rare_obs = obs[obs < min_obs]

# Remove every observation belonging to those tickers, in place.
is_rare = raw['TICKER'].isin(rare_obs.index)
rare_index = raw.index[is_rare.to_numpy()]
raw.drop(rare_index, inplace=True)
raw.shape

In [None]:
# Collapse fully identical rows, keeping the first occurrence of each.
raw = raw.drop_duplicates(keep='first')

In [None]:
# Show the (rows, columns) of `raw` at this point in the notebook.
raw.shape

Merge in GVKEY information for linking with compustat.

`gv_cik` may come in handy for another merge later, but to get to Compustat it looks like we just need the gvkey–PERMCO link.

Some of the keys seem to be used twice. Let's see if they're re-used when a company has gone out of business.

In [None]:
# Load the link table, reading `cik` as a string.
# (A CIK-gvkey table, "data/cik_gvkey.csv", used to be loaded here as `gv_cik`;
# it is not needed for the PERMCO link built below.)
gv_permco = pd.read_csv("../../data/cik_permco.csv", dtype={'cik': str})

# Link end dates coded 'E' get a placeholder date so the whole column parses
# with one format. NOTE(review): 'E' presumably marks a still-open link — confirm.
is_coded_e = gv_permco['LINKENDDT'] == 'E'
gv_permco.loc[is_coded_e, 'LINKENDDT'] = '20210101'

# Despite its name, `existsCurrently` stores the calendar year of the link end date.
link_end = pd.to_datetime(gv_permco['LINKENDDT'].astype(str), format='%Y%m%d')
gv_permco['existsCurrently'] = link_end.dt.year

# Keep only links whose end year is later than 2010.
gv_permco = gv_permco[gv_permco['existsCurrently'] > 2010]
print(gv_permco.shape)
print(gv_permco.head())

# Reduce to the distinct (gvkey, PERMCO, conm) combinations used for the merge.
link_cols = ['gvkey', 'LPERMCO', 'conm']
gv_permco = gv_permco[link_cols].drop_duplicates().rename(columns={'LPERMCO': 'PERMCO'})

In [None]:
# List the company name on every row whose PERMCO already appeared earlier in the table.
repeated_permco = gv_permco['PERMCO'].duplicated()
for company in gv_permco.loc[repeated_permco, 'conm']:
    print(company)

Drop the duplicated entries. We may want to revisit this later on.

In [None]:
# Discard every row (first occurrence included) whose PERMCO appears more than once,
# so each remaining PERMCO maps to exactly one row.
has_repeated_permco = gv_permco['PERMCO'].duplicated(keep=False)
gv_permco = gv_permco[~has_repeated_permco]

Now just merge the raw return information with this company information so we can link everything.

In [None]:
# Left join: every row of `raw` is kept, and link columns are attached where PERMCO matches.
rawMergeable = raw.merge(gv_permco, how='left', on='PERMCO')
rawMergeable.shape

In [None]:
# Preview the first rows of `raw`, the left-hand input of the merge.
raw.head()

In [None]:
# RETX is not carried into the output frame.
rawMergeable = rawMergeable.drop(columns=['RETX'])

# Parse the YYYYMMDD dates into timestamps and derive the calendar year.
date_text = rawMergeable['date'].astype(str)
rawMergeable['date'] = pd.to_datetime(date_text, format='%Y%m%d')
rawMergeable['year'] = rawMergeable['date'].dt.year
rawMergeable.head()

In [None]:
# Persist the merged frame as a pickle for downstream notebooks.
outfile = '../../data/stockReturns.pkl'
with open(outfile, mode='wb') as sink:
    pickle.dump(rawMergeable, sink)

----------------------------------------------

In [None]:
# Number of observations per calendar year in the merged frame.
rawMergeable['year'].value_counts()

In [None]:
# The original cell read from `crsp`, which is never defined in this notebook
# and raised NameError on a fresh kernel. `rawMergeable` is the only frame built
# above that carries a `year` column, so select from it instead.
# NOTE(review): confirm `rawMergeable` is the frame that was intended here.
crsp2010s = rawMergeable[['date', 'year', 'PERMNO', 'COMNAM', 'PRC']]

In [None]:
# Count the rows where RET equals RETX.
# The original cell read from `crsp`, which is never defined in this notebook,
# and RETX has already been dropped from `rawMergeable`; `raw` still holds both
# columns. RET was cast to float earlier, so RETX is coerced to numeric before
# comparing (non-numeric entries become NaN and never compare equal).
# NOTE(review): confirm `raw` is the frame that was intended here.
(raw['RET'] == pd.to_numeric(raw['RETX'], errors='coerce')).sum()