In [1]:
import pandas as pd
# Load the daily price history (df1) and the per-ticker company metadata (df2).
# Both paths are relative, so the notebook must be run from its own directory.
df1 = pd.read_csv('../../dataset/historical_stock_prices.csv')
df2 = pd.read_csv('../../dataset/historical_stocks.csv')
df1.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14


### Read both datasets

In [2]:
# Preview the metadata frame: one row per ticker with exchange, name, sector and industry.
df2.head()

Unnamed: 0,ticker,exchange,name,sector,industry
0,PIH,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
1,PIHPP,NASDAQ,"1347 PROPERTY INSURANCE HOLDINGS, INC.",FINANCE,PROPERTY-CASUALTY INSURERS
2,TURN,NASDAQ,180 DEGREE CAPITAL CORP.,FINANCE,FINANCE/INVESTORS SERVICES
3,FLWS,NASDAQ,"1-800 FLOWERS.COM, INC.",CONSUMER SERVICES,OTHER SPECIALTY STORES
4,FCCY,NASDAQ,1ST CONSTITUTION BANCORP (NJ),FINANCE,SAVINGS INSTITUTIONS


In [3]:
# Row/column count of the price history; the row count is compared with the merged frame later.
df1.shape

(20973889, 8)

In [4]:
# Row/column count of the ticker metadata.
df2.shape

(6460, 5)

### Merge the datasets

In [5]:
# Attach company metadata to every price row. pd.merge defaults to an inner join,
# so price rows whose ticker is absent from df2 would be dropped silently.
df = pd.merge(df1, df2, on = 'ticker')

In [6]:
# Sanity check: the row count should equal df1's if each price row matched exactly one metadata row.
df.shape

(20973889, 12)

In [7]:
# Preview the merged frame (price columns followed by exchange, name, sector, industry).
df.head()

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date,exchange,name,sector,industry
0,AHH,11.5,11.58,8.493155,11.25,11.68,4633900,2013-05-08,NYSE,"ARMADA HOFFLER PROPERTIES, INC.",FINANCE,REAL ESTATE
1,AHH,11.66,11.55,8.471151,11.5,11.66,275800,2013-05-09,NYSE,"ARMADA HOFFLER PROPERTIES, INC.",FINANCE,REAL ESTATE
2,AHH,11.55,11.6,8.507822,11.5,11.6,277100,2013-05-10,NYSE,"ARMADA HOFFLER PROPERTIES, INC.",FINANCE,REAL ESTATE
3,AHH,11.63,11.65,8.544494,11.55,11.65,147400,2013-05-13,NYSE,"ARMADA HOFFLER PROPERTIES, INC.",FINANCE,REAL ESTATE
4,AHH,11.6,11.53,8.456484,11.5,11.6,184100,2013-05-14,NYSE,"ARMADA HOFFLER PROPERTIES, INC.",FINANCE,REAL ESTATE


### Get the percent change in closing price for a company in a given year

In [8]:
def getPercentChangeForCompanyName(dataFrame, companyName, year):
    """Return the fractional change in closing price of a company over one year.

    For every ticker whose ``name`` equals ``companyName``, the earliest and the
    latest ``close`` within ``year`` are taken. The per-ticker values are summed,
    and the relative difference between the two sums is returned.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns ``name``, ``ticker``, ``date`` and ``close``.
        ``date`` is compared by its string form, which must start with the year
        (e.g. ``'2016-01-04'``). The frame is not modified.
    companyName : str
        Exact value to match in the ``name`` column.
    year : str or int
        Year to select, e.g. ``'2016'`` or ``2016``.

    Returns
    -------
    float
        ``(final - starting) / starting`` as a fraction (0.26 means +26 %).
        NaN when no rows match or when the starting price sum is zero.
    """
    # Filter first and keep only the needed columns: this avoids deep-copying
    # the whole frame and avoids touching global pandas options.
    companyRows = dataFrame.loc[dataFrame['name'] == companyName, ['ticker', 'date', 'close']]

    # astype(str) lets both string dates and datetime dates be matched by prefix.
    inSelectedYear = companyRows['date'].astype(str).str.startswith(str(year))
    yearRows = companyRows[inSelectedYear].sort_values('date')
    if yearRows.empty:
        return float('nan')

    # Rows are date-sorted, so first()/last() give each ticker's opening and
    # closing observation of the year (skipping missing close values).
    closeByTicker = yearRows.groupby('ticker')['close']
    startingClosePrice = closeByTicker.first().sum()
    finalClosePrice = closeByTicker.last().sum()
    if startingClosePrice == 0:
        return float('nan')

    return (finalClosePrice - startingClosePrice) / startingClosePrice


In [9]:
# Call the function under the name it is defined with in this notebook; the
# result is a fraction, so scale it to a whole-number percentage for display.
percentChange = getPercentChangeForCompanyName(df, 'NEW YORK MORTGAGE TRUST, INC.', '2016')
int(round(percentChange*100))

26

In [10]:
# Call the function under the name it is defined with in this notebook; the
# result is a fraction, so scale it to a whole-number percentage for display.
percentChange = getPercentChangeForCompanyName(df, 'NEW YORK MORTGAGE TRUST, INC.', '2017')
int(round(percentChange*100))

4

In [11]:
# Call the function under the name it is defined with in this notebook; the
# result is a fraction, so scale it to a whole-number percentage for display.
percentChange = getPercentChangeForCompanyName(df, 'NEW YORK MORTGAGE TRUST, INC.', '2018')
int(round(percentChange*100))

-1

### Check whether any ticker is listed on more than one exchange

In [12]:
# Number of distinct exchanges per ticker; any value above 1 would mean the
# same ticker symbol appears on several exchanges.
groupedDf = df.groupby("ticker")["exchange"].nunique()
groupedDf.head()

ticker
A       1
AA      1
AABA    1
AAC     1
AAL     1
Name: exchange, dtype: int64

In [13]:
# Selecting a single column before aggregating yields a Series indexed by ticker.
type(groupedDf)

pandas.core.series.Series

In [15]:
# Summary statistics of the per-ticker exchange counts; a max of 1 means no
# ticker is listed on more than one exchange.
# NOTE(review): the saved output for this cell is a single number, which is not
# what describe() prints — it looks stale; re-run the notebook top to bottom.
groupedDf.describe()

1.0

### Check if there are tickers which have more than one sector

In [16]:
# Number of distinct sectors per ticker. nunique() ignores missing values by
# default, so a ticker without any sector gets a count of 0.
# Note: this rebinds groupedDf, which previously held the exchange counts.
groupedDf = df.groupby("ticker")["sector"].nunique()
groupedDf.head()

ticker
A       1
AA      1
AABA    1
AAC     1
AAL     1
Name: sector, dtype: int64

In [17]:
# A mean below 1 shows that some tickers have a sector count of 0.
groupedDf.mean()

0.838698328935796

In [18]:
# Inspect one ticker with no sector value. Only the first rows are shown so the
# cell does not render the ticker's entire price history.
df[df['ticker'] == 'AAXJ'].head(10)

Unnamed: 0,ticker,open,close,adj_close,low,high,volume,date,exchange,name,sector,industry
5528311,AAXJ,50.130001,50.130001,42.221111,50.130001,50.130001,100,2008-08-15,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528312,AAXJ,49.770000,49.000000,41.269379,49.000000,49.770000,25600,2008-08-18,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528313,AAXJ,48.340000,48.340000,40.713501,48.340000,48.340000,100,2008-08-19,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528314,AAXJ,49.049999,49.020000,41.286217,49.020000,49.049999,4000,2008-08-20,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528315,AAXJ,49.250000,49.220001,41.454670,49.160000,49.250000,3600,2008-08-22,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528316,AAXJ,49.189999,48.299999,40.679825,48.299999,49.189999,17000,2008-08-25,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528317,AAXJ,48.919998,48.759998,41.067245,48.549999,48.919998,5000,2008-08-26,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528318,AAXJ,49.770000,49.650002,41.816837,49.650002,49.770000,1500,2008-08-27,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528319,AAXJ,49.610001,49.669998,41.833675,49.610001,49.669998,6000,2008-08-28,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,
5528320,AAXJ,49.389999,49.330002,41.547314,49.330002,49.389999,400,2008-08-29,NASDAQ,ISHARES MSCI ALL COUNTRY ASIA EX JAPAN INDEX FUND,,


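So there are tickers (ticker symbols) that don't have a sector. Because `nunique()` ignores missing values, these tickers are counted with 0 sectors, which is why the mean above is below 1.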

In [19]:
# min = 0 confirms tickers without a sector; max = 1 means no ticker has several sectors.
groupedDf.describe()

count    5685.000000
mean        0.838698
std         0.367841
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: sector, dtype: float64

Also, each ticker symbol has at most one corresponding sector

### Check whether any company (company name) has more than one sector

In [20]:
# Same check keyed by company name instead of ticker: one company name can map
# to several tickers, so it may span more than one sector.
# Note: this rebinds groupedDf again (it is now indexed by name).
groupedDf = df.groupby("name")["sector"].nunique()
groupedDf.head()

name
1-800 FLOWERS.COM, INC.                   1
1347 PROPERTY INSURANCE HOLDINGS, INC.    1
180 DEGREE CAPITAL CORP.                  1
1ST CONSTITUTION BANCORP (NJ)             1
1ST SOURCE CORPORATION                    1
Name: sector, dtype: int64

In [21]:
# A max above 1 means at least one company name is associated with several sectors.
groupedDf.describe()

count    5376.000000
mean        0.837426
std         0.370018
min         0.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         2.000000
Name: sector, dtype: float64

In [22]:
# idxmax() returns only the first label that reaches the maximum; other names
# with the same count are not reported by this call.
groupedDf.idxmax()

'ENERGIZER HOLDINGS, INC.'

In [25]:
# List the (name, sector) pairs present for this company to see which sectors it spans.
list(df[df['name'] == 'ENERGIZER HOLDINGS, INC.'].groupby(['name', 'sector']).groups.keys())

[('ENERGIZER HOLDINGS, INC.', 'CONSUMER NON-DURABLES'),
 ('ENERGIZER HOLDINGS, INC.', 'MISCELLANEOUS')]

In [26]:
# All company names associated with more than one sector. Using "> 1" rather
# than "== 2" keeps the check valid if a company ever spans three or more sectors.
groupedDf[groupedDf > 1]

name
ENERGIZER HOLDINGS, INC.         2
NEW YORK MORTGAGE TRUST, INC.    2
Name: sector, dtype: int64

In [28]:
# List the (name, sector) pairs present for the second multi-sector company.
list(df[df['name'] == 'NEW YORK MORTGAGE TRUST, INC.'].groupby(['name', 'sector']).groups.keys())

[('NEW YORK MORTGAGE TRUST, INC.', 'CONSUMER SERVICES'),
 ('NEW YORK MORTGAGE TRUST, INC.', 'FINANCE')]

### Get the percent change per sector in a given year

In [52]:
def getPercentChangeForSector(dataFrame, sector, year):
    """Return the fractional change in closing price of a sector over one year.

    For every ticker whose ``sector`` equals ``sector``, the earliest and the
    latest ``close`` within ``year`` are taken. The per-ticker values are summed,
    and the relative difference between the two sums is returned.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Must contain the columns ``sector``, ``ticker``, ``date`` and ``close``.
        ``date`` is compared by its string form, which must start with the year
        (e.g. ``'2004-01-02'``). The frame is not modified.
    sector : str
        Exact value to match in the ``sector`` column.
    year : str or int
        Year to select, e.g. ``'2004'`` or ``2004``.

    Returns
    -------
    float
        ``(final - starting) / starting`` as a fraction (0.10 means +10 %).
        NaN when no rows match or when the starting price sum is zero.
    """
    # Filter first and keep only the needed columns: this avoids deep-copying
    # the whole frame and avoids touching global pandas options.
    sectorRows = dataFrame.loc[dataFrame['sector'] == sector, ['ticker', 'date', 'close']]

    # astype(str) lets both string dates and datetime dates be matched by prefix.
    inSelectedYear = sectorRows['date'].astype(str).str.startswith(str(year))
    yearRows = sectorRows[inSelectedYear].sort_values('date')
    if yearRows.empty:
        return float('nan')

    # Rows are date-sorted, so first()/last() give each ticker's opening and
    # closing observation of the year (skipping missing close values).
    closeByTicker = yearRows.groupby('ticker')['close']
    startingClosePrice = closeByTicker.first().sum()
    finalClosePrice = closeByTicker.last().sum()
    if startingClosePrice == 0:
        return float('nan')

    return (finalClosePrice - startingClosePrice) / startingClosePrice


In [53]:
# The function returns a fraction, not a percentage; multiply by 100 to read it as percent.
getPercentChangeForSector(df, 'BASIC INDUSTRIES', '2004')

0.228150473062799

In [54]:
# Same sector in a later year, for comparison (result is a fraction).
getPercentChangeForSector(df, 'BASIC INDUSTRIES', '2013')

0.10322636195480321

In [56]:
# A different sector, MISCELLANEOUS, in 2018 (result is a fraction).
getPercentChangeForSector(df, 'MISCELLANEOUS', '2018')

0.10398556393167135

In [57]:
# MISCELLANEOUS again, in 2004 (result is a fraction).
getPercentChangeForSector(df, 'MISCELLANEOUS', '2004')

0.1409865124005556