# Preppin' Data 2023 Week 8

source: https://preppindata.blogspot.com/2023/02/2023-week-8-taking-stock.html

#### Load data

In [1]:
import glob
import pandas as pd

In [2]:
# Read every monthly CSV in Input/ and stack them into one frame.
# Each row is tagged with the path of the file it came from, because the
# month is only recorded in the file name (e.g. MOCK_DATA-10.csv).
# sorted() makes the row order independent of the directory listing order,
# and '*.csv' only matches files that really carry the .csv extension.
files = sorted(glob.glob('Input/*.csv'))
dfs = [pd.read_csv(file).assign(filename=file) for file in files]
df = pd.concat(dfs, ignore_index=True)

In [3]:
# check values
df

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,filename
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,$78924.65,Input\MOCK_DATA-10.csv
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,$89818.72,Input\MOCK_DATA-10.csv
2,3,Mignon,Blenkinsop,CPTAG,,NASDAQ,Capitala Finance Corp.,,$53160.64,Input\MOCK_DATA-10.csv
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,$23636.92,Input\MOCK_DATA-10.csv
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,$65979.23,Input\MOCK_DATA-10.csv
...,...,...,...,...,...,...,...,...,...,...
11995,996,Christel,Sarrell,CLNS^A.CL,,NYSE,"Colony NorthStar, Inc.",,$60683.35,Input\MOCK_DATA.csv
11996,997,Rivi,Rame,EMCB,,NASDAQ,WisdomTree Emerging Markets Corporate Bond Fund,$50.49M,$55605.12,Input\MOCK_DATA.csv
11997,998,Doti,Facer,SXT,Basic Industries,NYSE,Sensient Technologies Corporation,$3.62B,$63432.98,Input\MOCK_DATA.csv
11998,999,Dorothy,Janauschek,KRNT,Capital Goods,NASDAQ,Kornit Digital Ltd.,$643.74M,$97859.28,Input\MOCK_DATA.csv


#### Create date

In [4]:
# the month number follows the '-' in the file name: take the piece after
# the first '-' (e.g. '10.csv'); the extension is still attached here
name_parts = df['filename'].str.split('-', expand=True)
df['File Date'] = name_parts[1]

In [5]:
# strip the extension: keep only the text before the first '.'
# (e.g. '10.csv' -> '10')
month_parts = df['File Date'].str.split('.', expand=True)
df['File Date'] = month_parts[0]

In [6]:
# The January extract (MOCK_DATA.csv) has no '-<month>' suffix, so its month
# is still null after the two splits; fill those rows with 1 (January).
# Testing for the null itself, instead of comparing 'filename' with a
# hard-coded Windows path, avoids an invalid backslash escape in the string
# literal and keeps the cell working when glob returns 'Input/MOCK_DATA.csv'.
df.loc[df['File Date'].isna(), 'File Date'] = 1

In [7]:
# the column only holds the month number at this point, so name it accordingly
df = df.rename({'File Date': 'Month'}, axis='columns')

In [8]:
df

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,filename,Month
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,$78924.65,Input\MOCK_DATA-10.csv,10
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,$89818.72,Input\MOCK_DATA-10.csv,10
2,3,Mignon,Blenkinsop,CPTAG,,NASDAQ,Capitala Finance Corp.,,$53160.64,Input\MOCK_DATA-10.csv,10
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,$23636.92,Input\MOCK_DATA-10.csv,10
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,$65979.23,Input\MOCK_DATA-10.csv,10
...,...,...,...,...,...,...,...,...,...,...,...
11995,996,Christel,Sarrell,CLNS^A.CL,,NYSE,"Colony NorthStar, Inc.",,$60683.35,Input\MOCK_DATA.csv,1
11996,997,Rivi,Rame,EMCB,,NASDAQ,WisdomTree Emerging Markets Corporate Bond Fund,$50.49M,$55605.12,Input\MOCK_DATA.csv,1
11997,998,Doti,Facer,SXT,Basic Industries,NYSE,Sensient Technologies Corporation,$3.62B,$63432.98,Input\MOCK_DATA.csv,1
11998,999,Dorothy,Janauschek,KRNT,Capital Goods,NASDAQ,Kornit Digital Ltd.,$643.74M,$97859.28,Input\MOCK_DATA.csv,1


In [9]:
# assemble a first-of-the-month date in 2023 from the month number
date_parts = {'year': '2023', 'month': df['Month'], 'day': '01'}
df['File Date'] = pd.to_datetime(date_parts)

In [10]:
# 'filename' and 'Month' were only needed to build 'File Date'
df = df.drop(columns=['filename', 'Month'])

In [11]:
df

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,File Date
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,$78924.65,2023-10-01
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,$89818.72,2023-10-01
2,3,Mignon,Blenkinsop,CPTAG,,NASDAQ,Capitala Finance Corp.,,$53160.64,2023-10-01
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,$23636.92,2023-10-01
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,$65979.23,2023-10-01
...,...,...,...,...,...,...,...,...,...,...
11995,996,Christel,Sarrell,CLNS^A.CL,,NYSE,"Colony NorthStar, Inc.",,$60683.35,2023-01-01
11996,997,Rivi,Rame,EMCB,,NASDAQ,WisdomTree Emerging Markets Corporate Bond Fund,$50.49M,$55605.12,2023-01-01
11997,998,Doti,Facer,SXT,Basic Industries,NYSE,Sensient Technologies Corporation,$3.62B,$63432.98,2023-01-01
11998,999,Dorothy,Janauschek,KRNT,Capital Goods,NASDAQ,Kornit Digital Ltd.,$643.74M,$97859.28,2023-01-01


#### Clean the Market Cap

In [12]:
# keep only the rows that have a Market Cap value
has_market_cap = df['Market Cap'].notna()
df = df.loc[has_market_cap]

In [13]:
# discard rows whose Market Cap is the literal text 'n/a'
is_na_text = df['Market Cap'] == 'n/a'
df = df.loc[~is_na_text]

In [14]:
df

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Cap,Purchase Price,File Date
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,$78924.65,2023-10-01
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,$89818.72,2023-10-01
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,$23636.92,2023-10-01
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,$65979.23,2023-10-01
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",$687.82M,$41824.21,2023-10-01
...,...,...,...,...,...,...,...,...,...,...
11994,995,Ben,Matuszak,ENBL,Public Utilities,NYSE,"Enable Midstream Partners, LP",$6.43B,$14609.25,2023-01-01
11996,997,Rivi,Rame,EMCB,,NASDAQ,WisdomTree Emerging Markets Corporate Bond Fund,$50.49M,$55605.12,2023-01-01
11997,998,Doti,Facer,SXT,Basic Industries,NYSE,Sensient Technologies Corporation,$3.62B,$63432.98,2023-01-01
11998,999,Dorothy,Janauschek,KRNT,Capital Goods,NASDAQ,Kornit Digital Ltd.,$643.74M,$97859.28,2023-01-01


In [15]:
# use the full column name for the market cap
df = df.rename({'Market Cap': 'Market Capitalisation'}, axis='columns')

In [16]:
df

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Capitalisation,Purchase Price,File Date
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,$78924.65,2023-10-01
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,$89818.72,2023-10-01
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,$23636.92,2023-10-01
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,$65979.23,2023-10-01
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",$687.82M,$41824.21,2023-10-01
...,...,...,...,...,...,...,...,...,...,...
11994,995,Ben,Matuszak,ENBL,Public Utilities,NYSE,"Enable Midstream Partners, LP",$6.43B,$14609.25,2023-01-01
11996,997,Rivi,Rame,EMCB,,NASDAQ,WisdomTree Emerging Markets Corporate Bond Fund,$50.49M,$55605.12,2023-01-01
11997,998,Doti,Facer,SXT,Basic Industries,NYSE,Sensient Technologies Corporation,$3.62B,$63432.98,2023-01-01
11998,999,Dorothy,Janauschek,KRNT,Capital Goods,NASDAQ,Kornit Digital Ltd.,$643.74M,$97859.28,2023-01-01


#### Categorise the Purchase Price into groupings

In [17]:
# take the text after the dollar sign and store the price as a float
price_parts = df['Purchase Price'].str.split('$', expand=True)
df['Purchase Price'] = price_parts[1].astype(float)

In [18]:
def purchase_price_cat(x):
    """Bucket a purchase price into a size category.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide a numeric 'Purchase Price'.

    Returns
    -------
    str or None
        'Small'      for 0 < price <= 24,999.99
        'Medium'     for     price <= 49,999.99
        'Large'      for     price <= 74,999.99
        'Very Large' for     price <= 100,000.00
        None when the price is zero, negative, above 100,000 or NaN.
    """
    price = x['Purchase Price']
    # Reject non-positive prices up front. Without this guard a price <= 0
    # fails the 'Small' test but then satisfies `<= 49999.99` and would be
    # labelled 'Medium'. NaN also ends up here because `NaN > 0` is False.
    if not price > 0.00:
        return None
    if price <= 24999.99:
        return 'Small'
    if price <= 49999.99:
        return 'Medium'
    if price <= 74999.99:
        return 'Large'
    if price <= 100000.00:
        return 'Very Large'
    # above the top band: no category
    return None

In [19]:
# categorise each row's purchase price (row-wise apply)
df['Purchase Price Category'] = df.apply(purchase_price_cat, axis=1)

In [20]:
df.head()

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Capitalisation,Purchase Price,File Date,Purchase Price Category
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",$458.83M,78924.65,2023-10-01,Very Large
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,$1.14B,89818.72,2023-10-01,Very Large
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,$1.94B,23636.92,2023-10-01,Small
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,$101.07M,65979.23,2023-10-01,Large
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",$687.82M,41824.21,2023-10-01,Medium


#### Convert the Market Cap into its full numeric value

In [21]:
# keep the text after the dollar sign (e.g. '$1.14B' -> '1.14B')
cap_parts = df['Market Capitalisation'].str.split('$', expand=True)
df['Market Capitalisation'] = cap_parts[1]

In [22]:
# the last character is the unit: 'B' for billions or 'M' for millions
df['Market Cap Agg'] = df['Market Capitalisation'].str.slice(start=-1)

In [23]:
# drop the trailing unit letter so only the digits remain (still text)
df['Market Capitalisation'] = df['Market Capitalisation'].str.slice(stop=-1)

In [24]:
def market_cap_val(x):
    """Expand an abbreviated market cap into its full value.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must provide 'Market Cap Agg' (the unit suffix, 'M' or 'B') and
        'Market Capitalisation' (the amount expressed in that unit).

    Returns
    -------
    float or None
        The amount multiplied by 1,000,000 for 'M' or by 1,000,000,000 for
        'B'; None when the suffix is anything else.
    """
    multipliers = {'M': 1000000, 'B': 1000000000}
    multiplier = multipliers.get(x['Market Cap Agg'])
    if multiplier is None:
        # unknown unit: leave the value empty rather than guess a scale
        return None
    # Coerce to float first: if the amount is still text, `str * int` would
    # repeat the string millions of times instead of scaling the number.
    return float(x['Market Capitalisation']) * multiplier

In [25]:
# turn the remaining digits into a float so they can be multiplied
cap_numbers = df['Market Capitalisation'].astype('float64')
df['Market Capitalisation'] = cap_numbers

In [26]:
# scale every row's amount by its unit (row-wise apply)
df['Market Cap Value'] = df.apply(market_cap_val, axis=1)

In [27]:
df.head()

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Market Capitalisation,Purchase Price,File Date,Purchase Price Category,Market Cap Agg,Market Cap Value
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",458.83,78924.65,2023-10-01,Very Large,M,458830000.0
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,1.14,89818.72,2023-10-01,Very Large,B,1140000000.0
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,1.94,23636.92,2023-10-01,Small,B,1940000000.0
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,101.07,65979.23,2023-10-01,Large,M,101070000.0
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",687.82,41824.21,2023-10-01,Medium,M,687820000.0


In [28]:
# the unit letter and the unscaled amount are no longer needed
df = df.drop(columns=['Market Cap Agg', 'Market Capitalisation'])

In [29]:
# the scaled value takes over the 'Market Capitalisation' name
df = df.rename({'Market Cap Value': 'Market Capitalisation'}, axis='columns')

In [30]:
df.head()

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Purchase Price,File Date,Purchase Price Category,Market Capitalisation
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",78924.65,2023-10-01,Very Large,458830000.0
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,89818.72,2023-10-01,Very Large,1140000000.0
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,23636.92,2023-10-01,Small,1940000000.0
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,65979.23,2023-10-01,Large,101070000.0
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",41824.21,2023-10-01,Medium,687820000.0


#### Categorise the Market Cap into groupings

In [31]:
def market_cap_cat(x):
    """Bucket a market capitalisation into a size category.

    Reads x['Market Capitalisation'] and returns 'Small' below 100 million,
    'Medium' below 1 billion, 'Large' below 100 billion and 'Huge' from
    100 billion upwards. Returns None when no comparison holds (e.g. NaN).
    """
    cap = x['Market Capitalisation']
    # (exclusive upper bound, label), checked from the smallest band up
    bands = (
        (100000000, 'Small'),
        (1000000000, 'Medium'),
        (100000000000, 'Large'),
    )
    for upper_bound, label in bands:
        if cap < upper_bound:
            return label
    if cap >= 100000000000:
        return 'Huge'
    return None

In [32]:
# categorise each row's market capitalisation (row-wise apply)
df['Market Capitalisation Category'] = df.apply(market_cap_cat, axis=1)

In [33]:
df.head()

Unnamed: 0,id,first_name,last_name,Ticker,Sector,Market,Stock Name,Purchase Price,File Date,Purchase Price Category,Market Capitalisation,Market Capitalisation Category
0,1,Vikki,Pascall,RIC,Basic Industries,NYSE,"Richmont Mines, Inc.",78924.65,2023-10-01,Very Large,458830000.0,Medium
1,2,Beulah,Michallat,ELY,Consumer Non-Durables,NYSE,Callaway Golf Company,89818.72,2023-10-01,Very Large,1140000000.0,Large
3,4,Willis,Bugg,OAS,Energy,NYSE,Oasis Petroleum Inc.,23636.92,2023-10-01,Small,1940000000.0,Large
4,5,Balduin,Maffei,QQXT,,NASDAQ,First Trust NASDAQ-100 Ex-Technology Sector In...,65979.23,2023-10-01,Large,101070000.0,Medium
5,6,Octavia,Rawll,IMKTA,Consumer Services,NASDAQ,"Ingles Markets, Incorporated",41824.21,2023-10-01,Medium,687820000.0,Medium


#### Rank the highest 5 purchases per combination of: file date, Purchase Price Categorisation and Market Capitalisation Categorisation.

In [34]:
# rank purchases inside each file date / price category / market cap
# category group, highest purchase price first
rank_keys = ['File Date', 'Purchase Price Category', 'Market Capitalisation Category']
grouped_prices = df.groupby(rank_keys)['Purchase Price']
df['Rank'] = grouped_prices.rank(ascending=False)

#### Output only records with a rank of 1 to 5

In [35]:
# keep the rows ranked 5 or better within their group
in_top_five = df['Rank'].le(5)
df2 = df.loc[in_top_five]

In [36]:
# the person identifiers are not part of the output
df2 = df2.drop(columns=['id', 'first_name', 'last_name'])

In [37]:
# write the filtered records to the output file, without the index column
output_path = '2023W08_output.csv'
df2.to_csv(output_path, index=False)