# Young Audiences Data

In [1]:
# https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
# Reload modules before executing user code
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import pandas as pd
import glob
from collections import defaultdict
sys.path.append(os.path.dirname(os.getcwd())+'/utils')

In [14]:
from helpers import clean_artist_count, clean_sales_data, group_sales_data
from config import REMOVE_ARTIST_IDS, COLS, CMP_FILT

In [6]:
# Retrieve any arguments if available
args = sys.argv # date

In [7]:
args

['/Users/jjgong/.pyenv/versions/3.9.5/lib/python3.9/site-packages/ipykernel_launcher.py',
 '--ip=127.0.0.1',
 '--stdin=9018',
 '--control=9016',
 '--hb=9015',
 '--Session.signature_scheme="hmac-sha256"',
 '--Session.key=b"9c2122e5-29e2-4765-9722-eb792a4577c2"',
 '--shell=9017',
 '--transport="tcp"',
 '--iopub=9019',
 '--f=/var/folders/wl/qjpb81t50cs9n66dtcwwbw240000gn/T/tmp-4898yB8Q7qn58BOq.json']

In [10]:

# # Retrieve times as strings
# time_start = pd.Timestamp(args[1]) if 1 < len(args) else pd.Timestamp(2019,1,1)
# time_end = pd.Timestamp(args[2]) if 2 < len(args) else pd.Timestamp(2020,1,1)

time_start = pd.Timestamp(2019,1,1)
time_end = pd.Timestamp(2020,1,1)

path = os.path.dirname(os.path.dirname(os.getcwd()))+'/data/raw/'

# Use Glob to retrieve all of the file names
file_list = glob.glob(path+'*.xlsx')

# Upload files
d = defaultdict(lambda: defaultdict())
for i,f in enumerate(file_list):
    d[file_list[i][len(path):-5]] = pd.read_excel(f)


# Retrieve Three Year Sales Data
three_year = d['three_year_sales'].copy()
# Clean data
three_year = clean_sales_data(three_year)
# Group sales data
three_yr_grp = group_sales_data(three_year)

# Retrieve demographic data
demographic = d['demographics'].copy()
# Remove Unamed column
demographic = demographic.drop(labels=['Unnamed: 0'], axis=1)
# Drop duplicates - Artist ID 133 is duplicated
demographic = demographic.drop_duplicates(subset=['artist_id'], keep='last').reset_index(drop=True)

# Join Sales and Demographic data
joined = three_yr_grp.merge(demographic, how='outer', left_on='Artist Payroll ID', right_on='artist_id')

# Include only wanted Artist IDS - Remove Artist ID: 0
inc_df = joined[~joined['Artist Payroll ID'].isin(REMOVE_ARTIST_IDS)]

# Sales data with no demographic data
null_demo = inc_df[pd.isnull(inc_df['artist_id'])].copy().reset_index(drop=True)

# Sales data with demographic data
demo = inc_df[~(pd.isnull(inc_df['artist_id'])) & ~(pd.isnull(inc_df['Artist Payroll ID']))].copy().reset_index(drop=True)

# Retrieve artist group size data
grp_size = d['artist_count'].copy()
grp_size = clean_artist_count(grp_size)

# Filter data by date
df = demo[(demo['Date']>=time_start) & (demo['Date']<time_end)].copy().reset_index(drop=True)

# Merge demo and sales data with group size data
df_merged = df.merge(grp_size, on='artist_id')

df_merged = df_merged[COLS]

Number of NULL Artist IDs removed: 4


In [23]:
CMP_FILT[1]=='Workshop'

True

In [19]:
d_cmp = defaultdict()
# Iterate through the component types 

for c in CMP_FILT:
    df_tmp = df_merged[df_merged['Component Type'].str.contains(c)].copy().reset_index(drop=True)

    # Check if workshop otherwise filter by performance
    if c=='Workshop':
        d_cmp['{}_ind'.format(c)] = df_tmp[df_tmp['workshop']==1]
        d_cmp['{}_grp'.format(c)] = df_tmp[df_tmp['workshop']>1] 
    else:
        d_cmp['{}_ind'.format(c)] = df_tmp[df_tmp['performance']==1]
        d_cmp['{}_grp'.format(c)] = df_tmp[df_tmp['performance']>1]

In [20]:
d_cmp['Performance']

Unnamed: 0,artist_account_name,artist_id,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,...,respondent_id,age,gender,city,state,company,ethnicity,multi_ind,performance,workshop
0,Rob Aptaker,16.0,Theater,Assembly,2019-01-11,19-0008149,07645,Memorial Elem School-Montvale,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
1,Rob Aptaker,16.0,Theater,Assembly,2019-01-18,19-0007901,07930,Bragg Elem School,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
2,Rob Aptaker,16.0,Theater,Assembly,2019-01-25,19-0008152,07649,Oradell Public School,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
3,Rob Aptaker,16.0,Theater,Assembly,2019-01-30,19-0007818,07661,Cherry Hill Elem School,Discount,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
4,Rob Aptaker,16.0,Theater,Assembly,2019-01-30,19-0007942,07661,Roosevelt School,Discount,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,123 Andrés,130.0,Music,Assembly,2019-09-10,20-0008665,19808,Anna P Mote Elem,Program Services,Performance/Demo,...,1.252399e+10,35 to 44,female,Sterling,VA,123 Andrés,latinx,1.0,2.0,2
534,Bobby Beetcut,133.0,Music,Assembly,2019-09-20,20-0008648,08361,All Kids First Preschool,Program Services,Performance/Demo,...,1.220217e+10,45 to 54,unknown,Vernon,NJ,Bobby Beetcut/ Junk Jam Band,caucasian,0.0,1.0,1
535,Bobby Beetcut,133.0,Music,Assembly,2019-09-24,20-0008827,08869,J F Kennedy Primary School,Program Services,Performance/Demo,...,1.220217e+10,45 to 54,unknown,Vernon,NJ,Bobby Beetcut/ Junk Jam Band,caucasian,0.0,1.0,1
536,Bobby Beetcut,133.0,Music,Assembly,2019-10-03,20-0008766,07076,Evergreen Elem School,Showcase,Performance/Demo,...,1.220217e+10,45 to 54,unknown,Vernon,NJ,Bobby Beetcut/ Junk Jam Band,caucasian,0.0,1.0,1


In [17]:
d_cmp.keys()

dict_keys(['Performance', 'Workshop', 'Learning', 'Meeting'])

In [11]:
df_merged

Unnamed: 0,artist_account_name,artist_id,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,...,respondent_id,age,gender,city,state,company,ethnicity,multi_ind,performance,workshop
0,Rob Aptaker,16.0,Theater,Assembly,2019-01-11,19-0008149,07645,Memorial Elem School-Montvale,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
1,Rob Aptaker,16.0,Theater,Assembly,2019-01-18,19-0007901,07930,Bragg Elem School,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
2,Rob Aptaker,16.0,Theater,Assembly,2019-01-25,19-0008152,07649,Oradell Public School,Program Services,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
3,Rob Aptaker,16.0,Theater,Assembly,2019-01-30,19-0007818,07661,Cherry Hill Elem School,Discount,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
4,Rob Aptaker,16.0,Theater,Assembly,2019-01-30,19-0007942,07661,Roosevelt School,Discount,Performance/Demo,...,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1310,"Segunda Quimbamba Folkloric Center, Inc.",229.0,Music,Professional Learning,2019-03-14,19-0008266,07501,Paterson School 15,CB-PNC,Workshop,...,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0,2.0,2
1311,"Segunda Quimbamba Folkloric Center, Inc.",229.0,Music,Professional Learning,2019-11-25,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Planning Meeting,...,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0,2.0,2
1312,"Segunda Quimbamba Folkloric Center, Inc.",229.0,Music,Professional Learning,2019-12-09,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Workshop,...,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0,2.0,2
1313,"Segunda Quimbamba Folkloric Center, Inc.",229.0,Music,Professional Learning,2019-12-10,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Workshop,...,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0,2.0,2


In [13]:
demo[(demo['Date']>=time_start) & (demo['Date']<time_end)]

Unnamed: 0,Artist Payroll ID,Artist: Account Name,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,...,Artist Fee,artist_id,respondent_id,age,gender,city,state,company,ethnicity,multi_ind
10,16.0,Rob Aptaker,Theater,Assembly,2019-01-11,19-0008149,07645,Memorial Elem School-Montvale,Program Services,Performance/Demo,...,292.00,16.0,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0
11,16.0,Rob Aptaker,Theater,Assembly,2019-01-18,19-0007901,07930,Bragg Elem School,Program Services,Performance/Demo,...,292.00,16.0,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0
12,16.0,Rob Aptaker,Theater,Assembly,2019-01-25,19-0008152,07649,Oradell Public School,Program Services,Performance/Demo,...,292.00,16.0,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0
13,16.0,Rob Aptaker,Theater,Assembly,2019-01-30,19-0007818,07661,Cherry Hill Elem School,Discount,Performance/Demo,...,278.00,16.0,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0
14,16.0,Rob Aptaker,Theater,Assembly,2019-01-30,19-0007942,07661,Roosevelt School,Discount,Performance/Demo,...,278.00,16.0,1.220051e+10,55 to 64,male,Allentown,PA,,caucasian,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2638,229.0,"Segunda Quimbamba Folkloric Center, Inc.",Music,Professional Learning,2019-03-14,19-0008266,07501,Paterson School 15,CB-PNC,Workshop,...,325.00,229.0,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0
2639,229.0,"Segunda Quimbamba Folkloric Center, Inc.",Music,Professional Learning,2019-11-25,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Planning Meeting,...,225.00,229.0,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0
2640,229.0,"Segunda Quimbamba Folkloric Center, Inc.",Music,Professional Learning,2019-12-09,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Workshop,...,324.99,229.0,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0
2641,229.0,"Segunda Quimbamba Folkloric Center, Inc.",Music,Professional Learning,2019-12-10,20-0009125,08618,Dr. Martin Luther King School,Creative Incubator,Workshop,...,324.99,229.0,1.222106e+10,55 to 64,female,Jersey City,NJ,"Segunda Quimbamba Folkloric Center, Inc.",latinx,1.0


In [None]:
import os
import sys
sys.path.append(os.path.dirname(os.getcwd())+'/utils')

In [None]:
path = os.path.dirname(os.path.dirname(os.getcwd()))+'/data/raw/'

In [None]:
#path = "/Users/jjgong/Desktop/ya-pay-equity/data/raw/"
# Use Glob to retrieve all of the file names
file_list = glob.glob(path+'*.xlsx')
# Sort files
file_list.sort()

In [None]:
file_list

## Data Upload

In [None]:
d = defaultdict(lambda: defaultdict())
for i,f in enumerate(file_list):
    d[file_list[i][len(path):-5]] = pd.read_excel(f)

In [None]:
d.keys()

In [None]:
pd.set_option("max_columns", 500)

## Three Year Sales Data

In [None]:
from helpers import clean_sales_data, group_sales_data

In [None]:
three_year = d['three_year_sales'].copy()
three_year = clean_sales_data(three_year)

In [None]:
three_yr_grp = group_sales_data(three_year)

In [None]:
#three_year[three_year['Artist Payroll ID']==95.3]

In [None]:
#[pd.isnull(three_year['Artist Payroll ID'])]

## Demographic Data

In [None]:
demographic = d['demographics']
dd = demographic.copy()
# Remove Unamed column
demographic = demographic.drop(labels=['Unnamed: 0'], axis=1)
# Drop duplicates
# # Repeated 133 
# demographic[demographic['artist_id']==133]
demographic = demographic.drop_duplicates(subset=['artist_id'], keep='last').reset_index(drop=True)

### Review unique artists

In [None]:
# # Retrieve unique artists
# unique_artists = three_yr_grp.drop_duplicates(subset=['Artist Payroll ID', 'Artist: Account Name'])[['Artist Payroll ID', 'Artist: Account Name']].reset_index(drop=True)

In [None]:
joined = three_yr_grp.merge(demographic, how='outer', left_on='Artist Payroll ID', right_on='artist_id')

In [None]:
joined

In [None]:
len(joined)

In [None]:
from config import REMOVE_ARTIST_IDS

In [None]:
included_df = joined[~joined['Artist Payroll ID'].isin(REMOVE_ARTIST_IDS)]

In [None]:
null_demo = included_df[pd.isnull(included_df['artist_id'])].copy().reset_index(drop=True)

In [None]:
demo = included_df[~(pd.isnull(included_df['artist_id'])) & ~(pd.isnull(included_df['Artist Payroll ID']))].copy().reset_index(drop=True)

In [None]:
null_demo[null_demo['Artist Payroll ID'].isin([26, 40, 75, 76, 85, 91, 93, 98, 101, 104, 114, 115])]

In [None]:
demo[demo['Artist Payroll ID'].isin([26, 40, 75, 76, 85, 91, 93, 98, 101, 104, 114, 115])]

In [None]:
len(null_demo)

In [None]:
len(demo)

In [None]:
len(demo)/(len(null_demo)+len(demo))

In [None]:
demo['Artist Payroll ID'].nunique()

In [None]:
len(demo)/(len(null_demo)+len(demo))

In [None]:
null_demo['Artist Payroll ID'].nunique()

## Insert Group Size

In [None]:
from helpers import clean_artist_count

In [None]:
grp_size = d['artist_count'].copy()
grp_size = clean_artist_count(grp_size)

In [None]:
# # Unique artists in 2019
# three_yr_grp[(three_yr_grp['Date']>=pd.Timestamp(2019,1,1)) & (three_yr_grp['Date']<pd.Timestamp(2020,1,1))]['Artist Payroll ID'].nunique()

In [None]:
# # Unique Artists in 2019 w/ Demo filled out
# demo[(demo['Date']>=pd.Timestamp(2019,1,1)) & (demo['Date']<pd.Timestamp(2020,1,1))]['Artist Payroll ID'].nunique()

In [None]:
join[(join['Date']>=pd.Timestamp(2019,1,1)) & (join['Date']<pd.Timestamp(2020,1,1))]['Artist Payroll ID'].nunique()

In [None]:
bla = demo.merge(grp_size, how='left', on='artist_id')

In [None]:
df_mer = demo.merge(grp_size, on='artist_id')

In [None]:
len(df_mer)

In [None]:
len(bla)

In [None]:
cols = ['artist_account_name', 'artist_id',
       'Art Form (General Discipline)', 'Contract Classification', 'Date',
       'Contract #', 'Client Zip Code', 'Client', 'Billing Code',
       'Component Type', 'Artist Fee', 'respondent_id', 'age', 'gender',
       'city', 'state', 'company', 'ethnicity', 'multi_ind', 'performance', 'workshop']

In [None]:
df_mer = df_mer[cols].copy()

In [None]:
df_mer['Component Type'].value_counts()

In [None]:
records = df_mer[(df_mer['Date']>=pd.Timestamp(2019,1,1)) & (df_mer['Date']<pd.Timestamp(2020,1,1))]

In [None]:
res_19 = records.groupby(['ethnicity','performance']).nunique()['artist_id'].reset_index()

In [None]:
res_19

In [None]:
res_19[res_19['performance']==1]

In [None]:
res_19['artist_id']

In [None]:
records['artist_id'].nunique()

In [None]:
df_ind = df_mer[df_mer['size']==1].copy()

In [None]:
new_cols = ['artist_account_name', 'artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 
       'respondent_id', 'age', 'gender', 'city', 'state', 
       'ethnicity', 'Artist Fee']

In [None]:
df_ind = df_ind[new_cols].copy()

## Regression Analysis

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
len(['artist_account_name', 'artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity'])

In [None]:
x = np.array(df_ind.loc[:,['artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity']]).reshape(-1,15)

In [None]:
y = np.array(df_ind['Artist Fee'])

In [None]:
model = LinearRegression()

In [None]:
pd.to_numeric(df_ind['Art Form (General Discipline)'], errors='coerce')

In [None]:
df_ind['Art Form (General Discipline)'].apply(pd.to_numeric)

In [None]:
df_ind.loc[:,['artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity']].apply(pd.to_numeric, errors='coerce')

In [None]:
x.apply(pd.to_numeric, errors='coerce')

In [None]:
model.fit(x, y)

In [None]:
#join[join['artist_id']==56]

In [None]:
p = three_year[pd.isnull(three_year['Artist Payroll ID'])]
p = three_year.drop_duplicates(['Artist Payroll ID', 'Artist: Account Name'])

In [None]:
demo[demo['age']=='75 or older']

In [None]:
demo.to_excel('demographics.xlsx')