# Young Audiences Data

In [4]:
import pandas as pd
import glob
from collections import defaultdict
from fuzzywuzzy import process, fuzz
#import matplotlib.pyplot as plt

In [14]:
import os

In [21]:
# Raw data lives in data/raw/, two directory levels above the working directory.
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
path = project_root + '/data/raw/'

In [22]:
# Collect every Excel workbook in the raw-data folder, in sorted order
file_list = sorted(glob.glob(path + '*.xlsx'))

In [23]:
file_list

['/Users/jjgong/Desktop/ya-pay-equity/data/raw/Three Year Sales Report.xlsx',
 '/Users/jjgong/Desktop/ya-pay-equity/data/raw/artist_count_per_group.xlsx',
 '/Users/jjgong/Desktop/ya-pay-equity/data/raw/demographic_snapshot.xlsx']

## Data Upload

In [7]:
# Load every workbook into a dict keyed by its position in file_list.
# Each entry stores the workbook's base name under 'name' and the DataFrame
# returned by pd.read_excel under 'file'.
d = defaultdict(lambda: defaultdict())
for i, f in enumerate(file_list):
    # Derive the name from the path itself (directory and extension removed)
    # rather than slicing at a fixed character offset, which is only correct
    # for one specific directory length.
    d[i]['name'] = os.path.splitext(os.path.basename(f))[0]
    d[i]['file'] = pd.read_excel(f)

In [8]:
def get_name(d):
    """Print the entry's 'name' and return the object stored under 'file'."""
    print(d['name'])
    loaded = d['file']
    return loaded

In [9]:
# Show up to 500 columns when displaying wide DataFrames.
# The fully-qualified key is used because the bare "max_columns" pattern can
# match more than one option in recent pandas releases, which raises an
# OptionError.
pd.set_option("display.max_columns", 500)

## Three Year Sales Data

In [10]:
# Work on a copy of the first loaded workbook, with surrounding whitespace
# removed from every column label.
three_year = get_name(d[0]).copy()
three_year = three_year.rename(columns=lambda label: label.strip())

Three Year Sales Report


In [11]:
# Lookup from artist account name to payroll ID, built from the rows that have an ID
rows_with_id = three_year.loc[
    three_year['Artist Payroll ID'].notna(),
    ['Artist: Account Name', 'Artist Payroll ID'],
]
id_map = defaultdict()
# Later rows overwrite earlier ones for the same name, as in a plain loop.
id_map.update(zip(rows_with_id['Artist: Account Name'], rows_with_id['Artist Payroll ID']))

In [12]:
# Fill in missing payroll IDs from the name -> ID lookup in id_map.
# (The lookup must be id_map; `d` holds the loaded workbooks keyed by integer
# position, so looking an artist name up in it never finds anything.)
missing_id = three_year['Artist Payroll ID'].isna()
three_year.loc[missing_id, 'Artist Payroll ID'] = (
    three_year.loc[missing_id, 'Artist: Account Name']
    .apply(lambda name: id_map.get(name, float('NaN')))
)
# Normalise the discipline label: rows reading 'Literary Arts' are rewritten to 'Literary Art'
is_literary_arts = three_year['Art Form (General Discipline)'] == 'Literary Arts'
three_year.loc[is_literary_arts, 'Art Form (General Discipline)'] = 'Literary Art'

In [None]:
#three_year.apply(lambda x: id_map.get(x['Artist: Account Name']) if pd.isnull(x['Artist Payroll ID']) else x, axis=1)

In [24]:
# Fill payroll IDs that are still missing by looking the artist's account name
# up in id_map; names without a known ID stay null.
# Done column-wise: a row-wise apply whose function returns either a scalar or
# the whole row only produces the right column by accident of broadcasting.
looked_up_ids = three_year['Artist: Account Name'].apply(
    lambda name: id_map.get(name, float('NaN'))
)
three_year['Artist Payroll ID'] = three_year['Artist Payroll ID'].fillna(looked_up_ids)

In [25]:
# Rows whose artist account name is missing
three_year.loc[three_year['Artist: Account Name'].isna()]

Unnamed: 0,Artist Payroll ID,Artist: Account Name,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,Artist Fee,Sale Price,Artist Business name


In [26]:
# Rows that still have no payroll ID
three_year.loc[three_year['Artist Payroll ID'].isna()]

Unnamed: 0,Artist Payroll ID,Artist: Account Name,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,Artist Fee,Sale Price,Artist Business name
2114,,Deborah Owens,,Admin/Project Dev,2021-03-15,21-0009889,8540,Young Audiences New Jersey,UWC YA Match,Teacher Stipend,200.0,200.0,Deborah Owens
2115,,Derling Dance Arts,Dance,Professional Learning,2019-02-06,19-0008302,8540,Young Audiences New Jersey,Dodge Dance,Professional Learning,350.0,500.0,Derlling Dance Arts
2116,,Dr. Ronah Harris,,Admin/Project Dev,2020-10-02,21-0009574,8540,Young Audiences New Jersey,Virtual Learning Pilot,Artist Stipend,500.0,500.0,Dr. Ronah Harris
9540,,Roxey Ballet,Dance,Professional Learning,2018-11-02,19-0008035,8540,Young Audiences New Jersey,Dodge Dance,Professional Learning,350.0,500.0,


In [27]:
# Keep only rows that have an Artist Payroll ID, renumbering the index from zero
has_payroll_id = three_year['Artist Payroll ID'].notna()
three_year = three_year[has_payroll_id].reset_index(drop=True)

In [28]:
# Confirm no rows without a payroll ID remain
three_year.loc[three_year['Artist Payroll ID'].isna()]

Unnamed: 0,Artist Payroll ID,Artist: Account Name,Art Form (General Discipline),Contract Classification,Date,Contract #,Client Zip Code,Client,Billing Code,Component Type,Artist Fee,Sale Price,Artist Business name


In [31]:
# dd keeps an untouched copy of the loaded demographic workbook.
dd = get_name(d[2]).copy()
# Drop the 'Unnamed: 0' column, then keep the last row for each artist_id.
demographic = (
    dd.drop(columns=['Unnamed: 0'])
    .drop_duplicates(subset=['artist_id'], keep='last')
    .reset_index(drop=True)
)

demographic_snapshot


In [32]:
# Take the nine most frequent component types, leave out 'Travel' and
# 'Materials', and make sure 'Virtual Planning Meeting' is included.
most_frequent = three_year['Component Type'].value_counts().head(9).index
# Filtering (rather than list.remove) avoids a ValueError when either label
# is not among the nine most frequent.
top_components = [comp for comp in most_frequent if comp not in ('Travel', 'Materials')]
if 'Virtual Planning Meeting' not in top_components:
    top_components.append('Virtual Planning Meeting')

In [None]:
# Total artist fee for each distinct combination of the descriptive columns.
# NOTE(review): groupby drops rows with a null in any key column by default;
# the displayed rows above show nulls in 'Art Form (General Discipline)' and
# 'Artist Business name' — confirm that dropping such rows is intended.
group_keys = [
    'Artist Payroll ID', 'Artist: Account Name', 'Art Form (General Discipline)',
    'Contract Classification', 'Date', 'Contract #', 'Client Zip Code', 'Client',
    'Billing Code', 'Component Type', 'Artist Business name',
]
three_yr_grp = three_year.groupby(group_keys)['Artist Fee'].sum().reset_index()

In [None]:
# Retrieve only top performances. Remove other components
# # three_year_cnt = three_year[three_year['Component Type'].isin(top_components)].copy()

In [None]:
# Keep only rows whose component type is one of the selected top components
is_top_component = three_yr_grp['Component Type'].isin(top_components)
three_yr_grp = three_yr_grp.loc[is_top_component].copy()

In [None]:
len(three_yr_grp)

In [None]:
three_yr_grp

### Review unique artists

In [None]:
# One row per distinct (payroll ID, account name) pair
artist_keys = ['Artist Payroll ID', 'Artist: Account Name']
unique_artists = three_yr_grp[artist_keys].drop_duplicates().reset_index(drop=True)

In [None]:
# Outer join of grouped sales with demographics, so rows unmatched on either side are kept
joined = pd.merge(
    three_yr_grp,
    demographic,
    how='outer',
    left_on='Artist Payroll ID',
    right_on='artist_id',
)

In [None]:
len(joined)

In [None]:
# Payroll IDs to keep: every ID present in the join except the excluded ones
excluded_ids = {95, 0, 20, 56}
num_include = list(set(joined['Artist Payroll ID']).difference(excluded_ids))

In [None]:
# Restrict the join to the payroll IDs selected for inclusion
keep_row = joined['Artist Payroll ID'].isin(num_include)
included_df = joined.loc[keep_row]

In [None]:
# Rows with no artist_id, i.e. no matching demographic record
null_demo = included_df.loc[included_df['artist_id'].isna()].copy().reset_index(drop=True)

In [None]:
# Rows that do have an artist_id from the demographic table
demo = included_df.loc[included_df['artist_id'].notna()].copy().reset_index(drop=True)

In [None]:
len(null_demo)

In [None]:
len(demo)

In [None]:
len(demo)/(len(null_demo)+len(demo))

In [None]:
demo['Artist Payroll ID'].nunique()

In [None]:
len(demo)/(len(null_demo)+len(demo))

In [None]:
null_demo['Artist Payroll ID'].nunique()

#### Insert Group Size

In [None]:
d.keys()

In [None]:
# Group sizes: artist_count_per_group.xlsx is the second entry (index 1) of
# the sorted file_list. Only three workbooks are loaded, so index 7 refers to
# an empty entry and get_name raises a KeyError on it.
# NOTE(review): confirm this is the intended workbook.
grp_size = get_name(d[1]).copy()

In [None]:
# Distinct artists with grouped sales dated in calendar year 2019
grp_dates = three_yr_grp['Date']
in_2019_grp = grp_dates.ge(pd.Timestamp(2019, 1, 1)) & grp_dates.lt(pd.Timestamp(2020, 1, 1))
three_yr_grp.loc[in_2019_grp, 'Artist Payroll ID'].nunique()

In [None]:
# Distinct artists with demographic data and sales dated in calendar year 2019
demo_dates = demo['Date']
in_2019_demo = demo_dates.ge(pd.Timestamp(2019, 1, 1)) & demo_dates.lt(pd.Timestamp(2020, 1, 1))
demo.loc[in_2019_demo, 'Artist Payroll ID'].nunique()

In [None]:
# Keep only group-size rows whose status is empty
grp_size = grp_size.loc[grp_size['status'].isna()].copy()

In [None]:
grp_size['artist_id'].value_counts()

In [None]:
grp_size

In [None]:
# Demographic-matched rows that also carry a payroll ID
join = demo.loc[demo['Artist Payroll ID'].notna()].copy()

In [None]:
# Distinct artists in `join` with sales dated in calendar year 2019
join_dates = join['Date']
in_2019_join = join_dates.ge(pd.Timestamp(2019, 1, 1)) & join_dates.lt(pd.Timestamp(2020, 1, 1))
join.loc[in_2019_join, 'Artist Payroll ID'].nunique()

In [None]:
# Artists to exclude from the group-size table, by account name and by artist ID.
# NOTE(review): the earlier exclusion applied to `joined` also removed ID 0;
# confirm whether 0 should be listed here as well.
artists_remove = ['Anndee Hochman', 'Oyin Hardy']
artist_id_remove = [95, 20 , 56]

In [None]:
# Drop the group-size rows for artists excluded by account name
excluded_by_name = grp_size['artist_account_name'].isin(artists_remove)
grp_size = grp_size.loc[~excluded_by_name]

In [None]:
# Drop the group-size rows for artists excluded by ID
excluded_by_id = grp_size['artist_id'].isin(artist_id_remove)
grp_size = grp_size.loc[~excluded_by_id]

In [None]:
# Left join: keep every row of `join`, attaching group-size columns where artist_id matches
bla = pd.merge(join, grp_size, how='left', on='artist_id')

In [None]:
[int(x) for x in list(bla[pd.isnull(bla['size'])]['Artist Payroll ID'].unique())]

In [None]:
grp_size[grp_size['artist_id'].isin([26])]

In [None]:
bla[pd.isnull(bla['size'])]['Artist: Account Name'].unique()

In [None]:
# Inner join (the merge default): only artists present in both `join` and grp_size remain
df_mer = pd.merge(join, grp_size, on='artist_id')

In [None]:
# df_mer[pd.isnull(df_mer['Artist Payroll ID'])]['artist_id']
# join[join['artist_id'].isin([0, 18, 19 , 21, 22, 28, 31, 33, 34, 35, 36, 37, 38, 49, 42, 43, 48, 49, 51, 53])]

In [None]:
cols = ['artist_account_name', 'artist_id',
       'Art Form (General Discipline)', 'Contract Classification', 'Date',
       'Contract #', 'Client Zip Code', 'Client', 'Billing Code',
       'Component Type', 'Artist Fee', 'respondent_id', 'age', 'gender', 
       'city', 'state', 'company', 'ethnicity', 'multi_ind', 'size']

In [None]:
df_mer = df_mer[cols].copy()

In [None]:
# Merged rows dated within calendar year 2019
merged_dates = df_mer['Date']
records = df_mer[merged_dates.ge(pd.Timestamp(2019, 1, 1)) & merged_dates.lt(pd.Timestamp(2020, 1, 1))]

In [None]:
records.head()

In [None]:
# Number of distinct artists for each (ethnicity, group size) pair in the 2019 records
res_19 = records.groupby(['ethnicity', 'size'])['artist_id'].nunique().reset_index()

In [None]:
res_19[res_19['size']==1]

In [None]:
res_19['artist_id']

In [None]:
records['artist_id'].nunique()

In [None]:
# Rows whose group size is exactly 1
df_ind = df_mer.loc[df_mer['size'].eq(1)].copy()

In [None]:
new_cols = ['artist_account_name', 'artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 
       'respondent_id', 'age', 'gender', 'city', 'state', 
       'ethnicity', 'Artist Fee']

In [None]:
df_ind = df_ind[new_cols].copy()

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
len(['artist_account_name', 'artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity'])

In [None]:
# Feature matrix: one row per record, one column per selected feature.
# NOTE(review): several of these columns (e.g. 'gender', 'city', 'Date') are
# presumably non-numeric, so the array would need encoding before it can be
# passed to a regression — confirm.
feature_cols = [
    'artist_id', 'Art Form (General Discipline)', 'Contract Classification',
    'Date', 'Contract #', 'Client Zip Code', 'Client', 'Billing Code',
    'Component Type', 'respondent_id', 'age', 'gender', 'city', 'state',
    'ethnicity',
]
x = np.array(df_ind[feature_cols]).reshape(-1, len(feature_cols))

In [None]:
y = np.array(df_ind['Artist Fee'])

In [None]:
model = LinearRegression()

In [None]:
pd.to_numeric(df_ind['Art Form (General Discipline)'], errors='coerce')

In [None]:
df_ind['Art Form (General Discipline)'].apply(pd.to_numeric)

In [None]:
df_ind.loc[:,['artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity']].apply(pd.to_numeric, errors='coerce')

In [None]:
# x is a NumPy array, which has no .apply method; wrap it in a DataFrame so
# each column can be coerced to numeric (values that cannot be parsed become NaN).
pd.DataFrame(x).apply(pd.to_numeric, errors='coerce')

In [None]:
# Fit the linear regression of y (the 'Artist Fee' values) on the feature array x.
# NOTE(review): x is built from raw columns such as 'gender', 'city' and 'Date';
# presumably these are non-numeric and must be encoded before fit() will
# accept them — confirm.
model.fit(x, y)

In [None]:
#join[join['artist_id']==56]

In [None]:
# One row per distinct (payroll ID, account name) pair in the sales data.
# A preceding assignment of the null-ID rows to `p` was removed: its result
# was overwritten by this line before it could be used.
p = three_year.drop_duplicates(['Artist Payroll ID', 'Artist: Account Name'])

In [None]:
demo[demo['age']=='75 or older']

In [None]:
# Export the rows that have demographic data to 'demographics.xlsx'; the
# relative path resolves against the current working directory.
demo.to_excel('demographics.xlsx')