# Young Audiences Data

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
# Docs: https://ipython.org/ipython-doc/3/config/extensions/autoreload.html
# Mode 2 reloads modules before executing user code.
%load_ext autoreload
%autoreload 2

In [3]:
import os
import sys
import pandas as pd
import glob
from collections import defaultdict
# Make the sibling `utils` directory (one level above the working directory)
# importable. The membership check keeps the cell idempotent: re-running it
# no longer appends a duplicate entry to sys.path each time.
utils_dir = os.path.dirname(os.getcwd())+'/utils'
if utils_dir not in sys.path:
    sys.path.append(utils_dir)

In [28]:
from helpers import clean_artist_count, clean_sales_data, group_sales_data, get_groupings
from config import REMOVE_ARTIST_IDS, COLS, CMP_FILT

In [6]:
# Capture the process's command-line arguments.
# NOTE(review): the only code that read `args` (date bounds taken from
# args[1] / args[2]) is commented out in the next cell, so this binding is
# currently unused — confirm whether it is still needed.
args = sys.argv # date

In [34]:

# Reporting window used by the date filter further down this cell.
# (A disabled earlier variant read both bounds from args[1] / args[2],
# falling back to 2019-01-01 and 2020-01-01.)
time_start = pd.Timestamp(year=2019, month=1, day=1)
time_end = pd.Timestamp(year=2021, month=1, day=1)

# Raw spreadsheets are read from <two levels above the working dir>/data/raw/
path = '{}/data/raw/'.format(os.path.dirname(os.path.dirname(os.getcwd())))

# Collect every .xlsx workbook found directly inside the raw-data directory
file_list = glob.glob(path+'*.xlsx')

# Load each workbook into a dict keyed by its file name without the
# extension, e.g. '<path>three_year_sales.xlsx' -> 'three_year_sales'.
# A plain dict is used on purpose: looking up a workbook that is missing on
# disk now raises KeyError immediately, instead of silently yielding an
# empty mapping that only fails later inside the cleaning helpers.
d = {}
for f in file_list:
    name = os.path.splitext(os.path.basename(f))[0]
    # Skip the '~$...' lock files Excel creates next to an open workbook;
    # they match '*.xlsx' but are not readable spreadsheets.
    if name.startswith('~$'):
        continue
    d[name] = pd.read_excel(f)


# Three-year sales: take a copy of the loaded sheet (so the frame stored in
# `d` is not the object being modified) and run it through the cleaning helper
three_year = clean_sales_data(d['three_year_sales'].copy())
# Grouped view of the cleaned sales rows
three_yr_grp = group_sales_data(three_year)

# Retrieve demographic data (copy, so the frame stored in `d` is untouched)
demographic = d['demographics'].copy()
# Remove the 'Unnamed: 0' column (presumably a row index written out when the
# sheet was exported — confirm). errors='ignore' lets a workbook saved
# without that column load instead of raising KeyError.
demographic = demographic.drop(columns=['Unnamed: 0'], errors='ignore')
# Drop duplicates - Artist ID 133 is duplicated; keep the last row for any
# repeated artist_id and renumber the index.
demographic = demographic.drop_duplicates(subset=['artist_id'], keep='last').reset_index(drop=True)

# Outer-join grouped sales to demographics on the artist identifier, so rows
# present on only one side are kept (with nulls in the other side's columns)
joined = three_yr_grp.merge(
    demographic,
    how='outer',
    left_on='Artist Payroll ID',
    right_on='artist_id',
)

# Exclude the payroll IDs listed in REMOVE_ARTIST_IDS
# (per the original note, this removes Artist ID 0)
excluded_ids = joined['Artist Payroll ID'].isin(REMOVE_ARTIST_IDS)
inc_df = joined[~excluded_ids]

has_demo = inc_df['artist_id'].notna()
has_sales = inc_df['Artist Payroll ID'].notna()

# Sales data with no demographic data
null_demo = inc_df[~has_demo].copy().reset_index(drop=True)

# Sales data with demographic data (both identifiers present)
demo = inc_df[has_demo & has_sales].copy().reset_index(drop=True)

# Artist group-size sheet: copy the loaded frame and clean it with the helper
grp_size = clean_artist_count(d['artist_count'].copy())

# Keep rows dated inside the half-open window [time_start, time_end)
in_window = demo['Date'].ge(time_start) & demo['Date'].lt(time_end)
df = demo[in_window].copy().reset_index(drop=True)

# Attach group size by artist_id, then keep only the configured columns
df_merged = df.merge(grp_size, on='artist_id')[COLS]

Number of NULL Artist IDs removed: 4


In [35]:
# Retrieve dictionary of dataframes of different groupings
# NOTE(review): this rebinds `d`, which until this cell held the raw
# workbooks loaded from disk; after it runs the raw sheets are no longer
# reachable through `d`.
d = get_groupings(df_merged)

In [36]:
# Also keep the ungrouped frames alongside the groupings:
# the merged data, the rows with demographics, and the rows without
for key, frame in (
    ('merged_all', df_merged),
    ('merged_demo', demo),
    ('merged_no_demo', null_demo),
):
    d[key] = frame

In [38]:
# Distinct artist_id count per (ethnicity, performance) in 'Workshop_ind'
(
    d['Workshop_ind']
    .groupby(['ethnicity', 'performance'])['artist_id']
    .nunique()
    .reset_index()
)

Unnamed: 0,ethnicity,performance,artist_id
0,african,1.0,3
1,african,2.0,1
2,african,4.0,1
3,asian,1.0,2
4,asian,2.0,1
5,caucasian,1.0,19
6,caucasian,4.0,1
7,latinx,1.0,4


In [33]:
# Distinct artist_id count per (ethnicity, performance) in 'Performance_ind'
(
    d['Performance_ind']
    .groupby(['ethnicity', 'performance'])['artist_id']
    .nunique()
    .reset_index()
)

Unnamed: 0,ethnicity,performance,artist_id
0,african,1.0,3
1,asian,1.0,1
2,caucasian,1.0,8
3,latinx,1.0,3


In [None]:
# Number of sales rows that have no demographic record
null_demo.shape[0]

In [None]:
# Number of sales rows that have a demographic record
demo.shape[0]

In [None]:
# Share of rows that carry demographic data, out of all rows in the two frames
n_with_demo, n_without_demo = len(demo), len(null_demo)
n_with_demo / (n_without_demo + n_with_demo)

In [None]:
# Distinct artist_id count per (ethnicity, performance) in `records`
# NOTE(review): `records` is not defined in any visible cell, so this cell
# fails on a fresh kernel unless it is created elsewhere — confirm its source.
res_19 = (
    records
    .groupby(['ethnicity', 'performance'])['artist_id']
    .nunique()
    .reset_index()
)

In [None]:
# Rows of the summary where performance equals 1
res_19[res_19['performance'].eq(1)]

In [None]:
# The per-group distinct-artist counts on their own
res_19.loc[:, 'artist_id']

In [None]:
# Number of distinct artist_id values in `records`.
# NOTE(review): `records` is not defined in any visible cell — confirm where
# it comes from before relying on this figure.
records['artist_id'].nunique()

## Regression Analysis

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# Scratch check: count the names in this column list.
# NOTE(review): this list has 16 names and includes 'artist_account_name',
# whereas the feature selection used to build `x` below lists 15 columns
# (it omits 'artist_account_name') — the two lists are not the same.
len(['artist_account_name', 'artist_id', 'Art Form (General Discipline)',
       'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
       'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
       'gender', 'city', 'state', 'ethnicity'])

In [None]:
# Columns used as regression inputs.
# NOTE(review): `df_ind` is not defined in any visible cell — confirm where
# it comes from. Several of these columns (e.g. 'Date', 'Client', 'gender')
# presumably hold non-numeric values and would need encoding before a
# linear model can be fitted — confirm.
feature_cols = [
    'artist_id', 'Art Form (General Discipline)',
    'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
    'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
    'gender', 'city', 'state', 'ethnicity',
]

# Feature matrix with one row per record and one column per feature. The
# column count is derived from the list instead of a hard-coded 15, so the
# two cannot drift apart when the list is edited.
x = np.array(df_ind.loc[:, feature_cols]).reshape(-1, len(feature_cols))

In [None]:
# Regression target: the 'Artist Fee' column as a NumPy array
y = np.array(df_ind.loc[:, 'Artist Fee'])

In [None]:
# Linear regression estimator created with its default settings
model = LinearRegression()

In [None]:
# Scratch check: coerce the art-form column to numbers; values that cannot
# be parsed become NaN (errors='coerce'). The result is only displayed,
# not stored.
pd.to_numeric(df_ind['Art Form (General Discipline)'], errors='coerce')

In [None]:
# Same column, converted by applying pd.to_numeric with its default error
# handling. The result is only displayed, not stored.
# NOTE(review): without errors='coerce' this presumably raises on
# non-numeric text — confirm against the data.
df_ind['Art Form (General Discipline)'].apply(pd.to_numeric)

In [None]:
# Preview the candidate feature columns after numeric coercion: each column
# is passed through pd.to_numeric and unparseable values become NaN.
# The result is only displayed, not stored.
df_ind[[
    'artist_id', 'Art Form (General Discipline)',
    'Contract Classification', 'Date', 'Contract #', 'Client Zip Code',
    'Client', 'Billing Code', 'Component Type', 'respondent_id', 'age',
    'gender', 'city', 'state', 'ethnicity',
]].apply(pd.to_numeric, errors='coerce')

In [None]:
# `x` is a NumPy array (built with np.array above), which has no `.apply`
# method, so it is wrapped in a DataFrame before each column is coerced to
# numbers (unparseable values become NaN).
# The result is only displayed; `x` itself is left unchanged.
pd.DataFrame(x).apply(pd.to_numeric, errors='coerce')

In [None]:
# Fit the regression of y ('Artist Fee') on the feature matrix x.
# NOTE(review): x is built from raw columns such as 'Date', 'Client' and
# 'gender'; if those hold text or dates the fit will presumably fail unless
# they are encoded numerically first — confirm.
model.fit(x, y)

In [None]:
#join[join['artist_id']==56]

In [None]:
# Sales rows that have no payroll ID, reduced to one row per
# ('Artist Payroll ID', 'Artist: Account Name') pair.
# The original second line called drop_duplicates on the full `three_year`
# frame, which discarded the null-ID filter computed on the line above; the
# de-duplication now runs on the filtered rows.
# NOTE(review): if the intent was the unique ID/name pairs across all sales
# rows, drop the first statement instead — confirm.
p = three_year[three_year['Artist Payroll ID'].isna()]
p = p.drop_duplicates(['Artist Payroll ID', 'Artist: Account Name'])

In [None]:
# Rows whose age bracket is exactly '75 or older'
demo[demo['age'].eq('75 or older')]

In [None]:
# Write the joined sales + demographic rows to 'demographics.xlsx' in the
# current working directory.
# NOTE(review): with to_excel's defaults the row index is written as an
# extra leading column; when such a file is read back it presumably shows
# up as 'Unnamed: 0' (the column dropped in the load cell) — confirm whether
# index=False is wanted here.
demo.to_excel('demographics.xlsx')