In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Data Import

In [2]:
# create dataframe
project_hours = pd.read_csv('project_hours_scaled.csv')

# change date columns to datetime
project_hours[['fund_date', 'start_date', 'finish_date']] = project_hours[['fund_date', 'start_date', 'finish_date']].astype('datetime64[s]')

# Cast id columns as integers
project_hours[['client_id', 'project_id']] = project_hours[['client_id', 'project_id']].astype('int')

# Exploratory Analysis

## Classifying Top Clients and Ranking all Clients
I'd like to determine a list of my top clients based on their average hourly rates and total revenue.

In [3]:
# Filtering and Sorting Top Clients

# region Collecting Data

# Create hourly_ratio column based on each projects ratio of all hourly 
project_hours['hourly_ratio'] = project_hours['project_hourly'].apply(lambda x: x/project_hours['project_hourly'].sum())

# Create per_project_ratio column based on each projects ratio of income
project_hours['per_project_ratio'] = project_hours['total_after_fees'].apply(lambda x: x/project_hours['total_after_fees'].sum())

# Filter only finished projects that are time tracked
finished_tracked_projects = project_hours[(project_hours['status'].str.lower() == 'done')
                                            & (project_hours['spec_project'] == False)
                                            & (project_hours['time_tracked'] == True)]

# Aggregate function to group client_id and average their hourly and ratio of total per_project_ratio
client_ranking = finished_tracked_projects.groupby('client_id').agg(
    average_hourly_ratio=('hourly_ratio', 'mean'),  # Calculate the average of hourly_ratio
    total_per_project_ratio=('per_project_ratio', 'sum')  # Calculate the sum of per_project_ratio
).reset_index()

# Use the rename method to alias column names
client_ranking = client_ranking.rename(columns={
    'average_hourly_ratio': 'client_hourly_average',
    'total_per_project_ratio': 'client_project_revenue_ratio'
})

# endregion

# region Categorize Percentiles

# Set thresholds 
hourly_threshold_pct = 50
per_project_threshold_pct = 75

# Get 80th percentile values
percentile_value_hourly = np.percentile(client_ranking['client_hourly_average'], hourly_threshold_pct)
percentile_value_project = np.percentile(client_ranking['client_project_revenue_ratio'], per_project_threshold_pct)

# Function to assign percentile labels to hourly
def top_percentile_bool_hourly(value):
    if np.percentile(value, hourly_threshold_pct) >= percentile_value_hourly:
        return True
    else:
        return False

# Function to assign percentile labels to projects
def top_percentile_bool_project(value):
    if np.percentile(value, per_project_threshold_pct) >= percentile_value_project:
        return True
    else:
        return False

# Categorize top_hourly_clients
client_ranking['top_client_hourly'] = client_ranking['client_hourly_average'].apply(top_percentile_bool_hourly)

# Categorize top_per_project_clients
client_ranking['top_client_per_project'] = client_ranking['client_project_revenue_ratio'].apply(top_percentile_bool_project)

# Assign percentile values for overall ratings
percentiles = [25, 50, 75]
percentile_values_hourly = np.percentile(client_ranking['client_hourly_average'], percentiles)

# Function to assign percentile labels
def assign_percentile_label_hourly(value):
    percentile_x = np.percentile(value, hourly_threshold_pct)
    if percentile_x <= percentile_values_hourly[0]:
        return '25th Percentile'
    elif percentile_x <= percentile_values_hourly[1]:
        return '50th Percentile'
    elif percentile_x <= percentile_values_hourly[2]:
        return '75th Percentile'
    else:
        return 'Above 75th Percentile'
    
# Function to assign percentile labels
def assign_percentile_label_project(value):
    percentile_y = np.percentile(value, per_project_threshold_pct)
    if percentile_y <= percentile_values_project[0]:
        return '25th Percentile'
    elif percentile_y <= percentile_values_project[1]:
        return '50th Percentile'
    elif percentile_y <= percentile_values_project[2]:
        return '75th Percentile'
    else:
        return 'Above 75th Percentile'
    
client_ranking['hourly_avg_percentile'] = client_ranking['client_hourly_average'].apply(assign_percentile_label_hourly)

percentile_values_project = np.percentile(client_ranking['client_project_revenue_ratio'], percentiles)

client_ranking['per_project_percentile'] = client_ranking['client_project_revenue_ratio'].apply(assign_percentile_label_project)

# endregion

# region Organize dataframes

# Sort dataframe by ratios
client_ranking = client_ranking.sort_values(by=['client_project_revenue_ratio', 'client_hourly_average'], ascending=False, ignore_index=True)

# Create top_client column
client_ranking['top_client'] = client_ranking['top_client_hourly'] & client_ranking['top_client_per_project']

# Create dataframe of only top clients for dashboard
top_clients = client_ranking[client_ranking['top_client'] == True].reset_index(drop=True)
top_clients = top_clients[['client_id']]
top_clients['ranking'] = top_clients.index + 1
top_clients
# endregion


Unnamed: 0,client_id,ranking
0,18,1
1,14,2
2,72,3
3,6,4
4,51,5
5,40,6
6,37,7
7,41,8


In [9]:
# Merge to original dataframe
client_ranking_filtered = client_ranking[['client_id', 'hourly_avg_percentile', 'per_project_percentile', 'top_client']]
project_hours_ranked = pd.merge(project_hours, client_ranking_filtered, how='left', on='client_id')
project_hours_ranked.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype        
---  ------                     --------------  -----        
 0   fund_date                  119 non-null    datetime64[s]
 1   start_date                 145 non-null    datetime64[s]
 2   finish_date                207 non-null    datetime64[s]
 3   price                      119 non-null    float64      
 4   total_additional_payments  21 non-null     float64      
 5   expenses                   16 non-null     float64      
 6   total_after_fees           117 non-null    float64      
 7   total_hours                147 non-null    float64      
 8   status                     244 non-null    object       
 9   project_type               243 non-null    object       
 10  master_ownership           25 non-null     float64      
 11  song_ownership             116 non-null    float64      
 12  time_tracked          

## How likely is a Soundbetter Client to be experienced?

Null hypothesis: there is no association between a client being from Soundbetter and their experience level.

In [5]:
# Create crosstab
Xtab = pd.crosstab(project_hours['soundbetter'], project_hours['experienced_client'])

chi2, pval, dof, expected = chi2_contingency(Xtab)
print(f'The p-value is {pval}')

The p-value is 8.600057337373808e-27


In [6]:
Xtab = pd.crosstab(project_hours['soundbetter'], project_hours['experienced_client'], normalize = True).reset_index()
Xtab

experienced_client,soundbetter,False,True
0,False,0.078189,0.63786
1,True,0.234568,0.049383


In [7]:
# sns.scatterplot(x = 'experienced_client', y = 'soundbetter', data=project_hours)
# plt.show()

## Predicting Difficult Clients

### K Nearest Neighbor Classifier

In [8]:
# pre-processing the data

#make a copy of the dataframe

# desired columns
column_list = ['price','project_type', 'soundbetter', 'songwriter', 'experienced_client', 'backend_belief']
client_difficulty_data = project_hours_tracked_paid[column_list].copy()

# get labels array
client_difficulty_labels = np.array(project_hours_tracked_paid['difficult_client'])
# convert array to binary
client_difficulty_labels = client_difficulty_labels.astype(int)

#MinMax the 'price' column
price_reshaped = np.array(client_difficulty_data['price']).reshape(-1,1)
mmscaler = MinMaxScaler()

# add column of normalized price back to dataframe
client_difficulty_data['price_normalized'] = mmscaler.fit_transform(price_reshaped)



NameError: name 'project_hours_tracked_paid' is not defined

In [None]:
# one hot encode project type
ohe = pd.get_dummies(client_difficulty_data['project_type'])
client_difficulty_data = client_difficulty_data.join(ohe)

In [None]:
# cast boolean values as integers
columns_to_cast_as_int = ['soundbetter', 'experienced_client', 'backend_belief', 'additional production',
                          'full production', 'mastering', 'mixing', 'pre production', 'production tweak',
                          'track production', 'vocal production', 'songwriter']

client_difficulty_data[columns_to_cast_as_int] = client_difficulty_data[columns_to_cast_as_int].astype(int)

# drop extra columns

client_difficulty_data = client_difficulty_data.drop(['price', 'project_type'], axis=1)

In [None]:
# Split the data
training_data, validation_data, training_labels, validation_labels = train_test_split(client_difficulty_data, client_difficulty_labels, test_size=0.2, random_state=100)

print(len(training_data), len(training_labels)) # verify the length is the same

74 74


In [None]:
scores = [] # initiate blank list

for k in range(1,20): # loop through k from 1 to 100 and append the scores to the list
    classifier = KNeighborsClassifier(n_neighbors = k) # create a classifier with k neighbors
    classifier.fit(training_data, training_labels) # train the classifier
    scores.append(classifier.score(validation_data, validation_labels)) # determine the accuracy of the trained classifier

print(scores)

max_score = max(scores) # determine the max score
max_score_index = scores.index(max_score) # find the index of max score
print(max_score, max_score_index + 1) # add 1 to index to find k value for max score

''' Do I need more data for this model to be more effective? There are not a lot of data marked 'difficult client' which may be throwing it off'''

[0.7894736842105263, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.7894736842105263, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632, 0.8947368421052632]
0.8947368421052632 2


" Do I need more data for this model to be more effective? There are not a lot of data marked 'difficult client' which may be throwing it off"