In [1]:
import os
import glob
import pandas as pd
import matplotlib
import numpy as np
from numpy import NaN
import matplotlib.pyplot as plt

In [2]:
# Combine every CSV in the data directory into a single frame.
path = r'../data'
# glob returns matches in arbitrary (OS-dependent) order; sorting makes the
# concatenated row order, and the fresh 0..n-1 index built below,
# reproducible from run to run.
all_csv = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_csv:
    # Fail with a message that names the directory instead of letting
    # pd.concat complain about having nothing to concatenate.
    raise FileNotFoundError("No CSV files found in " + repr(path))

# index_col=0 uses each file's first column as its row index (discarded by
# ignore_index=True below); thousands=',' tells the parser to treat commas
# as thousands separators in numeric fields.
frames = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

df = pd.concat(frames, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,"Katie Beth Nutrition, Registered Dietitian",12,5.0,67.0,495.0,36104
1,Cassi Nunes,12,5.0,164.0,249.0,36104
2,Beau Gardner,12,5.0,142.0,100.0,36104
3,Nolan King's Nutrition and Training,12,5.0,744.0,199.0,36104
4,Weightloss For Busy Proffessionals,12,5.0,,,36104


In [3]:
# (rows, columns) of the combined frame.
df.shape

(25977, 6)

In [4]:
# Inspect the column dtypes to spot columns that need converting.
# This cell only displays them; it does not change anything.
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# Peek at the first 19 distinct Price values.
df['Price'].unique()[:19]

# TODO: some prices carry a "/<unit>" suffix (e.g. per hour); it has to be
# split off before the column can be converted to a numeric dtype.

array([495.0, 249.0, 100.0, 199.0, nan, 170.0, 57.0, 125.0, 99.0, 35.0,
       175.0, 75.0, 80.0, 300.0, 200.0, 40.0, 85.0, 79.0, 120.0],
      dtype=object)

In [6]:
# List the distinct pricing units that appear after a "/" in Price.
# From here on Price holds strings: missing values become the literal
# string 'nan', and the splitting code in the next cell calls .split() on
# every entry, so it needs each one to be a str.
df['Price'] = df['Price'].astype(str)

# Each entry becomes [amount] or [amount, unit, ...].
tempList = [price.split('/') for price in df["Price"]]

# Keep the unit (second piece) only where one exists. An explicit length
# check replaces the former bare `except: pass`, which would also have
# hidden unrelated errors.
checkUnique = [parts[1] for parts in tempList if len(parts) > 1]

# dict.fromkeys de-duplicates while preserving first-seen order.
checkUnique = list(dict.fromkeys(checkUnique))
checkUnique

['on-site', 'session', 'logo', 'hour', 'walk', 'cat', 'visit']

In [7]:
# Move the "/<unit>" suffix of Price into its own "Price Type" column and
# convert the remaining amount to float.
#
# Price must hold strings at this point. Calling .split() on each element
# (rather than coercing first) makes the cell fail loudly if it is run on
# non-string data, e.g. re-run after Price has already become float.
priceParts = [price.split('/') for price in df['Price']]

# First piece is the amount; keep the frame's index so the assignment
# below aligns row for row.
priceAmounts = pd.Series([parts[0] for parts in priceParts], index=df.index)

# Second piece is the unit; rows without one get an empty string. An
# explicit length check replaces the former bare `except:`.
df['Price Type'] = [parts[1] if len(parts) > 1 else "" for parts in priceParts]

# Strip thousands separators (literal comma, not a regex), then convert.
# The string 'nan' converts to a float NaN.
df['Price'] = priceAmounts.str.replace(",", "", regex=False).astype(float)

In [8]:
# Re-inspect the dtypes after the Price clean-up: Price should now be
# float64 and the new "Price Type" column should be object.
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Attach the service metadata from the lookup table, matching the numeric
# "Type" code in df against the lookup's "ID" column.
serviceDf = pd.read_csv('../src/all_services.csv')

# Rename Type -> ID so both sides share the join key, left-join so every
# listing is kept even when its code has no lookup row, then drop the key.
dfMerge = (
    df.rename(columns={"Type": "ID"})
    .merge(serviceDf, on="ID", how="left")
    .drop(columns="ID")
)

# A left join multiplies rows when the lookup holds the same ID more than
# once, which would silently inflate every count and sum computed later.
assert len(dfMerge) == len(df), (
    "merge changed the row count; check the service lookup for duplicate IDs"
)
df = dfMerge

In [10]:
# Shape after the merge. The row count should match the pre-merge count;
# only the number of columns is expected to change.
df.shape

(25977, 9)

In [11]:
# Show rows that exactly repeat an earlier row across all columns.
# duplicated() flags the second and later occurrences, not the first.
duplicateMask = df.duplicated()
dfDuplicate = df.loc[duplicateMask]
dfDuplicate

# Any rows listed above still need handling: this cell only displays them
# and leaves df unchanged.

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments
8815,Jnana Massage Therapy,5.0,,80.0,85001,,Massage Therapy,Main,Wellness


In [12]:
# Summary statistics for the numeric columns.
# Zip Code is stored as an integer, so it is summarized too, although its
# mean/std/quantiles carry no real meaning.
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,24693.0,18239.0,17999.0,25977.0
mean,4.936561,69.36263,131.802989,49991.748239
std,0.229574,149.49329,141.024769,31689.097766
min,1.0,2.0,1.0,2201.0
25%,5.0,8.0,60.0,21202.0
50%,5.0,22.0,99.0,50309.0
75%,5.0,59.0,150.0,78701.0
max,5.0,2041.0,6000.0,99801.0


In [13]:
# Number of listings per service, largest first, plus each service's share
# of all counted listings.
servicesCount = (
    df.groupby("Services")["Services"]
    .count()
    .to_frame(name="Count")
    .reset_index()
    .sort_values(by="Count", ascending=False)
)
totalListings = servicesCount["Count"].sum()
servicesCount["Contribution"] = servicesCount["Count"] / totalListings
servicesCount

Unnamed: 0,Services,Count,Contribution
21,Statistical Data Analysis,2730,0.105093
12,Life Coaching,2730,0.105093
16,Personal Training,2730,0.105093
15,Nutritionist,2730,0.105093
4,Computer Repair,2730,0.105093
13,Logo Design,2730,0.105093
0,Accounting,1372,0.052816
20,Roof Repair or Maintenance,1147,0.044154
2,Business Tax Preparation,1140,0.043885
10,Interior Design,993,0.038226


In [14]:
def _revenue_contribution(frame, key):
    """Total "Implied Revenue" per group plus each group's share of the sum.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an "Implied Revenue" column and the ``key`` column.
    key : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``key``, "Implied Revenue" (group
        sum) and "Contribution" (group sum divided by the sum over all
        groups).
    """
    totals = frame.groupby(key)["Implied Revenue"].sum().to_frame(
        name="Implied Revenue").reset_index()
    totals["Contribution"] = (
        totals["Implied Revenue"] / totals["Implied Revenue"].sum())
    return totals


# Implied revenue = listed price x number of hires. Rows missing either
# value get NaN here, and sum() skips NaN, so they add nothing below.
# NOTE(review): Price is multiplied as-is regardless of "Price Type"
# (per hour, per session, ...), so the amounts are not on one common unit.
df["Implied Revenue"] = df["Price"] * df["Hires"]

# Get sum of implied revenue by segment
dfRevenuBySegment = _revenue_contribution(df, "Segments")

# Get sum of implied revenue by services
dfRevenuByServices = _revenue_contribution(df, "Services")

# Get sum of implied revenue by zip code. The variable name is kept for
# the cells that use it, although the grouping key is "Zip Code".
dfRevenuByRevenue = _revenue_contribution(df, "Zip Code")

In [15]:
# Segments ranked by their share of total implied revenue, largest first.
dfRevenuBySegment.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Segments,Implied Revenue,Contribution
0,Business,112629562.0,0.61034
3,Wellness,64214887.0,0.347981
1,Home Improvement,7152116.0,0.038757
2,Pets,539208.0,0.002922


In [16]:
# Services ranked by their share of total implied revenue, largest first.
dfRevenuByServices.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Services,Implied Revenue,Contribution
13,Logo Design,83128150.0,0.450472
15,Nutritionist,31362101.0,0.169951
16,Personal Training,20585865.0,0.111555
4,Computer Repair,13555566.0,0.073458
2,Business Tax Preparation,13249306.0,0.071798
12,Life Coaching,8646424.0,0.046855
14,Massage Therapy,3042009.0,0.016485
9,House Cleaning,2896900.0,0.015698
0,Accounting,2157469.0,0.011691
8,Handyman,1702886.0,0.009228


In [17]:
# Zip codes ranked by their share of total implied revenue, largest first.
# Despite its name, dfRevenuByRevenue is grouped by "Zip Code".
dfRevenuByRevenue.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Zip Code,Implied Revenue,Contribution
79,90013,2609801.0,0.014143
81,94108,2606979.0,0.014127
70,80202,2528940.0,0.013704
13,10007,2508203.0,0.013592
10,6604,2457283.0,0.013316
...,...,...,...
43,48933,1709284.0,0.009263
3,3301,1701114.0,0.009218
88,98507,1698322.0,0.009203
50,57501,1669839.0,0.009049


In [18]:
# Rearrange columns: grouping fields first, then the listing itself, then
# its metrics.
orderedColumns = [
    'Segments',
    'Services',
    'Service',
    'Type',
    'Hires',
    'Price',
    'Price Type',
    'Rating',
    'Zip Code',
    'Implied Revenue',
]
df = df[orderedColumns]

In [19]:
# Confirm the final column order; prints the column labels as an array.
print(df.columns.values)

['Segments' 'Services' 'Service' 'Type' 'Hires' 'Price' 'Price Type'
 'Rating' 'Zip Code' 'Implied Revenue']


In [20]:
# Write the final frame to all_services.csv in the current working
# directory. With to_csv's default index=True the row index is written as
# the first, unnamed column.
# NOTE(review): this shares its file name with the '../src/all_services.csv'
# lookup read earlier in the notebook. Confirm the working directory is not
# ../src, otherwise this call overwrites the lookup table.
df.to_csv("all_services.csv")