In [1]:
import os
import glob
import pandas as pd
import matplotlib
import numpy as np
from numpy import NaN
import matplotlib.pyplot as plt

In [2]:
# Combine every CSV file found in the data directory into a single frame.
path = r'../data'
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order of the combined frame reproducible across runs and machines.
all_csv = sorted(glob.glob(path + "/*.csv"))
if not all_csv:
    # Fail with a clear message instead of pd.concat's
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# index_col=0 uses the first CSV column as the row index; thousands=','
# lets comma-grouped numbers parse as numeric values.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

# Drop the per-file indexes and renumber the combined rows from 0.
df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,LawnStarter Lawn Care,160,4.0,161.0,29.0,92101
1,JF Landscaping,160,4.5,65.0,35.0,92101
2,CJ’s landscaping,160,5.0,82.0,,92101
3,Fuentes Services,160,5.0,48.0,45.0,92101
4,MP Construction,160,5.0,92.0,,92101


In [3]:
# Inspect the column dtypes (inspection only; nothing is modified in this cell).
# Price is converted to float in a later cell.
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [4]:
# Look at the first 19 distinct values of the Price column
df['Price'].unique()[:19]

# Any '<amount>/hour' strings seen here need to be split into a numeric
# amount and a separate price-type flag before Price can be used as a number.

array([29.0, 35.0, nan, 45.0, 100.0, 60.0, 140.0, 120.0, 25.0, '75/hour',
       '85/hour', '80/hour', '100/hour', '95/hour', '125/hour', '60/hour',
       '135/hour', '165/hour', '70/hour'], dtype=object)

In [5]:
# Split the raw Price column into a numeric amount and a billing-unit flag.
# Work on a string view so numeric and '<amount>/hour' entries are handled
# uniformly; missing prices become the string 'nan' here and turn back into
# NaN in the float conversion below.
price_text = df['Price'].astype(str)

# Numeric part: everything before the first '/'.
tempPrice = price_text.str.split('/').str[0]

# Billing unit: 'hourly' where the raw value mentions 'hour', missing otherwise.
# np.nan is used rather than the bare `NaN` alias, which NumPy 2.0 removed.
tempPriceType = price_text.str.contains('hour', regex=False).map(
    {True: 'hourly', False: np.nan}
)

if 'Price Type' in df.columns:
    # The cell is being re-run: Price is already numeric, so the '/hour'
    # suffix is gone. Keep the flags recorded by the earlier run instead of
    # overwriting them all with NaN.
    tempPriceType = tempPriceType.combine_first(df['Price Type'])

# Write both columns back; Price becomes float64.
df['Price'] = tempPrice.astype(float)
df['Price Type'] = tempPriceType

In [6]:
# Re-check the dtypes after the conversion: Price should now be float64 and
# the new 'Price Type' column should be present.
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [7]:
# Attach the service metadata from the services reference table.
serviceDf = pd.read_csv('../src/all_services.csv')

# The numeric 'Type' column is the join key: expose it as 'ID', left-join so
# every row of df is kept, then discard the key column.
dfMerge = (
    df.rename(columns={"Type": "ID"})
    .merge(serviceDf, how='left', on='ID')
    .drop(columns='ID')
)
df = dfMerge

In [8]:
# Row and column counts of the merged frame
df.shape

(234, 9)

In [9]:
# Collect rows that exactly repeat an earlier row (the first occurrence is not flagged)
dfDuplicate = df.loc[df.duplicated()]
dfDuplicate

# An empty result means there is nothing to clean up; otherwise handle the repeats before continuing

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments


In [10]:
# Summary statistics for the numeric columns.
# Zip Code is included only because it is stored as an integer; its
# mean/std are not meaningful.
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,215.0,158.0,163.0,234.0
mean,4.82093,51.917722,142.386503,81780.735043
std,0.489878,93.233968,178.771229,21172.445519
min,1.0,2.0,1.0,35210.0
25%,5.0,6.0,75.0,85001.0
50%,5.0,20.0,95.0,90013.0
75%,5.0,60.75,131.5,92101.0
max,5.0,848.0,1295.0,99501.0


In [11]:
# Number of rows per service
df.groupby("Services").size().reset_index(name="Count")

Unnamed: 0,Services,Count
0,Custom Airbrushing,49
1,Gutter Repair,8
2,Hot Tub and Spa Repair,48
3,Insulation Installation or Upgrade,89
4,Lawn Mower Repair,40


In [12]:
# Display the merged frame
df

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments
0,LawnStarter Lawn Care,4.0,161.0,29.0,92101,,Lawn Mower Repair,Peripheral,Home Improvement
1,JF Landscaping,4.5,65.0,35.0,92101,,Lawn Mower Repair,Peripheral,Home Improvement
2,CJ’s landscaping,5.0,82.0,,92101,,Lawn Mower Repair,Peripheral,Home Improvement
3,Fuentes Services,5.0,48.0,45.0,92101,,Lawn Mower Repair,Peripheral,Home Improvement
4,MP Construction,5.0,92.0,,92101,,Lawn Mower Repair,Peripheral,Home Improvement
...,...,...,...,...,...,...,...,...,...
229,Karen Hernandez,,,,90013,,Hot Tub and Spa Repair,Peripheral,Home Improvement
230,Matthew Environmental Janitorial,5.0,7.0,80.0,90013,,Hot Tub and Spa Repair,Peripheral,Home Improvement
231,house Cleaning,4.5,5.0,90.0,90013,,Hot Tub and Spa Repair,Peripheral,Home Improvement
232,Animal House Janitorial and Maintenance,4.5,89.0,40.0,99501,,Lawn Mower Repair,Peripheral,Home Improvement


In [13]:
def _revenue_share(frame, key):
    """Total 'Implied Revenue' per value of `key`, plus each group's share.

    Returns a frame with the columns `key`, 'Implied Revenue' and
    'Contribution' (the group's total divided by the total over all groups).
    """
    totals = frame.groupby(key, as_index=False)["Implied Revenue"].sum()
    return totals.assign(
        Contribution=lambda t: t["Implied Revenue"] / t["Implied Revenue"].sum()
    )


# Implied revenue per row = price x number of hires.
# Hourly and flat prices are multiplied alike, and rows missing either value
# yield NaN, which the sums below skip.
df["Implied Revenue"] = df["Price"].mul(df["Hires"])

# Implied revenue and share of the total, per segment
dfRevenuBySegment = _revenue_share(df, "Segments")

# Implied revenue and share of the total, per service
dfRevenuByServices = _revenue_share(df, "Services")

In [14]:
# Segments ranked by their share of total implied revenue, largest first
dfRevenuBySegment.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Segments,Implied Revenue,Contribution
0,Events,613460.0,0.660939
1,Home Improvement,314705.0,0.339061


In [15]:
# Services ranked by their share of total implied revenue, largest first
dfRevenuByServices.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Services,Implied Revenue,Contribution
0,Custom Airbrushing,613460.0,0.660939
2,Hot Tub and Spa Repair,177033.0,0.190734
3,Insulation Installation or Upgrade,69005.0,0.074346
4,Lawn Mower Repair,60647.0,0.065341
1,Gutter Repair,8020.0,0.008641
