In [1]:
import os
import glob
import pandas as pd
import matplotlib
import numpy as np
from numpy import NaN
import matplotlib.pyplot as plt

In [2]:
# Combine every CSV file in the data directory into a single dataframe.
path = r'../data'
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# concatenated row order (and the regenerated 0..n-1 index) reproducible.
all_csv = sorted(glob.glob(path + "/*.csv"))

# Fail early with a clear message instead of pd.concat's generic
# "No objects to concatenate" when the directory is empty or mistyped.
if not all_csv:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# The first column of each file is used as its row index (discarded again by
# ignore_index below); thousands=',' lets numbers written like "1,500" parse.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,All-Pro Contractors,129,5.0,276.0,,48226
1,Foster Exp.,129,5.0,49.0,42/hour,48226
2,B-Major Carpentry,129,4.5,12.0,75/hour,48226
3,Madison Property Services,129,4.5,12.0,80/hour,48226
4,LawnStarter Lawn Care,160,4.0,362.0,29.0,50309


In [3]:
# Inspect the column dtypes (this cell only checks; Price is converted in a later cell)
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [4]:
# Peek at the first 19 distinct Price values
df['Price'].unique()[:19]

# Some values carry a '/hour' suffix; it has to be split off before Price can be numeric

array([nan, '42/hour', '75/hour', '80/hour', 29.0, 35.0, 45.0, 40.0, 20.0,
       30.0, 36.0, 100.0, 120.0, 130.0, 125.0, 50.0, 33.0, 105.0, 80.0],
      dtype=object)

In [5]:
# Split the raw Price values (e.g. '42/hour', 29.0, NaN) into a numeric price
# and a separate "Price Type" flag.
price_text = df['Price'].astype(str)

# Flag hourly rates with a substring test. The previous `find('hour') > 0`
# check missed a match at position 0.
is_hourly = price_text.str.contains('hour', regex=False, na=False)

# Keep flags from an earlier run of this cell: once Price is numeric the
# '/hour' suffix is gone, so a re-run would otherwise wipe every 'hourly' flag.
if 'Price Type' in df.columns:
    is_hourly = is_hourly | df['Price Type'].eq('hourly')

# Keep only the amount before the '/', drop thousands separators so values
# such as '1,500/hour' still parse, then convert to float (missing -> NaN).
df['Price'] = (
    price_text
    .str.split('/').str[0]
    .str.replace(',', '', regex=False)
    .astype(float)
)

# 'hourly' for hourly rates, NaN otherwise. np.nan is used rather than the
# `NaN` alias, which no longer exists in NumPy 2.0.
df['Price Type'] = is_hourly.map({True: 'hourly', False: np.nan})

In [6]:
# Confirm the dtypes after the conversion: Price should now be float64
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [7]:
# Attach the service metadata from the lookup file
serviceDf = pd.read_csv('../src/all_services.csv')

# df's "Type" column holds the service ID: rename it to match the lookup key,
# left-join so every row of df is kept, then drop the key column again.
dfMerge = (
    df.rename(columns={"Type": "ID"})
      .merge(serviceDf, on='ID', how='left')
      .drop(columns='ID')
)
df = dfMerge

In [8]:
# Row and column counts of the merged dataframe
df.shape

(2261, 9)

In [9]:
# Collect rows that repeat an earlier row in every column
duplicate_mask = df.duplicated()
dfDuplicate = df.loc[duplicate_mask]
dfDuplicate

# If the frame above is not empty, the duplicates still need to be dealt with

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments


In [10]:
# Summary statistics for the numeric columns
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,2046.0,1506.0,1537.0,2261.0
mean,4.789345,70.660027,86.070917,55896.628925
std,0.435698,172.822048,55.771631,28576.835538
min,1.0,2.0,1.0,10007.0
25%,4.5,7.0,50.0,30303.0
50%,5.0,19.0,82.0,55102.0
75%,5.0,53.0,102.0,83702.0
max,5.0,2040.0,1500.0,99801.0


In [11]:
# Row count per service, largest first
servicesCount = (
    df.groupby("Services")["Services"]
      .count()
      .reset_index(name="Count")
)
servicesCount.sort_values(by="Count", ascending=False)

Unnamed: 0,Services,Count
2,Insulation Installation or Upgrade,828
1,Hot Tub and Spa Repair,596
3,Lawn Mower Repair,445
0,Gutter Repair,356
4,Roof Installation or Replacement,36


In [12]:
# Implied revenue per row: price times number of hires.
# Rows missing either value get NaN and are skipped by the sums below, so a
# group with no usable rows shows a total of 0 rather than NaN.
# NOTE(review): hourly prices are multiplied the same way as flat prices - confirm that is intended.
df["Implied Revenue"] = df["Price"] * df["Hires"]


def revenue_contribution(frame, key):
    """Sum implied revenue per group and compute each group's share of the total.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain an "Implied Revenue" column and the `key` column.
    key : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        Columns [key, "Implied Revenue", "Contribution"], one row per group.
        "Contribution" is the group's total divided by the sum of all groups.
    """
    totals = frame.groupby(key)["Implied Revenue"].sum().reset_index()
    totals["Contribution"] = totals["Implied Revenue"] / totals["Implied Revenue"].sum()
    return totals


# Implied revenue by segment
dfRevenuBySegment = revenue_contribution(df, "Segments")

# Implied revenue by service
dfRevenuByServices = revenue_contribution(df, "Services")

# Implied revenue by zip code
dfRevenuByZipCode = revenue_contribution(df, "Zip Code")
# Original (misleading) name kept as an alias so existing references keep working
dfRevenuByRevenue = dfRevenuByZipCode

In [13]:
# Segments ranked by their share of total implied revenue
dfRevenuBySegment.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Segments,Implied Revenue,Contribution
0,Home Improvement,6140918.0,1.0


In [14]:
# Services ranked by their share of total implied revenue
dfRevenuByServices.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Services,Implied Revenue,Contribution
1,Hot Tub and Spa Repair,2750469.0,0.447892
0,Gutter Repair,1570004.0,0.255663
3,Lawn Mower Repair,1187134.0,0.193315
2,Insulation Installation or Upgrade,633311.0,0.10313
4,Roof Installation or Replacement,0.0,0.0


In [15]:
# Zip codes ranked by their share of total implied revenue
# (despite its name, dfRevenuByRevenue is grouped by "Zip Code")
dfRevenuByRevenue.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Zip Code,Implied Revenue,Contribution
67,94108,399690.0,0.065086
7,20001,351988.0,0.057318
52,75207,337763.0,0.055002
68,95112,310742.0,0.050602
53,77002,302216.0,0.049213
...,...,...,...
36,57103,1620.0,0.000264
37,58102,1204.0,0.000196
38,58501,0.0,0.000000
39,59623,0.0,0.000000


In [16]:
# Save the processed dataframe as CSV in the current working directory (row index included)
df.to_csv("all_services.csv")