In [1]:
import os
import glob
import pandas as pd
import matplotlib
import numpy as np
from numpy import NaN
import matplotlib.pyplot as plt

In [2]:
# Combine all the CSV files
path = r'../data'
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the combined frame is the same on every run and machine.
all_csv = sorted(glob.glob(path + "/*.csv"))
if not all_csv:
    # Fail with a clear message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# The first column of each file is used as its index, the first row as the
# header, and ',' is treated as a thousands separator in numeric values.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

# Stack the per-file frames and renumber the rows 0..n-1.
df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,Second Growth Homes,123,,,,99801
1,All-Pro Contractors,129,5.0,276.0,,48226
2,Foster Exp.,129,5.0,49.0,42/hour,48226
3,B-Major Carpentry,129,4.5,12.0,75/hour,48226
4,Madison Property Services,129,4.5,12.0,80/hour,48226


In [3]:
# Row and column counts of the combined frame
df.shape

(4059, 6)

In [4]:
# Check dtypes (inspection only; the Price column is converted in later cells)
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# See some price values (the first 19 distinct values)
df['Price'].unique()[:19]

# Need to fix that hour thing: some prices are strings with a '/<unit>'
# suffix (e.g. '42/hour') mixed in with plain numbers

array([nan, '42/hour', '75/hour', '80/hour', 29.0, 35.0, 45.0, 40.0, 20.0,
       30.0, 36.0, '49/on-site', '100/consult', '55/consult', 100.0,
       120.0, 130.0, 125.0, 50.0], dtype=object)

In [6]:
# See different types of payment
# Cast every price to str so it can be split on '/'. Note that this turns
# missing values into the literal string 'nan' and numbers into e.g. '29.0'.
df['Price'] = df['Price'].astype(str)

# One list of '/'-separated parts per row; a price without '/' gives a
# single-element list.
tempList = [price.split('/') for price in df["Price"]]

# Keep the part after the first '/' where there is one (explicit length check
# rather than a bare except), de-duplicated in first-seen order.
checkUnique = list(
    dict.fromkeys(parts[1] for parts in tempList if len(parts) > 1))
checkUnique

['hour', 'on-site', 'consult']

In [7]:
# Move the unit after '/' (e.g. 'hour') into its own column.
# Relies on df['Price'] already holding strings (it is cast with astype(str)
# in the previous cell).
tempPrice = []
tempPriceType = []
for raw_price in df['Price']:
    parts = raw_price.split('/')
    tempPrice.append(parts[0])
    # Prices without a '/' carry no unit; record an empty string for them.
    tempPriceType.append(parts[1] if len(parts) > 1 else "")

# Add it to dataframe and check
df['Price'] = tempPrice
df['Price Type'] = tempPriceType
df['Price Type'] = df['Price Type'].astype(str)

# Convert price column to float: strip any remaining ',' as a literal
# character first. The string 'nan' parses to a float NaN.
df['Price'] = df['Price'].str.replace(",", "", regex=False).astype(float)

In [8]:
# Double check data types after the split (Price should now be numeric)
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Add the service type from database
serviceDf = pd.read_csv('../src/all_services.csv')

# Rename 'Type' to 'ID' so both frames share the join key, attach the lookup
# columns with a left join, then drop the key.
dfMerge = df.rename(columns={"Type": "ID"})
dfMerge = pd.merge(dfMerge, serviceDf, on='ID', how='left').drop('ID', axis=1)

# A left join keeps every left row but adds extra ones when a key matches more
# than one lookup row; fail loudly instead of silently inflating the data.
assert len(dfMerge) == len(df), (
    "merge changed the row count - duplicate IDs in all_services.csv?")
df = dfMerge

In [10]:
# See shape after the merge (compare the row count with the pre-merge shape)
df.shape

(4059, 9)

In [11]:
# Check for duplicates
# Mask of rows that exactly repeat an earlier row (first occurrences are not
# flagged).
isRepeat = df.duplicated()
dfDuplicate = df[isRepeat]
dfDuplicate

# Deal with duplicates accordingly if there are any

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments


In [12]:
# See overview of data (summary statistics)
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,3762.0,2769.0,1704.0,4059.0
mean,4.782961,56.390033,97.628521,56799.934713
std,0.432967,137.418621,220.011663,29242.842892
min,1.0,2.0,1.0,2201.0
25%,4.5,6.0,50.0,30303.0
50%,5.0,17.0,80.0,55404.0
75%,5.0,48.0,109.25,84111.0
max,5.0,2040.0,6000.0,99801.0


In [13]:
# See basic count
# Number of rows per service, largest first, with each service's share of the
# total row count.
servicesCount = (
    df.groupby("Services")["Services"]
    .count()
    .to_frame(name="Count")
    .reset_index()
    .sort_values(by="Count", ascending=False)
)
totalCount = servicesCount["Count"].sum()
servicesCount["Contribution"] = servicesCount["Count"] / totalCount
servicesCount

Unnamed: 0,Services,Count,Contribution
9,Roof Installation or Replacement,956,0.235526
6,Insulation Installation or Upgrade,828,0.203991
4,Hot Tub and Spa Repair,596,0.146834
8,Lawn Mower Repair,445,0.109633
0,General Carpentry,396,0.097561
2,Gutter Repair,356,0.087706
5,In-Ground Swimming Pool Construction,247,0.060852
1,Gutter Installation or Replacement,127,0.031288
3,Hot Tub and Spa Installation,70,0.017246
7,Land Surveying,38,0.009362


In [14]:
# Get implied revenue
# NOTE: a row missing either Price or Hires yields NaN here, and Price mixes
# amounts with different units (see 'Price Type'), so this is a rough proxy.
df["Implied Revenue"] = df["Price"] * df["Hires"]


def _revenue_by(frame, key):
    """Sum 'Implied Revenue' per value of ``key`` and add each group's share.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the ``key`` column and an 'Implied Revenue' column.
    key : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns ``key``, 'Implied Revenue' (group sum;
        NaN rows are skipped, so a group with no usable rows sums to 0.0) and
        'Contribution' (group sum divided by the total over all groups).
    """
    totals = frame.groupby(key)["Implied Revenue"].sum().to_frame(
        name="Implied Revenue").reset_index()
    totals["Contribution"] = (
        totals["Implied Revenue"] / totals["Implied Revenue"].sum())
    return totals


# Get sum of implied revenue by segment
dfRevenuBySegment = _revenue_by(df, "Segments")

# Get sum of implied revenue by services
dfRevenuByServices = _revenue_by(df, "Services")

# Get sum of implied revenue by zipcodes
# (variable name kept because later cells reference it; the key is 'Zip Code')
dfRevenuByRevenue = _revenue_by(df, "Zip Code")

In [15]:
# Segments ranked by their share of total implied revenue, largest first
dfRevenuBySegment.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Segments,Implied Revenue,Contribution
0,Home Improvement,6810922.0,1.0


In [16]:
# Services ranked by their share of total implied revenue, largest first
dfRevenuByServices.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Services,Implied Revenue,Contribution
4,Hot Tub and Spa Repair,2750469.0,0.403832
2,Gutter Repair,1570004.0,0.230513
8,Lawn Mower Repair,1187134.0,0.174299
6,Insulation Installation or Upgrade,633311.0,0.092985
9,Roof Installation or Replacement,434815.0,0.063841
3,Hot Tub and Spa Installation,108995.0,0.016003
7,Land Surveying,105170.0,0.015441
0,General Carpentry,21024.0,0.003087
1,Gutter Installation or Replacement,0.0,0.0
5,In-Ground Swimming Pool Construction,0.0,0.0


In [17]:
# Zip codes ranked by their share of total implied revenue, largest first
# (despite its name, this frame is grouped by 'Zip Code')
dfRevenuByRevenue.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Zip Code,Implied Revenue,Contribution
79,94108,415415.0,0.060992
19,20001,361034.0,0.053008
64,75207,344924.0,0.050643
80,95112,333117.0,0.048909
65,77002,306766.0,0.045040
...,...,...,...
5,4330,0.0,0.000000
4,4101,0.0,0.000000
3,3301,0.0,0.000000
2,3108,0.0,0.000000


In [18]:
# Rearrange columns
columnOrder = [
    'Segments', 'Services', 'Service', 'Type', 'Hires', 'Price', 'Price Type',
    'Rating', 'Zip Code', 'Implied Revenue'
]
df = df[columnOrder]

In [19]:
# See columns (confirm the new column order)
print(df.columns.values)

['Segments' 'Services' 'Service' 'Type' 'Hires' 'Price' 'Price Type'
 'Rating' 'Zip Code' 'Implied Revenue']


In [20]:
# Save the cleaned frame to the current working directory. The row index is
# written as the first column because index=False is not passed.
# NOTE(review): this file name matches the lookup table read from
# '../src/all_services.csv' earlier - confirm the working directory is not
# '../src', otherwise the lookup table gets overwritten.
df.to_csv("all_services.csv")