In [1]:
import os
import glob
import pandas as pd
import matplotlib
import numpy as np
from numpy import NaN
import matplotlib.pyplot as plt

In [2]:
# Combine all the CSV files found directly under ../data into one frame
path = r'../data'
# glob.glob returns paths in arbitrary, filesystem-dependent order; sorting
# keeps the concatenated row order (and the head() preview) reproducible.
all_csv = sorted(glob.glob(path + "/*.csv"))
if not all_csv:
    # Fail with a message that names the directory instead of letting
    # pd.concat complain about an empty list.
    raise FileNotFoundError("No CSV files found in " + path)

# The first column of each file is read as its index; ',' is treated as a
# thousands separator when parsing numbers.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

# ignore_index discards the per-file indexes and numbers the rows 0..n-1
df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,Second Growth Homes,123,,,,99801
1,All-Pro Contractors,129,5.0,276.0,,48226
2,Foster Exp.,129,5.0,49.0,42/hour,48226
3,B-Major Carpentry,129,4.5,12.0,75/hour,48226
4,Madison Property Services,129,4.5,12.0,80/hour,48226


In [3]:
# (rows, columns) of the combined frame
df.shape

(3582, 6)

In [4]:
# Check the dtype pandas inferred for each column (inspection only; Price is
# converted to float in a later cell)
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# Preview the first 19 distinct Price values to see which formats occur
df['Price'].unique()[:19]

# Some prices carry a unit after a slash (e.g. '42/hour'); that suffix needs to be split off

array([nan, '42/hour', '75/hour', '80/hour', 29.0, 35.0, 45.0, 40.0, 20.0,
       30.0, 36.0, '49/on-site', 100.0, 120.0, 130.0, 125.0, 50.0, 33.0,
       105.0], dtype=object)

In [6]:
# See different types of payment
# Cast to str so every value (NaN included, which becomes 'nan') can be split on '/'
df['Price'] = df['Price'].astype(str)

# The payment type is the piece right after the first '/', e.g. '42/hour' -> 'hour'.
# Values without a '/' are skipped by an explicit length check instead of by
# catching (and hiding) every possible exception.
priceParts = (price.split('/') for price in df["Price"])
paymentTypes = [parts[1] for parts in priceParts if len(parts) > 1]

# dict.fromkeys removes repeats while keeping first-seen order
checkUnique = list(dict.fromkeys(paymentTypes))
checkUnique

['hour', 'on-site']

In [7]:
# Split 'amount/unit' prices into a numeric Price and a separate Price Type column
# astype(str) lets the split handle str, float and NaN values alike (NaN becomes
# 'nan', which the float conversion below turns back into NaN).
priceParts = df['Price'].astype(str).str.split('/')

# Text before the first '/' is the amount; the next piece, if any, is the unit.
tempPrice = priceParts.str[0]
tempPriceType = priceParts.str[1]  # NaN where the value had no '/'

# If this cell is re-run after Price is already numeric, no value contains a '/'
# any more, so keep the units extracted on the first run instead of blanking them.
if 'Price Type' in df.columns:
    tempPriceType = tempPriceType.fillna(df['Price Type'])

# Add it to dataframe and check
df['Price'] = tempPrice
df['Price Type'] = tempPriceType.fillna("").astype(str)

# Convert price column to float (regex=False: the comma is a literal character)
df['Price'] = df['Price'].str.replace(",", "", regex=False).astype(float)

In [8]:
# Double check data types
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Add the service type from database
# Lookup table that is joined to the listings on the service ID
serviceDf = pd.read_csv('../src/all_services.csv')

rowsBefore = len(df)
# The listings' numeric 'Type' is the lookup key, so it is renamed to 'ID' for
# the join and dropped afterwards.
dfMerge = (
    df.rename(columns={"Type": "ID"})
      .merge(serviceDf, on='ID', how='left')
      .drop('ID', axis=1)
)
# A left join keeps every listing, but duplicate IDs in the lookup table would
# silently duplicate listings and inflate every count and revenue sum below.
assert len(dfMerge) == rowsBefore, "duplicate IDs in the service lookup duplicated rows during the merge"
df = dfMerge

In [10]:
# See shape after the merge; compare the row count with the pre-merge shape to
# confirm the join did not add rows
df.shape

(3582, 9)

In [11]:
# Check for duplicates
# duplicated() flags each row that repeats an earlier row across all columns
isRepeat = df.duplicated()
dfDuplicate = df.loc[isRepeat]
dfDuplicate

# Deal with duplicates accordingly if there are any

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments


In [12]:
# See overview of data: summary statistics for the numeric columns
# NOTE(review): Zip Code is stored as an integer, so its mean/std are not
# meaningful and any leading zeros are lost — confirm whether it should be a string
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,3298.0,2374.0,1646.0,3582.0
mean,4.779867,56.416175,98.288578,56848.350363
std,0.439615,143.393842,223.657693,28817.978345
min,1.0,2.0,1.0,6103.0
25%,4.5,6.0,50.0,30303.0
50%,5.0,16.0,82.0,55404.0
75%,5.0,46.0,110.0,84111.0
max,5.0,2040.0,6000.0,99801.0


In [13]:
# See basic count
# Listings per service, largest first, with each service's share of all listings
servicesCount = (
    df.groupby("Services")
      .size()
      .reset_index(name="Count")
      .sort_values(by="Count", ascending=False)
      .assign(Contribution=lambda counts: counts["Count"] / counts["Count"].sum())
)
servicesCount

Unnamed: 0,Services,Count,Contribution
6,Roof Installation or Replacement,956,0.26689
4,Insulation Installation or Upgrade,828,0.231156
3,Hot Tub and Spa Repair,596,0.166387
5,Lawn Mower Repair,445,0.124232
0,General Carpentry,396,0.110553
1,Gutter Repair,356,0.099386
2,Hot Tub and Spa Installation,5,0.001396


In [14]:
# Get implied revenue
# Price x Hires per listing; the product is NaN whenever either value is missing.
# NOTE(review): Price mixes per-hour / on-site amounts with untyped ones (see
# Price Type), so this is only a rough proxy — confirm that is intended.
df["Implied Revenue"] = df["Price"] * df["Hires"]


def revenueByGroup(frame, groupColumn):
    """Sum "Implied Revenue" per value of `groupColumn` and add each group's share.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `groupColumn` and an "Implied Revenue" column.
    groupColumn : str
        Column whose distinct values define the groups.

    Returns
    -------
    pandas.DataFrame
        One row per group with the columns [groupColumn, "Implied Revenue",
        "Contribution"]; "Contribution" is the group's fraction of the total
        implied revenue.
    """
    # sum() skips NaN, so a group with no usable listings shows 0.0, not NaN
    summary = frame.groupby(groupColumn)["Implied Revenue"].sum().to_frame(name="Implied Revenue").reset_index()
    summary["Contribution"] = summary["Implied Revenue"] / summary["Implied Revenue"].sum()
    return summary


# Get sum of implied revenue by segment
dfRevenuBySegment = revenueByGroup(df, "Segments")

# Get sum of implied revenue by services
dfRevenuByServices = revenueByGroup(df, "Services")

# Get sum of implied revenue by zipcodes
# (despite its name, dfRevenuByRevenue is keyed by Zip Code; the name is kept
# because a later cell displays it)
dfRevenuByRevenue = revenueByGroup(df, "Zip Code")

In [15]:
# Segments ranked by their share of total implied revenue, largest first
dfRevenuBySegment.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Segments,Implied Revenue,Contribution
0,Home Improvement,6596757.0,1.0


In [16]:
# Services ranked by their share of total implied revenue, largest first
dfRevenuByServices.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Services,Implied Revenue,Contribution
3,Hot Tub and Spa Repair,2750469.0,0.416943
1,Gutter Repair,1570004.0,0.237996
5,Lawn Mower Repair,1187134.0,0.179957
4,Insulation Installation or Upgrade,633311.0,0.096003
6,Roof Installation or Replacement,434815.0,0.065913
0,General Carpentry,21024.0,0.003187
2,Hot Tub and Spa Installation,0.0,0.0


In [17]:
# Zip codes ranked by their share of total implied revenue, largest first
# (dfRevenuByRevenue is the per-Zip-Code summary despite its name)
dfRevenuByRevenue.sort_values(by="Contribution", ascending=False)

Unnamed: 0,Zip Code,Implied Revenue,Contribution
69,94108,402290.0,0.060983
9,20001,360864.0,0.054703
54,75207,344924.0,0.052287
70,95112,319992.0,0.048507
55,77002,306766.0,0.046503
...,...,...,...
0,6103,0.0,0.000000
41,59623,0.0,0.000000
40,58501,0.0,0.000000
1,6604,0.0,0.000000


In [18]:
# Rearrange columns
columnOrder = [
    'Segments', 'Services', 'Service', 'Type', 'Hires',
    'Price', 'Price Type', 'Rating', 'Zip Code', 'Implied Revenue',
]
df = df[columnOrder]

In [19]:
# See columns: confirm the column order after the rearrangement
print(df.columns.values)

['Segments' 'Services' 'Service' 'Type' 'Hires' 'Price' 'Price Type'
 'Rating' 'Zip Code' 'Implied Revenue']


In [20]:
# Write the final frame to all_services.csv in the current working directory.
# to_csv also writes the row index as an unnamed first column by default.
# NOTE(review): this file shares its name with the ../src/all_services.csv
# lookup read earlier — confirm the two cannot be confused or overwritten.
df.to_csv("all_services.csv")