In [1]:
import os
import glob
import pandas as pd

# Show floats with thousands separators and two decimals in displayed output
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Combine all the CSV files
path = r'../data'

# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the combined row order (and the positional index) the same on every run.
all_csv = sorted(glob.glob(path + "/*.csv"))
if not all_csv:
    # Without this check pd.concat fails with a less helpful
    # "No objects to concatenate" error.
    raise FileNotFoundError("No CSV files found in {!r}".format(path))

# index_col=0 uses each file's first column as the row index; that index is
# then replaced by a fresh 0..n-1 range because of ignore_index=True below.
# thousands=',' lets numbers written like "1,200" parse as numeric.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,"Katie Beth Nutrition, Registered Dietitian",12,5.0,67.0,495.0,36104
1,Cassi Nunes,12,5.0,164.0,249.0,36104
2,Beau Gardner,12,5.0,142.0,100.0,36104
3,Nolan King's Nutrition and Training,12,5.0,744.0,199.0,36104
4,Weightloss For Busy Proffessionals,12,5.0,,,36104


In [3]:
# (rows, columns) of the combined frame
df.shape

(28416, 6)

In [4]:
# Check the column dtypes. This cell only inspects them; Price shows up as
# object here and is converted to float in a later cell.
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# Peek at the first 19 distinct Price values
df['Price'].unique()[:19]

# Some prices carry a unit after a slash (e.g. "/hour"); the unit is split
# off into its own column in a later cell.

array([495.0, 249.0, 100.0, 199.0, nan, 170.0, 57.0, 125.0, 99.0, 35.0,
       175.0, 75.0, 80.0, 300.0, 200.0, 40.0, 85.0, 79.0, 120.0],
      dtype=object)

In [6]:
# See different types of payment
# Cast to str so every value can be split on '/'; missing prices become the
# string 'nan'. The cast is kept on df because the next cell relies on Price
# holding strings.
df['Price'] = df['Price'].astype(str)

# One list of '/'-separated parts per row; a price without a slash gives a
# single-item list.
tempList = [price.split('/') for price in df["Price"]]

# Collect the part after the first slash wherever there is one. dict keys
# keep insertion order, so duplicates are dropped in first-seen order.
checkUnique = list(
    dict.fromkeys(parts[1] for parts in tempList if len(parts) > 1)
)
checkUnique

['on-site', 'session', 'logo', 'hour', 'walk', 'cat', 'visit']

In [7]:
# Move the unit after the slash (e.g. "hour") into its own column.
# Price must hold strings at this point; the previous cell casts it with
# astype(str). A float value here raises AttributeError on .split.
priceParts = [price.split('/') for price in df['Price']]

# Amount is the text before the first slash; the unit is the text after it,
# or "" when the price has no slash.
tempPrice = [parts[0] for parts in priceParts]
tempPriceType = [parts[1] if len(parts) > 1 else "" for parts in priceParts]

# Add it to dataframe and check
df['Price'] = tempPrice
df['Price Type'] = tempPriceType

# Convert price column to float: strip thousands separators first. The 'nan'
# strings created by astype(str) parse back to NaN. regex=False makes the
# comma a literal match regardless of the pandas version's default.
df['Price'] = df['Price'].str.replace(",", "", regex=False).astype(float)

In [8]:
# Double check data types: Price should now be float64 and Price Type present
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Add the service type from database
serviceDf = pd.read_csv('../src/all_services.csv')

# df's numeric 'Type' column is the key into the lookup's 'ID' column.
# Renaming it to 'ID' gives both sides the same key name and avoids a clash
# with the 'Type' column that the lookup contributes (seen in the merged
# output).
dfMerge = df.rename(columns={"Type": "ID"})
dfMerge = pd.merge(dfMerge, serviceDf, left_on='ID', right_on='ID',
                   how='left').drop('ID', axis=1)

# A left merge never drops rows, but it repeats a row for every matching
# lookup row. Stop before replacing df if a non-unique lookup key has
# inflated the data.
assert len(dfMerge) == len(df), (
    "service lookup merge changed the row count: "
    "{} -> {}".format(len(df), len(dfMerge))
)
df = dfMerge

In [10]:
# Spot-check one randomly chosen row after the merge (no random_state is
# set, so a different row is shown on each run)
df.sample()

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments
3008,Sebsbest LLC.,5.0,3.0,40.0,53205,,Nutritionist,Main,Wellness


In [11]:
# Attach the location columns from the state lookup, matching df's
# 'Zip Code' against the lookup's 'Representative ZIP Code'.
zipCodes = pd.read_csv('../src/us_states.csv')

# The lookup's key column is dropped afterwards because it repeats 'Zip Code'
# for matched rows.
dfMerge = pd.merge(df, zipCodes, left_on='Zip Code', right_on='Representative ZIP Code',
                   how='left').drop('Representative ZIP Code', axis=1)

# A left merge never drops rows, but it repeats a row for every matching
# lookup row. Stop before replacing df if a non-unique ZIP key has inflated
# the data.
assert len(dfMerge) == len(df), (
    "zip code lookup merge changed the row count: "
    "{} -> {}".format(len(df), len(dfMerge))
)

df = dfMerge

In [12]:
# See shape after both merges; left merges should leave the row count
# unchanged and only add columns
df.shape

(28416, 12)

In [13]:
# Check for duplicates: rows that match an earlier row in every column
# (the first occurrence itself is not flagged)
duplicateMask = df.duplicated()
dfDuplicate = df.loc[duplicateMask]
dfDuplicate

# Deal with duplicates accordingly if there are any
# NOTE(review): this cell only displays them; none of the cells shown here
# removes duplicate rows, so they reach the exported file. Confirm this is
# intended.

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments,Capital City,State,Abbreviation
9775,Jnana Massage Therapy,5.0,,80.0,85001,,Massage Therapy,Main,Wellness,Phoenix,Arizona,AZ


In [14]:
# See overview of data: summary statistics for the numeric columns
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,27067.0,20342.0,20174.0,28416.0
mean,4.93,67.99,142.87,49827.88
std,0.24,144.1,143.98,31695.96
min,1.0,2.0,1.0,2201.0
25%,5.0,8.0,65.0,21202.0
50%,5.0,22.0,100.0,50309.0
75%,5.0,59.0,175.0,78701.0
max,5.0,2041.0,6000.0,99801.0


In [15]:
# Get implied revenue: listed price multiplied by number of hires, row by
# row. Rows missing either value get NaN.
# NOTE(review): Price may be per hour/session/visit (see Price Type), so the
# figures may not be comparable across rows. Confirm.
df["Implied Revenue"] = df["Price"].mul(df["Hires"])

In [16]:
# See columns: list the current column names before reordering them
print(df.columns.values)

['Service' 'Rating' 'Hires' 'Price' 'Zip Code' 'Price Type' 'Services'
 'Type' 'Segments' 'Capital City' 'State' 'Abbreviation' 'Implied Revenue']


In [17]:
# Rearrange columns into the order used for the export
columnOrder = [
    'Segments', 'Services', 'Type', 'Service',
    'Hires', 'Price', 'Price Type', 'Rating',
    'Zip Code', 'Capital City', 'State', 'Abbreviation',
    'Implied Revenue',
]
# Selecting by label list raises KeyError if any listed column is missing.
df = df.loc[:, columnOrder]

In [18]:
# Write the final table. to_csv does not create missing folders, so make sure
# the output directory exists first.
os.makedirs("../output", exist_ok=True)
# The row index is written as the first, unnamed column (to_csv default).
df.to_csv("../output/all_services.csv")