In [1]:
import os
import glob
import pandas as pd

# Render floats with thousands separators and two decimals in displayed output
pd.set_option('display.float_format', '{:,.2f}'.format)
# Turn off pandas' chained-assignment warning for the rest of the notebook
pd.options.mode.chained_assignment = None

In [2]:
# Combine all the CSV files in the data folder into one DataFrame
path = r'../data'
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# combined row order (and any later keep-first de-duplication) reproducible.
all_csv = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_csv:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate"
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# index_col=0 uses each file's first column as its index; ignore_index below
# then replaces those per-file indexes with one fresh 0..n-1 index.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,"MSF Landscapes, Masonry, and Irrigation",47,4.5,49.0,,2903
1,Emi Ferreira Masonry,47,5.0,72.0,,2903
2,27 Newman St,47,5.0,,,2903
3,JFLANDSCAPING,47,5.0,,,2903
4,Leaves land scape inc.,47,3.5,4.0,,2903


In [3]:
# (rows, columns) of the combined frame
df.shape

(71362, 6)

In [4]:
# Inspect the column dtypes (this cell only displays them; nothing is converted here)
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# Show the first 19 distinct raw values of the Price column
df['Price'].unique()[:19]

# Prices are text with a unit suffix after '/' (e.g. '75/hour', '85/service')
# or a range (e.g. '150-$200'); the unit has to be split off before the
# column can be converted to a number.

array([nan, '75/hour', '60/hour', '495/hour', '28/hour', '50/hour',
       '65/hour', '45/hour', '125/hour', '35/hour', '40/hour', '70/hour',
       '39/hour', '55/hour', '85/service', '50/service', '150-$200',
       '15/hour', '36/hour'], dtype=object)

In [6]:
# See the different billing units that follow the '/' in the Price text
# Cast to str so missing values become the text 'nan' and can be split like the rest
df['Price'] = df['Price'].astype(str)

# Each entry is the price text split on '/', e.g. '75/hour' -> ['75', 'hour']
tempList = [price.split('/') for price in df["Price"]]

# Keep the segment after the first '/', skipping values that have no unit.
# An explicit length check replaces the former bare `except: pass`, which
# would also have hidden unrelated errors.
checkUnique = [parts[1] for parts in tempList if len(parts) > 1]

# dict.fromkeys de-duplicates while preserving first-seen order
checkUnique = list(dict.fromkeys(checkUnique))
checkUnique

['hour',
 'service',
 'session',
 'on-site',
 'logo',
 'lesson',
 'consult',
 'walk',
 'cat',
 'person',
 'visit']

In [7]:
# Split the raw price text into an amount ('Price') and a billing unit ('Price Type').
#   '75/hour'    -> Price 75.0,  Price Type 'hour'
#   '50/on-site' -> Price 50.0,  Price Type 'on-site'
#   '150-$200'   -> Price 150.0, Price Type ''   (lower bound of the range is kept)
#   missing      -> Price NaN,   Price Type ''
tempPrice = []
tempPriceType = []
# astype(str) lets the loop handle missing values (they become the text 'nan')
# even if the column has not been cast to str beforehand.
for raw in df['Price'].astype(str):
    # Separate amount and unit first, so that a hyphen inside the unit
    # (e.g. 'on-site') is not mistaken for a price range and the upper bound
    # of a range (e.g. '$200') never ends up stored as the unit.
    parts = raw.split('/')
    amount = parts[0]
    unit = parts[1] if len(parts) > 1 else ""
    # For a range such as '150-$200' keep only the lower bound as the price
    tempPrice.append(amount.split('-')[0])
    tempPriceType.append(unit)

# Add it to dataframe and check
df['Price'] = tempPrice
df['Price Type'] = tempPriceType
df['Price Type'] = df['Price Type'].astype(str)

# Convert price column to float (thousands separators removed; 'nan' text becomes NaN)
df['Price'] = df['Price'].str.replace(",","").astype(float)

In [8]:
# Double check data types: Price should now be float64 and Price Type should exist
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Attach the service details from the lookup table to every listing
serviceDf = pd.read_csv('../src/all_services.csv')

# The scraped 'Type' column is renamed to 'ID' so it can be used as the join
# key against the lookup table; the key is dropped again once the join is done.
dfMerge = df.rename(columns={"Type": "ID"})
dfMerge = pd.merge(dfMerge, serviceDf, on='ID', how='left')
dfMerge = dfMerge.drop(columns='ID')
df = dfMerge

In [10]:
# Attach the location columns from the zip-code lookup table
zipCodes = pd.read_csv('../src/us_states.csv')

# Left join keeps every listing, matched or not; the lookup table's own
# key column is redundant after the join and is removed.
dfMerge = pd.merge(
    df,
    zipCodes,
    how='left',
    left_on='Zip Code',
    right_on='Representative ZIP Code',
)
dfMerge = dfMerge.drop(columns='Representative ZIP Code')

df = dfMerge

In [11]:
# Spot-check five random rows after the merges (no random_state is passed,
# so the rows shown differ from run to run)
df.sample(5)

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments,Capital City,State,Abbreviation
60404,TutorWithTheBest,5.0,216.0,,55404,,Voice Over Lessons,Peripheral,Lessons,Minneapolis,Minnesota,MN
30160,Suzanna Griffith Music,5.0,8.0,75.0,72201,hour,Voice Over Lessons,Peripheral,Lessons,Little Rock,Arkansas,AR
39137,Harlan Hopchik,4.5,124.0,50.0,75207,hour,Private Tennis Instruction,Peripheral,Lessons,Dallas,Texas,TX
6510,Angel's Cleaning Utah,4.5,388.0,,84111,,House Cleaning,Main,Home Improvement,Salt Lake City,Utah,UT
62530,Janice Hooker Fortman,5.0,,75.0,36104,lesson,Public Speaking Lessons,Peripheral,Lessons,Montgomery,Alabama,AL


In [12]:
# See shape after the merges (rows, columns)
df.shape

(71362, 12)

In [13]:
# Drop duplicates: keep only the first row for each "Service" value and
# discard every later row that repeats it
dfDuplicate = df.drop_duplicates(subset="Service", keep='first')
df = dfDuplicate

In [14]:
# (rows, columns) after removing rows with a repeated "Service" value
df.shape

(15604, 12)

In [15]:
# See overview of data: summary statistics for the numeric columns
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,14610.0,11157.0,9343.0,15604.0
mean,4.89,47.03,128.15,52271.8
std,0.31,97.14,244.69,31891.03
min,1.0,2.0,1.0,2201.0
25%,5.0,6.0,53.5,23219.0
50%,5.0,17.0,80.0,53205.0
75%,5.0,47.0,140.0,83804.25
max,5.0,2041.0,7800.0,99801.0


In [16]:
# Implied revenue = number of hires x listed price
# (the result is NaN wherever either value is missing)
df["Implied Revenue"] = df["Hires"] * df["Price"]

In [17]:
# See columns: print the current column names as a plain array
print(df.columns.values)

['Service' 'Rating' 'Hires' 'Price' 'Zip Code' 'Price Type' 'Services'
 'Type' 'Segments' 'Capital City' 'State' 'Abbreviation' 'Implied Revenue']


In [18]:
# Rearrange columns into the order used for the exported file
df = df.loc[:, [
    'Segments',
    'Services',
    'Type',
    'Service',
    'Hires',
    'Price',
    'Price Type',
    'Rating',
    'Zip Code',
    'Capital City',
    'State',
    'Abbreviation',
    'Implied Revenue',
]]

In [19]:
# Write the cleaned data to disk; with to_csv's default index=True the
# DataFrame index is written as the first column of the file
df.to_csv("../output/all_services.csv")