In [1]:
import os
import glob
import pandas as pd

# Render floats with thousands separators and two decimals (e.g. 1,234.50).
pd.set_option('display.float_format', '{:,.2f}'.format)
# Turn off pandas' chained-assignment warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
# Combine all the CSV files in the data directory into a single frame.
path = r'../data'

# glob returns matches in arbitrary, filesystem-dependent order. Sort so the
# row order (and therefore any later keep='first' de-duplication) is the same
# on every machine and every run.
all_csv = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_csv:
    # Without this, pd.concat([]) fails with "No objects to concatenate",
    # which does not point at the real problem.
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# index_col=0 uses each file's first column as the row index (it is thrown
# away by ignore_index below); thousands=',' makes pandas treat ',' as a
# thousands separator when parsing numeric columns.
temp = [
    pd.read_csv(csv_path, index_col=0, header=0, thousands=',')
    for csv_path in all_csv
]

# Stack the per-file frames vertically and renumber the rows 0..n-1.
df = pd.concat(temp, axis=0, ignore_index=True)
df.head()

Unnamed: 0,Service,Type,Rating,Hires,Price,Zip Code
0,"Columbia Junior, Multiple Ivy Acceptances, ACT...",32,5.0,95.0,75/hour,84111
1,Amikka Test Prep (Ivy Grad + No Contracts),32,5.0,335.0,60/hour,84111
2,Harvard Grad/15 Yrs Exp: SAT/ACT/College Admis...,32,5.0,60.0,495/hour,84111
3,"My Education Connections, LLC",32,5.0,8.0,60/hour,84111
4,Authentic Tutoring Services,32,5.0,9.0,28/hour,84111


In [3]:
# Size of the combined frame as (rows, columns).
df.shape

(62068, 6)

In [4]:
# Inspect the dtype pandas inferred for each column. This cell only displays
# them; Price (object here) is converted to a numeric dtype in a later cell.
df.dtypes

Service      object
Type          int64
Rating      float64
Hires       float64
Price        object
Zip Code      int64
dtype: object

In [5]:
# Look at the first 19 distinct raw Price values.
df['Price'].unique()[:19]

# The amount still carries a '/unit' suffix (e.g. '75/hour') and has to be split off before Price can be numeric.

array(['75/hour', '60/hour', '495/hour', '28/hour', '50/hour', '65/hour',
       '45/hour', '125/hour', '35/hour', '40/hour', '70/hour', '39/hour',
       '55/hour', nan, '85/service', '50/service', '150-$200', '15/hour',
       '36/hour'], dtype=object)

In [6]:
# See different types of payment
# Cast to str so every entry supports string methods; missing prices (NaN)
# become the literal text 'nan'.
df['Price'] = df['Price'].astype(str)

# One list per row: ['75', 'hour'] for '75/hour', a single-item list when the
# value has no '/'.
tempList = [price.split('/') for price in df["Price"]]

# Keep the text after the first '/' for rows that have one. The explicit
# length check replaces a bare `except: pass`, which would also have hidden
# unrelated errors. dict.fromkeys drops repeats while keeping first-seen order.
checkUnique = list(
    dict.fromkeys(parts[1] for parts in tempList if len(parts) > 1)
)
checkUnique

['hour',
 'service',
 'session',
 'on-site',
 'logo',
 'walk',
 'cat',
 'person',
 'visit']

In [7]:
# Split the raw price text into a numeric amount and a billing unit.
# Raw values look like '75/hour', '85/service', '150-$200' or 'nan'.
# astype(str) makes the cell work whether Price holds real NaN or 'nan' text.
price_parts = df['Price'].astype(str).str.split('/')

# The unit is the text after the first '/'; rows without a '/' get "".
# It is always taken from the '/' split, never from the '-' split. That keeps
# units that contain a hyphen (e.g. 'on-site') and stops the upper bound of a
# range such as '150-$200' from being stored as the unit.
price_unit = price_parts.str[1].fillna("")

# The amount is the text before the '/'. For a range, keep the lower bound.
price_amount = price_parts.str[0].str.split('-').str[0]

# Add the unit to the dataframe as plain strings.
df['Price Type'] = price_unit.astype(str)

# Convert price column to float: strip thousands separators first; the text
# 'nan' parses to a real NaN.
df['Price'] = price_amount.str.replace(",", "", regex=False).astype(float)

In [8]:
# Re-inspect the dtypes after the price split: Price should now be float64
# and the new Price Type column should be object.
df.dtypes

Service        object
Type            int64
Rating        float64
Hires         float64
Price         float64
Zip Code        int64
Price Type     object
dtype: object

In [9]:
# Attach the descriptive service columns from the services lookup file.
serviceDf = pd.read_csv('../src/all_services.csv')

# The numeric "Type" column is renamed to "ID" so it can act as the join key.
# A left join keeps every listing even when its ID has no match, and the key
# column is dropped once the lookup columns are attached.
dfMerge = (
    df.rename(columns={"Type": "ID"})
      .merge(serviceDf, how='left', left_on='ID', right_on='ID')
      .drop('ID', axis=1)
)
df = dfMerge

In [10]:
# Attach the location columns from the states lookup file.
zipCodes = pd.read_csv('../src/us_states.csv')

# Left join on the listing's zip code so unmatched listings are kept, then
# drop the lookup's copy of the key.
dfMerge = (
    df.merge(zipCodes, how='left',
             left_on='Zip Code', right_on='Representative ZIP Code')
      .drop('Representative ZIP Code', axis=1)
)

df = dfMerge

In [11]:
# Spot-check five random rows of the merged frame. The fixed random_state
# keeps the displayed rows the same on every Restart & Run All.
df.sample(5, random_state=42)

Unnamed: 0,Service,Rating,Hires,Price,Zip Code,Price Type,Services,Type,Segments,Capital City,State,Abbreviation
31312,Justin Browns Music,5.0,,28.0,32301,,Piano Lessons,Main,Lessons,Tallahassee,Florida,FL
60387,World Aquatic Biomes,5.0,,,21202,,Aquarium Services,Main,Pets,Baltimore,Maryland,MD
50967,Sunshine Entertainment,5.0,223.0,295.0,6103,,DJ,Main,Events,Hartford,Connecticut,CT
35367,"Prime 1 Productions, LLC",5.0,503.0,100.0,19901,,DJ,Main,Events,Dover,Delaware,DE
37742,TutorWithTheBest,5.0,216.0,,95112,,Test Prep Services,Peripheral,Lessons,San Jose,California,CA


In [12]:
# Size of the frame after both lookup merges. A left join only adds rows when
# a lookup key is repeated, so a changed row count here would signal that.
df.shape

(62068, 12)

In [13]:
# Remove repeated listings. Rows are compared on the "Service" name alone and
# only the first occurrence of each name is kept; original row labels are
# preserved.
dfDuplicate = df.drop_duplicates(subset="Service", keep='first')
df = dfDuplicate

In [14]:
# Size of the frame after dropping repeated Service names.
df.shape

(12958, 12)

In [15]:
# Summary statistics for the numeric columns. `count` excludes missing values,
# so it also shows how many rows have each field filled in.
df.describe()

Unnamed: 0,Rating,Hires,Price,Zip Code
count,12086.0,9086.0,8294.0,12958.0
mean,4.89,44.48,118.6,52309.02
std,0.31,91.3,158.74,31873.63
min,1.0,2.0,1.0,2201.0
25%,5.0,6.0,50.0,23219.0
50%,5.0,16.0,83.0,53205.0
75%,5.0,45.0,145.0,84111.0
max,5.0,2041.0,6000.0,99801.0


In [16]:
# Implied revenue = listed price x number of hires, computed row by row.
# The result is NaN wherever either input is missing.
# NOTE(review): prices are not all per the same unit (hour, service, ...), so
# rows may not be directly comparable — confirm this is acceptable.
df["Implied Revenue"] = df["Price"].mul(df["Hires"])

In [17]:
# List the current column names in order, as a reference for the reordering
# in the next cell.
print(df.columns.values)

['Service' 'Rating' 'Hires' 'Price' 'Zip Code' 'Price Type' 'Services'
 'Type' 'Segments' 'Capital City' 'State' 'Abbreviation' 'Implied Revenue']


In [18]:
# Rearrange columns: service classification first, then the listing and its
# metrics, then location, with the derived revenue last.
df = df.loc[:, [
    'Segments', 'Services', 'Type',
    'Service', 'Hires', 'Price', 'Price Type', 'Rating',
    'Zip Code', 'Capital City', 'State', 'Abbreviation',
    'Implied Revenue',
]]

In [19]:
# Create the output folder if needed so the write does not fail on a fresh
# checkout where ../output does not exist yet.
os.makedirs("../output", exist_ok=True)
# Save the cleaned frame. to_csv writes the row index by default, so it
# appears as the first, unnamed column of the file.
df.to_csv("../output/all_services.csv")