In [None]:
#Import dependencies 
import pandas as pd
import numpy as np
# Show up to 400 characters per cell so long text columns are not truncated in previews.
# The fully qualified option name is used because pandas warns that shorthand names
# may stop resolving if similarly named options are added in the future.
pd.set_option('display.max_colwidth', 400)

Extract the Data

In [None]:
# Load the raw Shark Tank CSV into a dataframe and preview the first rows
raw_data_path = 'shark_tank_raw_data/Shark Tank US dataset.csv'
sharktank_df = pd.read_csv(raw_data_path)
sharktank_df.head()


Perform initial data cleaning

In [None]:
# Print a summary of the raw frame: column dtypes and non-null counts
sharktank_df.info()

In [None]:
# List the raw column labels (reference for the column selection in the next cell)
sharktank_df.columns

In [None]:
# Remove unnecessary columns by keeping only the ones listed here.
# .copy() makes the result an independent dataframe, so the column assignments
# made in later cells do not raise SettingWithCopyWarning.
columns_to_keep = [
    'Season Number', 'Startup Name', 'Episode Number', 'Pitch Number',
    'Original Air Date', 'Industry',
    'Business Description', 'Company Website', 'Pitchers Gender',
    'Pitchers City', 'Pitchers State', 'Entrepreneur Names', 'Multiple Entrepreneurs',
    'Original Ask Amount', 'Original Offered Equity', 'Valuation Requested',
    'Got Deal', 'Total Deal Amount', 'Total Deal Equity', 'Deal Valuation',
    'Number of Sharks in Deal', 'Investment Amount Per Shark',
    'Equity Per Shark', 'Barbara Corcoran Investment Amount',
    'Barbara Corcoran Investment Equity', 'Mark Cuban Investment Amount',
    'Mark Cuban Investment Equity', 'Lori Greiner Investment Amount',
    'Lori Greiner Investment Equity', 'Robert Herjavec Investment Amount',
    'Robert Herjavec Investment Equity', 'Daymond John Investment Amount',
    'Daymond John Investment Equity', 'Kevin O Leary Investment Amount',
    'Kevin O Leary Investment Equity', 'Guest Investment Amount',
    'Guest Investment Equity', 'Guest Name', 'Barbara Corcoran Present',
    'Mark Cuban Present', 'Lori Greiner Present', 'Robert Herjavec Present',
    'Daymond John Present', 'Kevin O Leary Present', 'Guest Present',
]
sharktank_df_clean = sharktank_df[columns_to_keep].copy()

sharktank_df_clean.head()

In [None]:
# Parse the 'Original Air Date' column into datetimes using the day-month abbreviation-2 digit year layout
air_date_format = '%d-%b-%y'
parsed_air_dates = pd.to_datetime(sharktank_df_clean['Original Air Date'], format=air_date_format)
sharktank_df_clean['Original Air Date'] = parsed_air_dates


In [None]:
# Preview the first rows to check that 'Original Air Date' converted correctly
sharktank_df_clean.head()

In [None]:
# Cast the 1/0 flag columns to boolean; the column list is written once and reused on both sides
flag_columns = [
    'Got Deal',
    'Barbara Corcoran Present',
    'Mark Cuban Present',
    'Lori Greiner Present',
    'Robert Herjavec Present',
    'Daymond John Present',
    'Kevin O Leary Present',
    'Guest Present',
]
sharktank_df_clean[flag_columns] = sharktank_df_clean[flag_columns].astype('bool')

In [None]:
# Preview the first rows to check that the flag columns converted to boolean correctly
sharktank_df_clean.head()

Pitchers Demographics Table Cleaning (Anna)

In [None]:
# Create the pitcher demographics table from the cleaned sharktank df,
# selecting the columns in the order wanted for this table.
# .copy() makes it an independent dataframe, so the column assignments made in
# later cells do not raise SettingWithCopyWarning.
pitcher_demo_columns = [
    "Pitch Number",
    "Multiple Entrepreneurs",
    "Entrepreneur Names",
    "Pitchers Gender",
    "Pitchers City",
    "Pitchers State",
    "Industry",
]
pitcher_demo_df = sharktank_df_clean[pitcher_demo_columns].copy()

pitcher_demo_df.head()

In [None]:
# Cast the "Multiple Entrepreneurs" column to boolean, then preview the result
multiple_flag = pitcher_demo_df["Multiple Entrepreneurs"].astype('bool')
pitcher_demo_df["Multiple Entrepreneurs"] = multiple_flag

pitcher_demo_df.head()

In [None]:
# Fill null values in every object-dtype column of the table with "Unknown"
# sources: https://stackoverflow.com/questions/34913590/fillna-in-multiple-columns-in-place-in-python-pandas

replace_na_strings = pitcher_demo_df.select_dtypes(include=object).columns
filled_text_columns = pitcher_demo_df[replace_na_strings].fillna("Unknown")
pitcher_demo_df[replace_na_strings] = filled_text_columns


In [None]:
# Create "Entrepreneur 1 Name" and "Entrepreneur 2 Name" columns from the "Entrepreneur Names" column.
# Only values containing a comma, or the standalone word "and", are split (multiple pitchers).
# Code was generated with assistance from Bootcamp Spot Xpert Learning Assistant

def split_entrepreneur_names(raw_names):
    """Split one "Entrepreneur Names" value into (first_name, second_name).

    A value containing a comma is split on commas; otherwise it is split on the
    standalone word " and " (with surrounding spaces), so single names that merely
    end in "and" (e.g. "Roland Smith") are left intact. Pieces are stripped of
    surrounding whitespace and empty pieces are discarded. Only the first two
    names are kept; second_name is None when there is no second name.
    Non-string values (e.g. NaN) are returned unchanged as first_name.
    """
    if not isinstance(raw_names, str):
        return raw_names, None

    separator = "," if "," in raw_names else " and "
    pieces = [piece.strip() for piece in raw_names.split(separator)]
    pieces = [piece for piece in pieces if piece]

    if not pieces:
        # Nothing but separators/whitespace: keep the original value as-is
        return raw_names, None

    second_name = pieces[1] if len(pieces) > 1 else None
    return pieces[0], second_name


name_pairs = [split_entrepreneur_names(raw) for raw in pitcher_demo_df["Entrepreneur Names"]]

# Assign whole columns at once so both columns always exist, even when no row
# contains a second entrepreneur.
pitcher_demo_df["Entrepreneur 1 Name"] = [pair[0] for pair in name_pairs]
pitcher_demo_df["Entrepreneur 2 Name"] = [pair[1] for pair in name_pairs]

pitcher_demo_df.head()

In [None]:
# Fix any instances where the "Multiple Entrepreneurs" column is incorrectly classified:
# the flag is set to True exactly when "Entrepreneur 2 Name" is not null.
# Computed for the whole column at once instead of row by row.
#Source: https://www.programiz.com/python-programming/pandas/handle-wrong-data

pitcher_demo_df["Multiple Entrepreneurs"] = pitcher_demo_df["Entrepreneur 2 Name"].notna()

pitcher_demo_df.head()


In [None]:
# Reorder the columns, then rename some to simplify.
# rename() is chained without inplace=True so it returns a new dataframe rather
# than mutating a slice of pitcher_demo_df, and the cell can be re-run safely.
final_column_order = [
    "Pitch Number",
    "Multiple Entrepreneurs",
    "Entrepreneur 1 Name",
    "Entrepreneur 2 Name",
    "Pitchers Gender",
    "Pitchers City",
    "Pitchers State",
    "Industry",
]
simplified_names = {
    "Pitchers Gender": "Gender(s)",
    "Pitchers City": "City",
    "Pitchers State": "State",
}
pitcher_demo_df_cleaned = pitcher_demo_df[final_column_order].rename(columns=simplified_names)

pitcher_demo_df_cleaned.head()

In [None]:
# Check datatypes and null counts one last time before exporting to CSV.
# Expected result: only "Entrepreneur 2 Name" should still have null values.

pitcher_demo_df_cleaned.info()

In [None]:
# Write the cleaned table to pitch_demo.csv, UTF-8 encoded and without the index column
pitcher_demo_df_cleaned.to_csv("pitch_demo.csv", encoding = 'utf8', index=False)

Pitchers Demographics Table (Data cleaning by Anna Bitzer)

A subset of columns from the sharktank_df_clean dataset was used to create a Pitchers Demographics table. Several data cleaning steps were taken to transform and prepare the data for storage in a SQL database.
 -  The "Multiple Entrepreneurs" column datatype was switched to boolean.
 -  Null values in all string columns (name, gender, city, state, industry) were replaced with "Unknown".
 -  The "Entrepreneur Names" column, which could contain multiple entrepreneurs, was split into new columns "Entrepreneur 1 Name" and "Entrepreneur 2 Name", splitting at a comma or the word "and". 
 -  Some instances of the "Multiple Entrepreneurs" column were found to be incorrect. They were corrected using a conditional that checked if a second entrepreneur was present in the "Entrepreneur 2 Name" column.
 -  Finally, the columns were reordered and some were renamed.

The final dataframe was exported to csv, for upload into a SQL database with the other tables.

Sources Used: 
https://stackoverflow.com/questions/34913590/fillna-in-multiple-columns-in-place-in-python-pandas, https://www.programiz.com/python-programming/pandas/handle-wrong-data, and BootcampSpot Xpert Learning Assistant for formatting iterrows code to split names.



Shark Demographics Table Cleaning (Atul)

Pitch Info Table Cleaning (Tianyue)