## Importing Libraries

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 

In [2]:
#Root folder of the project. The default is the original author's local folder;
#set the UNICORN_PROJECT_DIR environment variable to run the notebook on another machine without editing this cell.
pathname = os.environ.get(
    'UNICORN_PROJECT_DIR',
    r'/Users/alejandrocisneros/Desktop/Data Analysis Coursework/Data Immersion/Achievement 6/Dataset Analysis',
)

In [3]:
#Display the stored project path to confirm it
pathname

'/Users/alejandrocisneros/Desktop/Data Analysis Coursework/Data Immersion/Achievement 6/Dataset Analysis'

In [4]:
#Load the unicorn companies CSV from the project's 'Original Data' folder into a data frame
df = pd.read_csv(
    os.path.join(pathname, '2. Data', 'Original Data', 'unicorn.csv'),
    index_col=False,
)

### Basic Statistics

In [5]:
#Preview the first rows of the data frame
df.head(15)

Unnamed: 0.1,Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,0,ByteDance,$140,4/7/2017,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,1,SpaceX,$127,12/1/2012,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,2,SHEIN,$100,7/3/2018,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,3,Stripe,$95,1/23/2014,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,4,Canva,$40,1/8/2018,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."
5,5,Checkout.com,$40,5/2/2019,United Kingdom,London,Fintech,"Tiger Global Management, Insight Partners, DST..."
6,6,Instacart,$39,12/30/2014,United States,San Francisco,"Supply chain, logistics, & delivery","Khosla Ventures, Kleiner Perkins Caufield & By..."
7,7,Databricks,$38,2/5/2019,United States,San Francisco,Data management & analytics,"Andreessen Horowitz, New Enterprise Associates..."
8,8,Revolut,$33,4/26/2018,United Kingdom,London,Fintech,"index Ventures, DST Global, Ribbit Capital"
9,9,Epic Games,$31.5,10/26/2018,United States,Cary,Other,"Tencent Holdings, KKR, Smash Ventures"


In [6]:
#Number of rows and columns in the data frame
df.shape

(1201, 8)

In [7]:
#Column names, non-null counts and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1201 entries, 0 to 1200
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Unnamed: 0        1201 non-null   int64 
 1   Company           1201 non-null   object
 2   Valuation ($B)    1201 non-null   object
 3   Date Joined       1201 non-null   object
 4   Country           1201 non-null   object
 5   City              1183 non-null   object
 6   Industry          1201 non-null   object
 7   Select Investors  1200 non-null   object
dtypes: int64(1), object(7)
memory usage: 75.2+ KB


In [8]:
#Descriptive statistics for the numeric columns of the raw data
df.describe()

Unnamed: 0.1,Unnamed: 0
count,1201.0
mean,600.0
std,346.843144
min,0.0
25%,300.0
50%,600.0
75%,900.0
max,1200.0


In [9]:
#Data type of each column before any wrangling
df.dtypes

Unnamed: 0           int64
Company             object
Valuation ($B)      object
Date Joined         object
Country             object
City                object
Industry            object
Select Investors    object
dtype: object

## Data Wrangling

In [10]:
#Here, we strip the literal '$' from the 'Valuation ($B)' column so that it can be converted to float64 in the next step.
#regex=False is passed explicitly: read as a regular expression, '$' is the end-of-string anchor rather than a dollar sign,
#and the default value of the regex argument is not the same across pandas versions.
df['Valuation ($B)'] = df['Valuation ($B)'].str.replace('$', '', regex=False)

In [11]:
#Convert 'Valuation ($B)' to a float column so it can serve as our 2nd continuous variable and hold decimal values
df['Valuation ($B)'] = df['Valuation ($B)'].astype('float64')

In [12]:
#Check the data types again after the conversion
df.dtypes

Unnamed: 0            int64
Company              object
Valuation ($B)      float64
Date Joined          object
Country              object
City                 object
Industry             object
Select Investors     object
dtype: object

In [13]:
#Now, we will change the data type for column 'Date Joined' from object to datetime.
#The raw values are month/day/year strings (e.g. '12/30/2014'), so the format is stated explicitly
#instead of being inferred, which removes any day/month ambiguity for values such as '4/7/2017'.
df['Date Joined'] = pd.to_datetime(df['Date Joined'], format='%m/%d/%Y')

In [14]:
#Remove the 'Unnamed: 0' column: it only repeats the row number and carries no information of its own.
df.drop(columns=['Unnamed: 0'], inplace=True)

In [15]:
#Here, we check the result of the wrangling steps via descriptive statistics
df.describe()

Unnamed: 0,Valuation ($B),Date Joined
count,1201.0,1201
mean,3.222981,2020-08-27 03:33:25.328892416
min,1.0,2011-04-02 00:00:00
25%,1.1,2019-10-16 00:00:00
50%,1.57,2021-05-18 00:00:00
75%,3.0,2021-11-28 00:00:00
max,140.0,2022-11-14 00:00:00
std,7.595045,


### Success!

### Now, we must focus on locating and dealing with any missing values in our data set

In [16]:
#Count the missing values in each column
df.isna().sum()

Company              0
Valuation ($B)       0
Date Joined          0
Country              0
City                18
Industry             0
Select Investors     1
dtype: int64

### Since we cannot impute the missing locations (or the one missing investor list), we must either delete the observations or label them as unknown. Given that these rows account for less than 2% of the entire data set (19 of 1,201), we are okay with deleting them.

In [17]:
#Delete every row whose 'City' value is missing
df.dropna(axis='index', how='any', subset=['City'], inplace=True)

In [18]:
#Delete every row whose 'Select Investors' value is missing
df.dropna(axis='index', how='any', subset=['Select Investors'], inplace=True)

In [19]:
#Time to re-check for null values: every column should now report 0
df.isna().sum()

Company             0
Valuation ($B)      0
Date Joined         0
Country             0
City                0
Industry            0
Select Investors    0
dtype: int64

#### Success! All observations containing null values have been deleted from the data frame.

In [20]:
#Descriptive statistics after removing the rows with missing values
df.describe()

Unnamed: 0,Valuation ($B),Date Joined
count,1182.0,1182
mean,3.223257,2020-08-25 10:26:11.573604096
min,1.0,2011-04-02 00:00:00
25%,1.1,2019-10-08 12:00:00
50%,1.585,2021-05-15 00:00:00
75%,3.0,2021-11-29 00:00:00
max,140.0,2022-11-14 00:00:00
std,7.606725,


In [21]:
#Display the cleaned data frame
df

Unnamed: 0,Company,Valuation ($B),Date Joined,Country,City,Industry,Select Investors
0,ByteDance,140.0,2017-04-07,China,Beijing,Artificial intelligence,"Sequoia Capital China, SIG Asia Investments, S..."
1,SpaceX,127.0,2012-12-01,United States,Hawthorne,Other,"Founders Fund, Draper Fisher Jurvetson, Rothen..."
2,SHEIN,100.0,2018-07-03,China,Shenzhen,E-commerce & direct-to-consumer,"Tiger Global Management, Sequoia Capital China..."
3,Stripe,95.0,2014-01-23,United States,San Francisco,Fintech,"Khosla Ventures, LowercaseCapital, capitalG"
4,Canva,40.0,2018-01-08,Australia,Surry Hills,Internet software & services,"Sequoia Capital China, Blackbird Ventures, Mat..."
...,...,...,...,...,...,...,...
1196,LeadSquared,1.0,2022-06-21,India,Bengaluru,Internet software & services,"Gaja Capital Partners, Stakeboat Capital, West..."
1197,FourKites,1.0,2022-06-21,United States,Chicago,"Supply chain, logistics, & delivery","Hyde Park Venture Partners, Bain Capital Ventu..."
1198,VulcanForms,1.0,2022-07-05,United States,Burlington,"Supply chain, logistics, & delivery","Eclipse Ventures, D1 Capital Partners, Industr..."
1199,SingleStore,1.0,2022-07-12,United States,San Francisco,Data management & analytics,"Google Ventures, Accel, Data Collective"
