In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read in the raw data from the github url

df = pd.read_csv("https://raw.githubusercontent.com/suneel0101/lesson-plan/master/crunchbase_monthly_export.csv")

#### 1. Clean the data. Use .apply and lambda functions where you can

In [3]:
# rename the columns to remove spaces and extra characters
print list(df.columns)

['permalink', 'name', 'homepage_url', 'category_list', ' market ', ' funding_total_usd ', 'status', 'country_code', 'state_code', 'region', 'city', 'funding_rounds', 'founded_at', 'founded_month', 'founded_quarter', 'founded_year', 'first_funding_at', 'last_funding_at', 'Unnamed: 18']


In [4]:
new_cols = ['permalink', 'name', 'homepage_url', 'category_list', 'market', 'funding_total_usd', 'status', 'country_code', 'state_code', 'region', 'city', 'funding_rounds', 'founded_at', 'founded_month', 'founded_quarter', 'founded_year', 'first_funding_at', 'last_funding_at', 'Unnamed: 18']
df.columns = new_cols

In [5]:
# convert the funding, year founded, and funding_rounds columns to numeric values
custom_null = df['funding_total_usd'][9]
custom_null

' -   '

In [6]:
df['funding_total_usd'].replace(custom_null, np.nan, inplace=True)

def _dollars_to_int(x):
    try:
        y = (str(x)).replace(',','')
        return int(y)
    except:
        pass
        
df['funding_total_usd'] = df['funding_total_usd'].apply(_dollars_to_int)

In [7]:
# convert category_list so that each cell is a list of categories
df.category_list = df.category_list.apply(lambda x: (str(x)).split('|')[1:-1])

In [8]:
# use value_counts on market, status, and country_code to check for any repeats or misnamed entries


In [9]:
# BONUS: convert the date columns to appropriate datetime objects
%time df.founded_at = df.founded_at.apply(lambda x: pd.to_datetime(x))
%time df.founded_at = df.founded_at.apply(lambda x: pd.to_datetime(x))
%time df.founded_month = df.founded_month.apply(lambda x: pd.Period(x, freq= "M"))
%time df.founded_quarter = df.founded_quarter.apply(lambda x: pd.Period(x, freq= "Q"))

def _convert_time(x):
    try:
        return pd.to_datetime(x)
    except:
        return np.nan

%time df.last_funding_at = df.last_funding_at.apply(_convert_time)
%time df.first_funding_at = df.first_funding_at.apply(_convert_time)


CPU times: user 16.3 s, sys: 228 ms, total: 16.5 s
Wall time: 16.7 s
CPU times: user 1.44 s, sys: 121 ms, total: 1.56 s
Wall time: 1.56 s
CPU times: user 4.73 s, sys: 95.9 ms, total: 4.83 s
Wall time: 5.01 s
CPU times: user 3.4 s, sys: 88.1 ms, total: 3.49 s
Wall time: 3.49 s
CPU times: user 19.7 s, sys: 309 ms, total: 20.1 s
Wall time: 20.3 s
CPU times: user 19.6 s, sys: 222 ms, total: 19.8 s
Wall time: 20.1 s


#### 2. Using isnull(), take a subset of the dataframe that has no founded at date

In [10]:
df[df.founded_at.isnull()].head(2)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18
0,/organization/canal-do-credito,Canal do Credito,http://www.canaldocredito.com.br,"[Credit, Technology, Services, Finance]",Credit,750000.0,,BRA,,Rio de Janeiro,Belo Horizonte,1,NaT,NaT,NaT,,2010-01-01,2010-01-01,
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,[Games],Games,4000000.0,operating,USA,CA,Los Angeles,Los Angeles,2,NaT,NaT,NaT,,2010-06-04,2010-09-23,


#### 3. Create a new column, founded_year_x, that's a copy of founded_year. Then, replace all the missing values with either the mean , mode, or median of that column. 
Add a note explaining which statistic you picked and why:

In [11]:
df['founded_year_x'] = df['founded_year'].fillna(np.median)

#### 4. Take a subset of US-based companies using boolean selection

In [12]:
df[df.country_code == 'USA'].head(2)

Unnamed: 0,permalink,name,homepage_url,category_list,market,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,founded_month,founded_quarter,founded_year,first_funding_at,last_funding_at,Unnamed: 18,founded_year_x
1,/organization/waywire,#waywire,http://www.waywire.com,"[Entertainment, Politics, Social Media, News]",Entertainment,1750000.0,acquired,USA,NY,New York City,New York,1,2012-06-01,2012-06,2012Q2,2012.0,2012-06-30,2012-06-30,,2012
2,/organization/tv-communications,&TV Communications,http://enjoyandtv.com,[Games],Games,4000000.0,operating,USA,CA,Los Angeles,Los Angeles,2,NaT,NaT,NaT,,2010-06-04,2010-09-23,,<function median at 0x108dbd8c0>


#### 5. Create a pivot table showing total, mean, and median funding amount by year

In [13]:
df.pivot_table(index='founded_year', values = 'funding_total_usd', \
               aggfunc=(np.mean, np.median, sum, lambda x: len(x.unique())))

Unnamed: 0_level_0,mean,median,sum,<lambda>
founded_year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1900.0,2.365031e+07,23650306.0,2.365031e+07,1.0
1902.0,3.100000e+06,3100000.0,3.100000e+06,1.0
1903.0,9.300000e+06,9300000.0,9.300000e+06,1.0
1906.0,2.275600e+08,10000000.0,1.137800e+09,5.0
1908.0,5.814700e+06,5814700.0,5.814700e+06,1.0
1910.0,1.000000e+05,100000.0,1.000000e+05,2.0
1911.0,2.400000e+06,2400000.0,2.400000e+06,1.0
1912.0,3.392500e+07,6100000.0,1.357000e+08,4.0
1913.0,5.057539e+07,50575389.5,1.011508e+08,2.0
1914.0,1.665000e+07,16650000.0,3.330000e+07,2.0


### Optional: some of these year values seem fishy. Look at the companies with founded_year dates before 1910. Soda Stream definitely wasn't founded in 1906. Reassign founded year as an interpretation of the founded_at column.


In [14]:
# df[df.founded_year < 1910].head()

In [15]:
#Note: the x.year extracts the year from a pd Timestamp
df['founded_year_x2'] = df.founded_at.apply(lambda x: float(x.year))

In [16]:
#Limit the results to values between 1985 and 2016
df2 = df[(df.founded_year_x2 > 1985) & (df.founded_year_x2 < 2016)]

df2.pivot_table(index='founded_year_x2', values = 'funding_total_usd', \
               aggfunc=(np.mean, np.median, sum, lambda x: len(x.unique())))

Unnamed: 0_level_0,mean,median,sum,<lambda>
founded_year_x2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1986.0,28615010.0,14065000.0,1259061000.0,43.0
1987.0,31136450.0,10000000.0,1681369000.0,49.0
1988.0,18691890.0,8509755.5,1009362000.0,50.0
1989.0,32430440.0,10295200.0,2140409000.0,64.0
1990.0,21398820.0,6334832.5,1412322000.0,63.0
1991.0,39757680.0,8000000.0,2902310000.0,67.0
1992.0,26165980.0,6000000.0,2433436000.0,85.0
1993.0,38227200.0,8000000.0,4090310000.0,94.0
1994.0,23734880.0,11260000.0,2943125000.0,106.0
1995.0,39226960.0,9000000.0,6903944000.0,154.0


#### 6. Create a pivot table showing total, mean and median funding amount by market

In [17]:
df.pivot_table(index='market', values='funding_total_usd', \
              aggfunc=(np.mean, np.median, sum, lambda x: len(x.unique())))
#This lambda function represents count of values

Unnamed: 0_level_0,mean,median,sum,<lambda>
market,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3D,9.189392e+06,1200000.0,5.513635e+08,57.0
3D Printing,1.967000e+06,2000000.0,9.835000e+06,6.0
3D Technology,,,,1.0
Accounting,1.989102e+07,2592000.0,1.233243e+09,56.0
Active Lifestyle,4.100833e+06,275000.0,2.460500e+07,6.0
Ad Targeting,2.486977e+07,3800000.0,1.318098e+09,48.0
Advanced Materials,5.550000e+07,55500000.0,5.550000e+07,2.0
Adventure Travel,1.706032e+05,145000.0,1.876635e+06,10.0
Advertising,1.195821e+07,2077974.5,2.011370e+10,893.0
Advertising Exchanges,2.648729e+07,3450673.0,7.946187e+07,3.0
