In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Location of the raw Crunchbase monthly export on GitHub
CRUNCHBASE_CSV_URL = "https://raw.githubusercontent.com/suneel0101/lesson-plan/master/crunchbase_monthly_export.csv"

# Download the CSV and load it into the working DataFrame
df = pd.read_csv(CRUNCHBASE_CSV_URL)

#### 1. Clean the data. Use .apply and lambda functions where you can

In [3]:
# rename the columns to remove spaces and extra characters
# First inspect the raw column names; print is written as a function call so
# the cell runs under both Python 2 and Python 3.
print(list(df.columns))

['permalink', 'name', 'homepage_url', 'category_list', ' market ', ' funding_total_usd ', 'status', 'country_code', 'state_code', 'region', 'city', 'funding_rounds', 'founded_at', 'founded_month', 'founded_quarter', 'founded_year', 'first_funding_at', 'last_funding_at', 'Unnamed: 18']


In [4]:
# Strip the stray leading/trailing spaces from every column name (e.g. ' market ').
# Deriving the names from the frame itself keeps each label attached to its own
# column even if the CSV layout changes, and makes the cell safe to re-run.
new_cols = [col.strip() for col in df.columns]
df.columns = new_cols

In [5]:
# convert the funding, year founded, and funding_rounds columns to numeric values
# Row 9 of funding_total_usd holds the placeholder string used for a missing
# amount; capture it so it can be replaced with NaN, then display it.
custom_null = df.loc[9, 'funding_total_usd']
custom_null

' -   '

In [6]:
# Swap the placeholder string for NaN. The result is assigned back to the column
# instead of using inplace=True on a column selection, which may operate on a
# temporary object and leave df unchanged.
df['funding_total_usd'] = df['funding_total_usd'].replace(custom_null, np.nan)

def _dollars_to_int(x):
    try:
        y = (str(x)).replace(',','')
        return int(y)
    except:
        pass
        
# Run every raw funding cell through _dollars_to_int and store the parsed column.
funding_raw = df['funding_total_usd']
df['funding_total_usd'] = funding_raw.apply(_dollars_to_int)

In [7]:
# convert category_list so that each cell is a list of categories
def _split_categories(x):
    """Split a pipe-delimited category string into a list of category names.

    Assumes raw cells look like '|A|B|' -- TODO confirm against the CSV.
    Empty segments produced by leading, trailing or doubled pipes are dropped,
    so 'A|B' and '|A|B|' both give ['A', 'B'].

    Returns:
        A list of names; [] for a missing value. A value that is already a
        list is returned unchanged so the cell can be re-run safely.
    """
    if isinstance(x, list):
        return x
    if pd.isnull(x):
        return []
    return [name for name in str(x).split('|') if name]

df['category_list'] = df['category_list'].apply(_split_categories)

In [None]:
# use value_counts on market, status, and country_code to check for any repeats or misnamed entries


In [None]:
# BONUS: convert the date columns to appropriate datetime objects
%time df.founded_at = df.founded_at.apply(lambda x: pd.to_datetime(x))
%time df.founded_at = df.founded_at.apply(lambda x: pd.to_datetime(x))
%time df.founded_month = df.founded_month.apply(lambda x: pd.Period(x, freq= "M"))
%time df.founded_quarter = df.founded_quarter.apply(lambda x: pd.Period(x, freq= "Q"))

def _convert_time(x):
    try:
        return pd.to_datetime(x)
    except:
        return np.nan

%time df.last_funding_at = df.last_funding_at.apply(_convert_time)
%time df.first_funding_at = df.first_funding_at.apply(_convert_time)


CPU times: user 16.3 s, sys: 228 ms, total: 16.5 s
Wall time: 16.7 s
CPU times: user 1.44 s, sys: 121 ms, total: 1.56 s
Wall time: 1.56 s
CPU times: user 4.73 s, sys: 95.9 ms, total: 4.83 s
Wall time: 5.01 s
CPU times: user 3.4 s, sys: 88.1 ms, total: 3.49 s
Wall time: 3.49 s
CPU times: user 19.7 s, sys: 309 ms, total: 20.1 s
Wall time: 20.3 s


#### 2. Using isnull(), take the subset of the dataframe whose rows have no founded_at date

In [None]:
# Boolean mask of rows with no founded_at value; preview the first two matches.
missing_founded_at = df['founded_at'].isnull()
df[missing_founded_at].head(2)

#### 3. Create a new column, founded_year_x, that's a copy of founded_year. Then, replace all the missing values with either the mean, mode, or median of that column.
Add a note explaining which statistic you picked and why:

In [None]:
# Note: the median is used as the fill value because, unlike the mean, it is not
# pulled toward extreme years, and (unlike the mode) it does not depend on which
# single year happens to be most frequent.
# The median is computed first and passed as a number; passing the np.median
# function itself would hand fillna a function object rather than a year.
founded_year_median = df['founded_year'].median()
df['founded_year_x'] = df['founded_year'].fillna(founded_year_median)

#### 4. Take a subset of US-based companies using boolean selection

In [None]:
# Boolean mask of rows whose country_code is 'USA'; preview the first two matches.
is_us_company = df['country_code'] == 'USA'
df[is_us_company].head(2)

#### 5. Create a pivot table showing total, mean, and median funding amount by year

In [None]:
# Mean, median and total funding per founded_year. The aggregations are given
# as a list of pandas aggregation names (the documented aggfunc form) instead
# of numpy callables / the builtin sum; the lambda adds the number of distinct
# funding values in each year.
df.pivot_table(index='founded_year', values='funding_total_usd',
               aggfunc=['mean', 'median', 'sum', lambda x: len(x.unique())])

### Optional: Some of these year values seem fishy. Look at the companies with a founded_year before 1910 — Soda Stream definitely wasn't founded in 1906. Recompute the founded year from the founded_at column.


In [None]:
# df[df.founded_year < 1910].head()

In [None]:
# Note: .dt.year extracts the year from every pd Timestamp in the column in one
# vectorized call; the result is stored as float so a missing date stays NaN.
df['founded_year_x2'] = df['founded_at'].dt.year.astype(float)

In [None]:
# Limit the results to founding years strictly between 1985 and 2016
# (both endpoints are excluded by the > and < comparisons).
df2 = df[(df.founded_year_x2 > 1985) & (df.founded_year_x2 < 2016)]

# Mean, median and total funding per recomputed founding year, using pandas
# aggregation names in a list (the documented aggfunc form); the lambda adds the
# number of distinct funding values in each year.
df2.pivot_table(index='founded_year_x2', values='funding_total_usd',
                aggfunc=['mean', 'median', 'sum', lambda x: len(x.unique())])

#### 6. Create a pivot table showing total, mean and median funding amount by market

In [None]:
# Mean, median and total funding per market, using pandas aggregation names in
# a list (the documented aggfunc form).
# The lambda reports how many distinct funding_total_usd values each market has;
# it is a distinct-value count, not a count of rows.
df.pivot_table(index='market', values='funding_total_usd',
               aggfunc=['mean', 'median', 'sum', lambda x: len(x.unique())])