## Preparing the data for classification models

In [11]:
import pandas as pd
import numpy as np

# Load the dataset and, in the same expression, shorten the two column names
# that contain spaces so they are easier to reference later on.
df = pd.read_csv('datalocation').rename(
    columns={
        'Startup or Person': 'TeamSolo',
        'Average Score': 'InitScore',
    }
)

## Convert categorical variables to numeric values and impute missing data

In [12]:
def _fill_with_mode(col):
    """Return `col` with NaNs replaced by its most frequent value.

    `Series.mode()` returns a Series (ties are possible), so the first mode is
    extracted as a scalar. Passing the mode Series itself to `fillna` would
    align on the index and only fill rows whose labels match the mode's index.
    A column with no non-null values has no mode and is returned unchanged.
    """
    modes = col.mode()
    if modes.empty:
        return col
    return col.fillna(modes.iloc[0])


# Impute the flag columns *before* the integer cast: `astype('int')` raises on
# NaN, so casting first would make the imputation unreachable.
df['CTOnum'] = _fill_with_mode(df['CTO']).astype('int')
df['FemNum'] = _fill_with_mode(df['Female']).astype('int')

# Results are assigned back to the frame instead of using `inplace=True` on a
# column selection, which is chained assignment and does not reliably update
# `df` (it is a no-op under pandas Copy-on-Write).
df['Funded'] = df['Funded'].replace(['Yes', 'No'], [1, 0])
df['InitScore'] = pd.to_numeric(df['InitScore'])
df['SelectionScore'] = pd.to_numeric(df['SelectionScore'])

# 2 means 2+ co-founders, 1 means solo founder
df['TeamSolo'] = df['TeamSolo'].replace(['Team', 'Person'], [2, 1])

# Replace remaining NaNs with the column mean (score) or mode (category).
df['InitScore'] = df['InitScore'].fillna(df['InitScore'].mean())
df['TeamSolo'] = _fill_with_mode(df['TeamSolo'])

## Calculate distance from NYC to use as 'location' parameter

In [3]:
# If this cell raises, check that every row has both a Lat and a Long value.

# `vincenty` was deprecated in geopy 1.13 and removed in geopy 2.0; `geodesic`
# is its replacement. Fall back to `vincenty` only on older geopy installs.
try:
    from geopy.distance import geodesic as _geo_distance
except ImportError:
    from geopy.distance import vincenty as _geo_distance

# (latitude, longitude) of the New York City reference point.
NYC_COORDS = (40.7306458, -73.9866136)


def pdVincenty(row, origin=NYC_COORDS):
    """Return the distance in miles from a row's location to `origin`.

    Parameters
    ----------
    row : object with `Lat` and `Long` attributes (e.g. a DataFrame row
        passed by `df.apply(..., axis=1)`).
    origin : (latitude, longitude) tuple; defaults to `NYC_COORDS`.

    Returns
    -------
    The `.miles` value of the geopy distance between the two points.
    """
    return _geo_distance((row.Lat, row.Long), origin).miles


# Row-wise apply: each row is passed as `row`, `origin` keeps its default.
df['dist'] = df.apply(pdVincenty, axis=1)

In [4]:
# Save the prepared frame. `index=False` keeps the row index out of the file;
# otherwise it is written as an unnamed first column and comes back as a
# spurious 'Unnamed: 0' column when the file is re-read with a plain
# `pd.read_csv(...)`.
df.to_csv('datalocation', index=False)