# Neural Networks

#### Classifier will predict the type of investment the company will get

### Creating Dataset......

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction import DictVectorizer
from sklearn.neural_network import *
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt

In [3]:
Companies = pd.read_csv('../Dataset/Companies.csv')

In [4]:
print Companies.columns

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city', u'funding_rounds',
       u'founded_year'],
      dtype='object')


In [5]:
Companies.drop(['funding_rounds','founded_year'],axis=1,inplace=True)

In [6]:
print len(Companies)
print len(np.unique(Companies['name']))
print Companies['funding_total_usd'][0]

37875
37875
750,000


In [7]:
Rounds = pd.read_csv('../Dataset/Rounds.csv')

In [8]:
print Rounds.columns

Index([u'company_name', u'funding_round_type', u'funded_year',
       u' raised_amount_usd '],
      dtype='object')


In [9]:
Rounds.rename(columns={'company_name':'name'},inplace=True)

In [10]:
print Rounds.columns

Index([u'name', u'funding_round_type', u'funded_year', u' raised_amount_usd '], dtype='object')


In [11]:
print np.unique(Rounds.funding_round_type)

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [12]:
data = pd.merge(Companies,Rounds,how='inner',on='name')

In [13]:
print data.columns
print len(data)

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city',
       u'funding_round_type', u'funded_year', u' raised_amount_usd '],
      dtype='object')
64107


In [14]:
data.drop(['name','funded_year',' raised_amount_usd ','funding_total_usd'],axis=1,inplace=True)

#### Encoding the categorical features into integers.......

In [15]:
def onehotencode_toIntegers(data):
    
    unique_tokens=np.unique(data['category_code']).copy()
    print "encoding categories...",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['category_code'][data['category_code']==value] = index

    unique_tokens=np.unique(data['status']).copy()
    print "encoding status....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['status'][data['status']==value] = index
        
    unique_tokens=np.unique(data['country_code']).copy()
    print "encoding countries....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['country_code'][data['country_code']==value] = index
        
    unique_tokens=np.unique(data['state_code']).copy()
    print "encoding states....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['state_code'][data['state_code']==value] = index
        
    unique_tokens=np.unique(data['region']).copy()
    print "encoding regions....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['region'][data['region']==value] = index
        
    unique_tokens=np.unique(data['city']).copy()
    print "encoding cities....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['city'][data['city']==value] = index
        
    unique_tokens=np.unique(data['funding_round_type']).copy()
    print "encoding funding_round_type.....",len(unique_tokens)
    print "unique labels : ",unique_tokens
    for index,value in enumerate(unique_tokens):
        data['funding_round_type'][data['funding_round_type']==value] = index
    
    data.to_csv("data.csv")    
    

In [16]:
print data.columns

Index([u'category_code', u'status', u'country_code', u'state_code', u'region',
       u'city', u'funding_round_type'],
      dtype='object')


In [17]:
data = data.sort(['funding_round_type'])
data.dropna()

  if __name__ == '__main__':


Unnamed: 0,category_code,status,country_code,state_code,region,city,funding_round_type
30320,manufacturing,operating,USA,CA,SF Bay,LOS ALTOS HILLS,angel
30319,manufacturing,operating,USA,CA,SF Bay,LOS ALTOS HILLS,angel
30262,mobile,operating,USA,CA,SF Bay,San Francisco,angel
30261,mobile,operating,USA,CA,SF Bay,San Francisco,angel
30258,software,operating,USA,CA,San Diego,Carlsbad,angel
30255,consulting,operating,USA,IL,Chicago,Chicago,angel
30304,enterprise,operating,USA,CO,Denver,Boulder,angel
30449,advertising,operating,USA,CA,SF Bay,Mountain View,angel
30438,social,operating,USA,NY,New York,New York,angel
30409,other,operating,USA,CO,Denver,Englewood,angel


#### Removing Noise from the dataset

In [18]:
data = data[data.category_code.notnull()]
data = data[data.status.notnull()]
data = data[data.country_code.notnull()]
data = data[data.state_code.notnull()]
data = data[data.region.notnull()]
data = data[data.city.notnull()]
data = data[data.funding_round_type.notnull()]

In [19]:
print len(data)

42019


In [20]:
Original_data = data.copy()
labels = np.unique(data.funding_round_type)
print labels

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [21]:
onehotencode_toIntegers(data)

encoding categories... 43
encoding status.... 4
encoding countries.... 24
encoding states.... 51
encoding regions.... 847
encoding cities.... 2337
encoding funding_round_type..... 9
unique labels :  ['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [22]:
print np.unique(data.funding_round_type)

[0 1 2 3 4 5 6 7 8]


In [23]:
data.dropna()

Unnamed: 0,category_code,status,country_code,state_code,region,city,funding_round_type
30320,19,3,23,4,625,986,0
30319,19,3,23,4,625,986,0
30262,22,3,23,4,625,1792,0
30261,22,3,23,4,625,1792,0
30258,38,3,23,4,642,336,0
30255,5,3,23,14,126,383,0
30304,9,3,23,5,171,233,0
30449,0,3,23,4,625,1285,0
30438,37,3,23,34,482,1352,0
30409,28,3,23,5,171,615,0


Changing datatype of features to integer

In [24]:
data['category_code'] = data['category_code'].astype(int)
data['status'] = data['status'].astype(int)
data['country_code'] = data['country_code'].astype(int)
data['region'] = data['region'].astype(int)
data['state_code'] = data['state_code'].astype(int)
data['city'] = data['city'].astype(int)
data['funding_round_type'] = data['funding_round_type'].astype(int)


#### Creating training and test set by 70 % and 30% ratio

In [25]:
Y = data.funding_round_type

data.drop(['funding_round_type'],axis=1,inplace=True)

In [26]:
print data.columns
print Y[:3]
print len(np.unique(Y))

Index([u'category_code', u'status', u'country_code', u'state_code', u'region',
       u'city'],
      dtype='object')
30320    0
30319    0
30262    0
Name: funding_round_type, dtype: int64
9


In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(data,Y,test_size=0.3)

In [58]:
print "Xtrain : ",X_train.shape
print "Xtest : ",X_test.shape
print "Ytrain : ",Y_train.shape
print "Ytest : ",Y_test.shape
Y_train= Y_train.reshape(Y_train.shape[0],1)
Y_test= Y_test.reshape(Y_test.shape[0],1)

Xtrain :  (29413, 6)
Xtest :  (12606, 6)
Ytrain :  (29413, 1)
Ytest :  (12606, 1)


### Classification

In [59]:
import tensorflow as tf # import the tensor flow

In [60]:
n_targets = len(np.unique(Y_train))

In [61]:
#Our Learning  Parameters
learning_rate = 0.001
num_epochs = 5
batch_size = 100
display_step = 1

In [107]:
def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(initial)

def bias_variable(shape):
  initial = tf.constant(0.1, shape=shape)
  return tf.Variable(initial)

In [112]:
x = tf.placeholder(tf.float32,[None,6])
y = tf.placeholder(tf.float32,[None,1])
w1 = weight_variable([6,200])
b1 = bias_variable([200])
w2 = weight_variable([200,500])
b2 = weight_variable([500])

sm_w = weight_variable([500,n_targets])
sm_b = bias_variable([n_targets])

In [113]:
affine = tf.nn.relu(tf.matmul(x,w1)+b1)
affine2 = tf.nn.relu(tf.matmul(affine,w2)+b2)
drop_prob = tf.placeholder("float")
drop_out = tf.nn.dropout(affine2, drop_prob)
sm_affine = tf.matmul(drop_out,sm_w)+sm_b
sm = tf.nn.softmax(sm_affine)

In [114]:
# define the loss function...
# define the loss function...
cost = -tf.reduce_sum(y*tf.log(sm))

optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)



In [115]:
correct_prediction = tf.equal(tf.argmax(sm, 1), tf.argmax(y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

In [119]:
init=tf.initialize_all_variables()

In [123]:

saver = tf.train.Saver()
with tf.Session() as session:
    session.run(init) # initalize
    for i in range(num_epochs):
        batch_size=100
        nb = X_train.shape[0]/ batch_size;
        print nb
        pre_batch =0
        for j in range(nb):
            if batch_size <= X_train.shape[0]:
                xs = X_train[pre_batch:batch_size]
                ys = Y_train[pre_batch:batch_size]
                pre_batch = batch_size
                batch_size += 100
                #print batch_size
                session.run(optimizer,feed_dict = {x:xs,y:ys,drop_prob:0.5})
                cost_per_batch = session.run(cost,feed_dict = {x:xs,y:ys,drop_prob:0.5})
            
    print "Optimization Finished!"
    saver.save(session,'model.ckpt')
    print correct_prediction.eval({x: X_test, y: Y_test,drop_prob :1.0})
    print "Accuracy:", accuracy.eval({x: X_test, y: Y_test,drop_prob :1.0}) 


294
294
294
294
294
Optimization Finished!
[ True  True  True ...,  True  True  True]
Accuracy: 1.0


In [78]:
from sklearn import metrics

[array([0, 0, 0, ..., 0, 0, 0])]


In [93]:
from scipy import stats
stats.itemfreq(best)
stats.itemfreq(Y_test)

array([[   0, 2403],
       [   1,   21],
       [   2, 1437],
       [   3,   43],
       [   4,  305],
       [   5, 2137],
       [   6, 1129],
       [   7, 1143],
       [   8, 3988]])