# Logistic Regression

#### Classifier will predict the type of investment the company will get

### Creating Dataset......

In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import plotly.plotly as py

In [2]:
Companies = pd.read_csv('../Dataset/Companies.csv')

In [3]:
print Companies.columns

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city', u'funding_rounds',
       u'founded_year'],
      dtype='object')


In [4]:
Companies.drop(['funding_rounds','founded_year'],axis=1,inplace=True)

In [5]:
print len(Companies)
print len(np.unique(Companies['name']))
print Companies['funding_total_usd'][0]

37875
37875
750,000


In [6]:
Rounds = pd.read_csv('../Dataset/Rounds.csv')

In [7]:
print Rounds.columns

Index([u'company_name', u'funding_round_type', u'funded_year',
       u' raised_amount_usd '],
      dtype='object')


In [8]:
Rounds.rename(columns={'company_name':'name'},inplace=True)

In [9]:
print Rounds.columns

Index([u'name', u'funding_round_type', u'funded_year', u' raised_amount_usd '], dtype='object')


In [10]:
print np.unique(Rounds.funding_round_type)

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [11]:
data = pd.merge(Companies,Rounds,how='inner',on='name')

In [12]:
print data.columns
print len(data)

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city',
       u'funding_round_type', u'funded_year', u' raised_amount_usd '],
      dtype='object')
64107


In [13]:
data.drop(['name','funded_year',' raised_amount_usd ','funding_total_usd'],axis=1,inplace=True)

#### Encoding the categorical features into integers.......

In [14]:
def onehotencode_toIntegers(data):
    
    unique_tokens=np.unique(data['category_code']).copy()
    print "encoding categories...",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['category_code'][data['category_code']==value] = index

    unique_tokens=np.unique(data['status']).copy()
    print "encoding status....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['status'][data['status']==value] = index
        
    unique_tokens=np.unique(data['country_code']).copy()
    print "encoding countries....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['country_code'][data['country_code']==value] = index
        
    unique_tokens=np.unique(data['state_code']).copy()
    print "encoding states....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['state_code'][data['state_code']==value] = index
        
    unique_tokens=np.unique(data['region']).copy()
    print "encoding regions....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['region'][data['region']==value] = index
        
    unique_tokens=np.unique(data['city']).copy()
    print "encoding cities....",len(unique_tokens)
    for index,value in enumerate(unique_tokens):
        data['city'][data['city']==value] = index
        
    unique_tokens=np.unique(data['funding_round_type']).copy()
    print "encoding funding_round_type.....",len(unique_tokens)
    print "unique labels : ",unique_tokens
    for index,value in enumerate(unique_tokens):
        data['funding_round_type'][data['funding_round_type']==value] = index
    
    data.to_csv("data.csv")    
    

In [15]:
print data.columns

Index([u'category_code', u'status', u'country_code', u'state_code', u'region',
       u'city', u'funding_round_type'],
      dtype='object')


In [16]:
data = data.sort(['funding_round_type'])
data.dropna()


sort(columns=....) is deprecated, use sort_values(by=.....)



Unnamed: 0,category_code,status,country_code,state_code,region,city,funding_round_type
30320,manufacturing,operating,USA,CA,SF Bay,LOS ALTOS HILLS,angel
30319,manufacturing,operating,USA,CA,SF Bay,LOS ALTOS HILLS,angel
30262,mobile,operating,USA,CA,SF Bay,San Francisco,angel
30261,mobile,operating,USA,CA,SF Bay,San Francisco,angel
30258,software,operating,USA,CA,San Diego,Carlsbad,angel
30255,consulting,operating,USA,IL,Chicago,Chicago,angel
30304,enterprise,operating,USA,CO,Denver,Boulder,angel
30449,advertising,operating,USA,CA,SF Bay,Mountain View,angel
30438,social,operating,USA,NY,New York,New York,angel
30409,other,operating,USA,CO,Denver,Englewood,angel


#### Removing Noise from the dataset

In [17]:
data = data[data.category_code.notnull()]
data = data[data.status.notnull()]
data = data[data.country_code.notnull()]
data = data[data.state_code.notnull()]
data = data[data.region.notnull()]
data = data[data.city.notnull()]
data = data[data.funding_round_type.notnull()]

In [18]:
print len(data)

42019


In [19]:
Original_data = data.copy()
labels = np.unique(data.funding_round_type)
print labels

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [35]:
from scipy import stats
labels_freq = data.funding_round_type.copy()
freq = stats.itemfreq(labels_freq)
print freq

[['angel' 8185L]
 ['crowdfunding' 71L]
 ['other' 4819L]
 ['post-ipo' 143L]
 ['private-equity' 964L]
 ['series-a' 7174L]
 ['series-b' 3790L]
 ['series-c+' 3708L]
 ['venture' 13165L]]


In [33]:
d = pd.DataFrame()
d['investment_type']= labels
d['freq'] = 0
for i,v in enumerate(freq):
    Index = d.investment_type[d.investment_type==v[0]].index
    print v[1]
    d.at[Index,'freq'] = v[1]

8185
71
4819
143
964
7174
3790
3708
13165


In [36]:
d.to_csv("investment_type.csv")

In [20]:
onehotencode_toIntegers(data)

encoding categories... 43
encoding status.... 4
encoding countries.... 24
encoding states.... 51
encoding regions.... 847
encoding cities.... 2337
encoding funding_round_type..... 9
unique labels :  ['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [21]:
print np.unique(data.funding_round_type)

[0 1 2 3 4 5 6 7 8]


In [22]:
data.dropna()

Unnamed: 0,category_code,status,country_code,state_code,region,city,funding_round_type
30320,19,3,23,4,625,986,0
30319,19,3,23,4,625,986,0
30262,22,3,23,4,625,1792,0
30261,22,3,23,4,625,1792,0
30258,38,3,23,4,642,336,0
30255,5,3,23,14,126,383,0
30304,9,3,23,5,171,233,0
30449,0,3,23,4,625,1285,0
30438,37,3,23,34,482,1352,0
30409,28,3,23,5,171,615,0


Changing datatype of features to integer

In [23]:
data['category_code'] = data['category_code'].astype(int)
data['status'] = data['status'].astype(int)
data['country_code'] = data['country_code'].astype(int)
data['region'] = data['region'].astype(int)
data['state_code'] = data['state_code'].astype(int)
data['city'] = data['city'].astype(int)
data['funding_round_type'] = data['funding_round_type'].astype(int)


#### Creating training and test set by 70 % and 30% ratio

In [24]:
Y = data.funding_round_type

data.drop(['funding_round_type'],axis=1,inplace=True)

In [25]:
print data.columns
print Y[:3]
print len(np.unique(Y))

Index([u'category_code', u'status', u'country_code', u'state_code', u'region',
       u'city'],
      dtype='object')
30320    0
30319    0
30262    0
Name: funding_round_type, dtype: int32
9


In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(data,Y,test_size=0.3)

In [27]:
print "Xtrain : ",X_train.shape
print "Xtest : ",X_test.shape
print "Ytrain : ",Y_train.shape
print "Ytest : ",Y_test.shape

Xtrain :  (29413, 6)
Xtest :  (12606, 6)
Ytrain :  (29413L,)
Ytest :  (12606L,)


### Classification

In [28]:

from sklearn import linear_model

In [29]:
classifier = linear_model.LogisticRegression(penalty='l2',solver='lbfgs',multi_class='multinomial')

In [30]:
classifier.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)

In [31]:
predictions = classifier.predict(X_test)

In [32]:
accuracy = classifier.score(X_test,Y_test)

In [33]:
print accuracy*100,"%"

31.3263525305 %


In [34]:
from sklearn import metrics

In [35]:
print "classification report : "
print(metrics.classification_report(Y_test, predictions))

classification report : 
             precision    recall  f1-score   support

          0       0.24      0.17      0.20      2474
          1       0.00      0.00      0.00        13
          2       0.00      0.00      0.00      1409
          3       0.00      0.00      0.00        37
          4       0.00      0.00      0.00       266
          5       0.00      0.00      0.00      2214
          6       0.00      0.00      0.00      1138
          7       0.00      0.00      0.00      1075
          8       0.32      0.89      0.48      3980

avg / total       0.15      0.31      0.19     12606




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.



In [36]:
print "confusion matrix : "
print(metrics.confusion_matrix(Y_test, predictions))

confusion matrix : 
[[ 420    0    0    0    0    0    0    0 2054]
 [   1    0    0    0    0    0    0    0   12]
 [ 118    0    0    0    0    0    0    0 1291]
 [   2    0    0    0    0    0    0    0   35]
 [  17    0    0    0    0    0    0    0  249]
 [ 349    0    0    0    0    0    0    0 1865]
 [ 177    0    0    0    0    0    0    0  961]
 [ 194    0    0    0    0    0    0    0  881]
 [ 451    0    0    0    0    0    0    0 3529]]


### Visualization

In [37]:
import pylab as pl
import collections

In [38]:
print np.unique(predictions)
print len(np.unique(Y_test))
print np.unique(Y_test)

[0 8]
9
[0 1 2 3 4 5 6 7 8]


In [39]:
print labels

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [40]:
frequencies = collections.Counter(Y_test)
print frequencies

Counter({8: 3980, 0: 2474, 5: 2214, 2: 1409, 6: 1138, 7: 1075, 4: 266, 3: 37, 1: 13})


In [41]:
X = np.arange(len(frequencies))
pl.bar(X, frequencies.values(), align='center', width=0.2)
pl.xticks(X, frequencies.keys())
ymax = max(frequencies.values()) + 1
pl.ylim(0, ymax)
fig = pl.figure()
pl.show()

In [42]:
frequencies = collections.Counter(Y_train)
print frequencies

Counter({8: 9185, 0: 5711, 5: 4960, 2: 3410, 6: 2652, 7: 2633, 4: 698, 3: 106, 1: 58})


In [43]:
for i,v in frequencies.iteritems():
    frequencies[i] /= 29413.0
    frequencies[i] *= 100

print frequencies

Counter({8: 31.227688437085643, 0: 19.416584503450853, 5: 16.863291741746846, 2: 11.593513072450957, 6: 9.016421310304967, 7: 8.951824023391017, 4: 2.373100329786149, 3: 0.3603848638357189, 1: 0.19719171794784618})


In [44]:
size = []
for i,v in frequencies.iteritems():
    size.append(v)

sizes = np.array(size)
sizes = sizes.astype(float)
print sizes

colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral','yellow','blue','green','black','red']
explode = (0.2, 0.2, 0.2, 0.2,0.2,0.2,0.2,0.2,0.2) 

[ 19.4165845    0.19719172  11.59351307   0.36038486   2.37310033
  16.86329174   9.01642131   8.95182402  31.22768844]


In [45]:
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct='%1.1f%%', shadow=True, startangle=90)
plt.axis('equal')

fig = plt.figure()
ax = fig.gca()

In [46]:
plt.show()