# Linear Regression

#### Classifier will predict future total investment per year 

### Creating Dataset......

In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import plotly.plotly as py

In [2]:
Companies = pd.read_csv('../Dataset/Companies.csv')

In [3]:
print Companies.columns

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city', u'funding_rounds',
       u'founded_year'],
      dtype='object')


In [4]:
Companies.drop(['funding_rounds','founded_year'],axis=1,inplace=True)

In [5]:
print len(Companies)
print len(np.unique(Companies['name']))
print Companies['funding_total_usd'][0]

37875
37875
750,000


In [6]:
Rounds = pd.read_csv('../Dataset/Rounds.csv')

In [7]:
print Rounds.columns

Index([u'company_name', u'funding_round_type', u'funded_year',
       u' raised_amount_usd '],
      dtype='object')


In [8]:
Rounds.rename(columns={'company_name':'name'},inplace=True)

In [9]:
print Rounds.columns

Index([u'name', u'funding_round_type', u'funded_year', u' raised_amount_usd '], dtype='object')


In [10]:
print np.unique(Rounds.funding_round_type)

['angel' 'crowdfunding' 'other' 'post-ipo' 'private-equity' 'series-a'
 'series-b' 'series-c+' 'venture']


In [11]:
data = pd.merge(Companies,Rounds,how='inner',on='name')

In [12]:
print data.columns
print len(data)

Index([u'name', u'category_code', u'funding_total_usd', u'status',
       u'country_code', u'state_code', u'region', u'city',
       u'funding_round_type', u'funded_year', u' raised_amount_usd '],
      dtype='object')
64107


In [13]:
new_data = pd.DataFrame()
new_data['funded_year'] = data['funded_year'].copy()
new_data['raised_amount_usd'] = data[' raised_amount_usd '].copy()
print new_data.columns

Index([u'funded_year', u'raised_amount_usd'], dtype='object')


#### Removing Noise from the dataset

In [14]:
new_data = new_data[new_data.funded_year.notnull()]
new_data = new_data[new_data.raised_amount_usd.notnull()]

In [15]:
print len(new_data)

56049


In [16]:
new_data['raised_amount_usd'] = new_data['raised_amount_usd'].str.replace(',','')
new_data['raised_amount_usd'] = new_data['raised_amount_usd'].astype(long)

In [17]:
new_data.to_csv('data.csv')

In [18]:
funded_years = np.unique(new_data.funded_year).copy()
Data = pd.DataFrame()
Data['funded_years'] = funded_years
result = 0.0
Data['raised_amount_usd'] = 0
Data['raised_amount_usd'] = Data['raised_amount_usd'].astype(long)

for year in funded_years:
    result = 0.0
    Index = Data.funded_years[Data.funded_years==year].index
    indexes = (new_data[new_data['funded_year']==year].index.tolist())
    for i in indexes:
        if i <= 56047:
            result += new_data.at[i,'raised_amount_usd']
    Data.at[Index,'raised_amount_usd'] = result
            

In [19]:
print Data
Data.to_csv("Investments_by_year.csv")

    funded_years  raised_amount_usd
0           1960           52736030
1           1984             173444
2           1987            2500000
3           1989              15000
4           1990            1000000
5           1992                  0
6           1993             125000
7           1994            1740000
8           1995           19000000
9           1996            8012500
10          1997           74800000
11          1998          167609790
12          1999          603015573
13          2000         1467477148
14          2001          806074412
15          2002          769409353
16          2003          938289343
17          2004         2200265765
18          2005        13343014221
19          2006        97211353795
20          2007        24706240065
21          2008        34674627064
22          2009        43656872957
23          2010        52394688617
24          2011        57095895742
25          2012        50600929455
26          2013        8556

Changing datatype of features to integer

In [20]:
Data['funded_years'] = Data['funded_years'].astype(int)
Data['raised_amount_usd'] = Data['raised_amount_usd'].astype(long)


#### Creating training and test set by 70 % and 30% ratio

In [21]:
Y = Data.raised_amount_usd

Data.drop(['raised_amount_usd'],axis=1,inplace=True)

In [42]:
print Data.columns
print Y[:3]
print len(np.unique(Y))

Index([u'funded_years'], dtype='object')
0    52736030
1      173444
2     2500000
Name: raised_amount_usd, dtype: int64
28


In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(Data,Y,test_size=0.3)

In [56]:
print "Xtrain : ",X_train.shape
print "Xtest : ",X_test.shape
print "Ytrain : ",Y_train.shape
print "Ytest : ",Y_test.shape

Xtrain :  (19, 1)
Xtest :  (9, 1)
Ytrain :  (19L,)
Ytest :  (9L,)


### Classification

In [57]:
import matplotlib.pyplot as plt
from sklearn import linear_model

In [58]:
classifier = linear_model.LinearRegression()

In [59]:
classifier.fit(X_train,Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [60]:
# The coefficients
print('Coefficients: \n', classifier.coef_)
# The mean square error
print("Residual sum of squares: %.2f"
      % np.mean((classifier.predict(X_test) - Y_test) ** 2))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % classifier.score(X_test, Y_test))


('Coefficients: \n', array([  2.22989557e+09]))
Residual sum of squares: 1203599787828744552448.00
Variance score: -0.23


In [61]:
# Plot outputs
plt.scatter(X_test, Y_test,  color='black')
plt.plot(X_test, classifier.predict(X_test), color='blue',
         linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()

#### Total Investment Prediction for 2020

In [76]:
#prediction for 2020
test = ['2015']
test = map(int,test)
test = np.array(test)
prediction = classifier.predict(test)


Passing 1d arrays as data is deprecated in 0.17 and willraise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.



In [77]:
print prediction

[  4.90866430e+10]
