# Doc2Vec for Stock Profile Search

This notebook uses document vectors (Doc2Vec) to search for public companies with similar profiles, i.e. similar company descriptions.

In [1]:
import pandas as pd

In [13]:
#Load the company profiles from the CSV file (the path is relative to the
#working directory the notebook is run from).

data = pd.read_csv('company_profiles.csv')

In [14]:
#Preview the first rows of the loaded table.
data.head()

Unnamed: 0.1,Unnamed: 0,companyName,symbol,sector,industry,description,isActivelyTrading,isEtf,mktCap,market
0,1,Comcast Corp,CMCSA,Communication Services,Entertainment,Comcast Corporation operates as a media and te...,True,False,255350700000.0,US
1,2,Kinder Morgan Inc,KMI,Energy,Oil & Gas Midstream,"Kinder Morgan, Inc. operates as an energy infr...",True,False,39437890000.0,US
2,3,Intel Corp,INTC,Technology,Semiconductors,"Intel Corporation designs, manufactures, and s...",True,False,234850100000.0,US
3,4,Micron Technology Inc,MU,Technology,Semiconductors,"Micron Technology, Inc. designs, manufactures,...",True,False,97742970000.0,US
4,6,General Electric Co,GE,Industrials,Specialty Industrial Machinery,General Electric Company operates as a high-te...,True,False,115395200000.0,US


In [15]:
#List the column labels of the loaded table.
data.columns

Index(['Unnamed: 0', 'companyName', 'symbol', 'sector', 'industry',
       'description', 'isActivelyTrading', 'isEtf', 'mktCap', 'market'],
      dtype='object')

In [16]:
#Total number of rows in the loaded table.
len(data)

18598

In [17]:
#Keep only rows flagged as actively trading and not flagged as ETFs, then count them.
new_data = data[data['isActivelyTrading'].eq(True) & data['isEtf'].eq(False)]
len(new_data)

18598

In [18]:
#Count the rows whose market is 'US'.
len(data[data['market'].eq('US')])

8749

In [19]:
#Distinct market codes present in the data.
set(data['market'])

{'CAN', 'INT', 'US'}

In [20]:
#Summarise dtypes and non-null counts per column; the non-null count shown for
#`description` is the number of rows that have a company description.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18598 entries, 0 to 18597
Data columns (total 10 columns):
Unnamed: 0           18598 non-null int64
companyName          18594 non-null object
symbol               18597 non-null object
sector               16327 non-null object
industry             16326 non-null object
description          18023 non-null object
isActivelyTrading    18598 non-null bool
isEtf                18598 non-null bool
mktCap               18596 non-null float64
market               18598 non-null object
dtypes: bool(2), float64(1), int64(1), object(6)
memory usage: 1.2+ MB


In [24]:
#Keep only the rows that have a description and renumber the index from 0.
new_data = new_data[new_data['description'].notna()].reset_index(drop=True)

In [25]:
#Re-check dtypes and non-null counts of the filtered table.
#NOTE(review): the saved output lists an extra `index` column (11 columns),
#which reset_index(drop=True) would not create - the output looks stale;
#restart the kernel and re-run to confirm.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18023 entries, 0 to 18022
Data columns (total 11 columns):
index                18023 non-null int64
Unnamed: 0           18023 non-null int64
companyName          18022 non-null object
symbol               18022 non-null object
sector               16168 non-null object
industry             16177 non-null object
description          18023 non-null object
isActivelyTrading    18023 non-null bool
isEtf                18023 non-null bool
mktCap               18021 non-null float64
market               18023 non-null object
dtypes: bool(2), float64(1), int64(2), object(6)
memory usage: 1.3+ MB


In [32]:
#Character-length statistics (mean, min, max) of the company descriptions.
description_lengths = new_data['description'].str.len()

print('Average: %s' % (description_lengths.mean()))
print('Smallest: %s' % (description_lengths.min()))
print('Largest: %s' % (description_lengths.max()))

Average: 937.6663152638296
Smallest: 40
Largest: 4977
