# Libs

In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
from scipy import spatial
import gensim
from gensim.models import Word2Vec 
# import nltk
# nltk.download('punkt')
from nltk.corpus import stopwords

import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
# Porter stemmer instance; it is created here but not referenced by any of
# the cells shown in this notebook.
porter_stemmer = PorterStemmer()
# NLTK's English stop-word list, held in a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded locally — confirm on a fresh environment.
stop_words = set(stopwords.words('english'))


In [None]:
#pip install stop-words

## Read files

In [4]:
# Source workbooks. These are absolute local paths, so they must be adjusted
# when the notebook is run on another machine.
DESC_PATH = r"C:\Users\Aishwarya\Desktop\ISB task\OneDrive_1_01-10-2019\company descriptions.xlsx"
KEYW_PATH = r"C:\Users\Aishwarya\Desktop\ISB task\OneDrive_1_01-10-2019\Industry Segments - Top 10 Keywords.xlsx"

# read_excel opens, parses and closes the workbook in a single call, so no
# ExcelFile handle is left open, and `desc` / `keyw` only ever name DataFrames
# (never a file handle first and a frame afterwards).
desc = pd.read_excel(DESC_PATH, sheet_name='Sheet1')
keyw = pd.read_excel(KEYW_PATH, sheet_name='Sheet1')

## Combine descriptions and remove special chars

In [5]:
# Normalise the raw text: non-breaking spaces and newlines become plain
# spaces, and missing cells become empty strings so the string concatenation
# below never meets a NaN.
desc = (
    desc
    .replace(u'\xa0', u' ', regex=True)
    .replace(u'\n', u' ', regex=True)
    .replace(np.nan, '', regex=True)
)

# First word of the company name. An empty name yields NaN from `.str[0]`,
# which would turn the whole concatenation into NaN, so fall back to ''.
name_first_word = desc['company_name'].str.split().str[0].fillna('')

# One text blob per company: short description, long description and the
# first word of the name, closed with a full stop. Every part is separated by
# a space so the last word of the description is not fused with the name.
desc['desc_tot'] = (
    desc['company_short_description'] + " "
    + desc['company_description'] + " "
    + name_first_word + "."
)

# Strip quotes and commas. `regex=True` is explicit because pandas >= 2.0
# treats the pattern as a literal string by default, which would silently
# leave the text untouched.
desc['desc_tot'] = desc['desc_tot'].str.replace(r"[\"\',]", '', regex=True)

## Collapse desc_tot into a single string

In [6]:
# Concatenate every company's combined description into one corpus string,
# with a single space between consecutive descriptions.
corp = ' '.join(desc['desc_tot'])

## Tokenize into sentences and words, and remove stop words

In [None]:
# Build the Word2Vec training corpus: one list of lower-cased tokens per
# sentence of `corp`. `stop_words` comes from the imports cell at the top of
# the notebook, so it is not re-imported here.
#
# Tokens are lower-cased BEFORE the stop-word test: the NLTK stop-word list is
# lower case, so testing the raw token lets capitalised stop words such as
# "The" or "And" through (they would then be stored as "the" / "and").
data = []

for sentence in sent_tokenize(corp):
    lowered = [word.lower() for word in word_tokenize(sentence)]
    data.append([token for token in lowered if token not in stop_words])

## Train the word2vec model

In [None]:
# Dimensionality of the learned word vectors. The per-company and per-segment
# arrays built further down must have this many columns.
VECTOR_SIZE = 100

# gensim 4.0 renamed the `size` keyword to `vector_size`; choose whichever the
# installed release understands so this cell runs on both 3.x and 4.x.
_size_kwarg = 'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'

# Create CBOW model (CBOW is gensim's default training algorithm).
# min_count = 1 keeps every token, including words that occur only once.
model1 = Word2Vec(data, min_count=1, window=5, **{_size_kwarg: VECTOR_SIZE})


### Sample results

In [None]:
# Spot-check the embedding on a few word pairs.
# Similarity is queried through `model1.wv`: the model-level `similarity`
# shortcut is deprecated in gensim 3.x and removed in 4.0, while
# `wv.similarity` exists in both. A KeyError is raised if either word is not
# in the trained vocabulary.
for first, second in [('food', 'drink'), ('food', 'machine'), ('machine', 'work')]:
    print("Cosine similarity between '" + first + "' " +
          "and '" + second + "' - CBOW : ",
          model1.wv.similarity(first, second))

## Mean vector for each company

In [15]:
def get_mean_vector(word2vec_model, words):
    """Average the embedding vectors of the words known to the model.

    Parameters
    ----------
    word2vec_model
        Trained model that exposes its word vectors as ``.wv`` (for example a
        gensim ``Word2Vec`` instance).
    words : iterable of str
        Tokens to average. Tokens missing from the vocabulary are ignored.

    Returns
    -------
    numpy.ndarray or list
        Element-wise mean of the vectors of the in-vocabulary words, or an
        empty list when none of ``words`` is in the vocabulary (the empty-list
        return is kept so existing callers behave as before).
    """
    # Go through `.wv` for both the membership test and the lookup:
    # `wv.vocab` and `model[words]` were removed in gensim 4.0, whereas
    # `word in model.wv` and `model.wv[words]` work on 3.x and 4.x alike.
    keyed_vectors = word2vec_model.wv

    # remove out-of-vocabulary words
    known = [word for word in words if word in keyed_vectors]
    if not known:
        return []
    return np.mean(keyed_vectors[known], axis=0)

In [None]:
# One row per company, as wide as the model's word vectors. A row stays
# all-zero when no token of the description is in the model vocabulary.
vecs = np.zeros((desc.shape[0], model1.wv.vector_size))
# err[i] is 1 when a mean vector was stored for row i and 0 otherwise.
err = []

for i in range(desc.shape[0]):
    # NOTE(review): column position 3 is presumably the combined `desc_tot`
    # text — confirm against the workbook's column layout.
    comp_desc = desc.iloc[i, 3]

    # Lower-case first, then drop stop words: the stop-word list is lower
    # case, so testing the raw token would let "The", "And", ... through.
    # The tokens get their own name so the training corpus held in `data`
    # is not overwritten by this loop.
    tokens = [word.lower() for word in word_tokenize(comp_desc)]
    tokens = [token for token in tokens if token not in stop_words]

    # get_mean_vector returns an empty list when nothing is in-vocabulary.
    # Checking for that explicitly (instead of a bare `except:`) means real
    # failures are raised rather than silently recorded as a zero vector.
    mean_vec = get_mean_vector(model1, tokens)
    if len(mean_vec) > 0:
        vecs[i] = mean_vec
        err.append(1)
    else:
        err.append(0)

In [17]:
# Turn each row of the company vector matrix into a plain Python list so a
# whole vector fits in a single DataFrame cell.
vecs_all = [row.tolist() for row in vecs]

# Two columns: the vector (as a list) and the company name, aligned by row
# position with `desc`.
comps = pd.DataFrame({'list_vec': vecs_all})
comps['company'] = desc["company_name"]

## Industry keyword data cleaning

In [18]:
# Same clean-up steps as a single chain: non-breaking spaces and newlines
# become plain spaces, missing cells become empty strings, and commas are
# removed from every column.
keyw = (
    keyw
    .replace(u'\xa0', u' ', regex=True)
    .replace(u'\n', u' ', regex=True)
    .replace(np.nan, '', regex=True)
    .apply(lambda column: column.str.replace(',', ''))
)

In [None]:
# One row per industry segment, as wide as the model's word vectors. A row
# stays all-zero when none of the segment's keywords is in the vocabulary.
vecs = np.zeros((keyw.shape[0], model1.wv.vector_size))
# err[i] is 1 when a mean vector was stored for row i and 0 otherwise.
err = []

for i in range(keyw.shape[0]):
    # NOTE(review): column position 1 is presumably the segment's keyword
    # text — confirm against the workbook's column layout.
    comp_desc = keyw.iloc[i, 1]

    # Lower-case first, then drop stop words: the stop-word list is lower
    # case, so testing the raw token would let "The", "And", ... through.
    # The tokens get their own name so the training corpus held in `data`
    # is not overwritten by this loop.
    tokens = [word.lower() for word in word_tokenize(comp_desc)]
    tokens = [token for token in tokens if token not in stop_words]

    # get_mean_vector returns an empty list when nothing is in-vocabulary.
    # Checking for that explicitly (instead of a bare `except:`) means real
    # failures are raised rather than silently recorded as a zero vector.
    mean_vec = get_mean_vector(model1, tokens)
    if len(mean_vec) > 0:
        vecs[i] = mean_vec
        err.append(1)
    else:
        err.append(0)

In [20]:
# Turn each row of the segment vector matrix into a plain Python list so a
# whole vector fits in a single DataFrame cell.
vecs_all = [row.tolist() for row in vecs]

# Two columns: the vector (as a list) and the segment name, aligned by row
# position with `keyw`.
segs = pd.DataFrame({'list_vec_segs': vecs_all})
segs['Industry segment'] = keyw["Industry segment"]

## Matching

In [None]:
# Give both frames the same constant key; merging on it pairs every company
# row with every segment row (a cross join).
for frame in (comps, segs):
    frame['key'] = 1

cross = comps.merge(segs, on='key')

In [42]:
# Cosine similarity between the company vector and the segment vector of
# every (company, segment) pair, stored in a new `cosine_sim` column.
#
# The vectors are addressed by column name. Positional access (reading
# columns 3 and 5, writing column 2) relied on a column layout and an already
# existing `cosine_sim` column that no visible cell creates, so it fails on a
# fresh kernel: the merged frame has only five columns and position 2 is the
# join key.
# NOTE(review): rows whose vector is all-zero (no in-vocabulary words) have
# no defined cosine; the result for them may be NaN.
cross['cosine_sim'] = [
    1 - spatial.distance.cosine(company_vec, segment_vec)
    for company_vec, segment_vec in zip(cross['list_vec'], cross['list_vec_segs'])
]

In [None]:
# Highest similarity reached by each company (a Series indexed by company).
best_per_company = cross.groupby(['company'])["cosine_sim"].max()

# Keep only the (company, segment) rows whose similarity equals that maximum.
df_fin = pd.merge(cross, best_per_company, on = ["company", "cosine_sim"])

In [None]:
# Write the best-matching segment(s) per company, with the similarity score,
# to a CSV file in the current working directory.
output_columns = ['company', 'Industry segment', 'cosine_sim']
df_fin[output_columns].to_csv("classify_selftrained_vectors.csv")