# Test on company names

In [1]:
import geopandas

  shapely_geos_version, geos_capi_version_string


In [2]:
import pandas as pd
import numpy as np

import wikipedia

import re

import spacy

from tqdm import tqdm
import random

#to install pycountry:
#pip install pycountry
import pycountry
#pycountry permits to find the isoa3 name of the countries



  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


## Importing the dataset

In [3]:
# Load the raw company dataset (the path is relative to the notebook's working directory).
df_companies = pd.read_csv('Data/company_dataset/companies_sorted.csv')

In [4]:
df_companies.head()

Unnamed: 0.1,Unnamed: 0,name,domain,year founded,industry,size range,locality,country,linkedin url,current employee estimate,total employee estimate
0,5872184,ibm,ibm.com,1911.0,information technology and services,10001+,"new york, new york, united states",united states,linkedin.com/company/ibm,274047,716906
1,4425416,tata consultancy services,tcs.com,1968.0,information technology and services,10001+,"bombay, maharashtra, india",india,linkedin.com/company/tata-consultancy-services,190771,341369
2,21074,accenture,accenture.com,1989.0,information technology and services,10001+,"dublin, dublin, ireland",ireland,linkedin.com/company/accenture,190689,455768
3,2309813,us army,goarmy.com,1800.0,military,10001+,"alexandria, virginia, united states",united states,linkedin.com/company/us-army,162163,445958
4,1558607,ey,ey.com,1989.0,accounting,10001+,"london, greater london, united kingdom",united kingdom,linkedin.com/company/ernstandyoung,158363,428960


## Constructing the sentences

In [5]:
df_companies['size range'].unique()

array(['10001+', '5001 - 10000', '1001 - 5000', '501 - 1000', '201 - 500',
       '51 - 200', '11 - 50', '1 - 10'], dtype=object)

In [6]:
df_companies['industry'].value_counts()

information technology and services    453044
marketing and advertising              338162
construction                           257921
management consulting                  194556
real estate                            193486
                                        ...  
fishery                                  2355
ranching                                 2324
railroad manufacture                     1818
tobacco                                  1354
legislative office                        524
Name: industry, Length: 148, dtype: int64

To reduce the size of the dataset, we keep only the companies in the two largest size ranges (`10001+` and `5001 - 10000`) that belong to one of the four most frequent industries.

In [7]:
# Keep only the companies of the two largest size ranges that belong to one of the selected industries.
size_ranges_kept = ['10001+', '5001 - 10000']
industries_kept = [
    'information technology and services',
    'marketing and advertising',
    'construction',
    'management consulting',
]
is_large_company = df_companies['size range'].isin(size_ranges_kept)
is_kept_industry = df_companies['industry'].isin(industries_kept)
df_companies_reduced = df_companies[is_large_company & is_kept_industry].copy()

In [8]:
df_companies_reduced['country'].unique()

array(['united states', 'india', 'ireland', 'france', 'sweden', 'canada',
       nan, 'australia', 'spain', 'germany', 'czechia', 'south africa',
       'malaysia', 'finland', 'brazil', 'switzerland', 'greece',
       'netherlands', 'liechtenstein', 'united kingdom', 'mexico',
       'austria', 'singapore', 'argentina', 'chile', 'indonesia', 'italy',
       'south korea', 'japan', 'israel', 'new zealand', 'pakistan',
       'hong kong', 'denmark', 'norway', 'china', 'peru', 'egypt',
       'saudi arabia', 'united arab emirates', 'kuwait', 'belgium'],
      dtype=object)

There are still a good number of countries.

We keep at most the five largest companies of each country (ranked by current employee estimate).

In [9]:
# Keep (at most) the five largest companies of each country, ranked by current employee estimate.
# The columns are selected with a list: selecting them with a bare tuple of keys
# (groupby(...)['a', 'b']) is deprecated and is rejected by pandas >= 2.0.
# reset_index() turns the (country, original row label) index into the
# 'country' and 'level_1' columns.
df_companies_largest = (
    df_companies_reduced
    .groupby('country')[['current employee estimate', 'name']]
    .apply(lambda group: group.nlargest(5, columns=['current employee estimate']))
    .reset_index()
)

  """Entry point for launching an IPython kernel.


In [10]:
df_companies_largest

Unnamed: 0,country,level_1,current employee estimate,name
0,argentina,1932,4426,globant
1,argentina,5908,1797,techint engineering & construction
2,australia,284,17581,fujitsu
3,australia,778,8719,lendlease
4,australia,1563,5170,aurecon
...,...,...,...,...
109,united states,0,274047,ibm
110,united states,5,127952,hewlett-packard
111,united states,6,122031,cognizant technology solutions
112,united states,14,104112,deloitte


## Constructing the templates, inspired by the winogender schemas

For each selected company, we fetch the summary of its Wikipedia page and keep the sentences that mention the company name.

In [11]:
# Collect, for every company, the sentences of its Wikipedia summary that mention its name.
list_sentences = []
list_companies = []
for company in df_companies_largest['name'].unique():
    try:
        summary = wikipedia.summary(company)
        # Case-insensitive containment test; the lowered name is computed once per company.
        company_lower = company.lower()
        for sentence in summary.split('. '):
            if company_lower in sentence.lower():
                list_sentences.append(sentence)
                list_companies.append(company)
    except Exception:
        # Best effort: a company whose lookup fails is reported and skipped.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so the loop can still be interrupted.
        print('not found : ', company)

# Constructing the dataframe: one row per (sentence, company) pair.
df_sentences_template_companies = pd.DataFrame(data={'sentences': list_sentences, 'name': list_companies})

not found :  jsmnet
not found :  hospital & healthcare brazil
not found :  algar tech




  lis = BeautifulSoup(html).find_all('li')


not found :  serpro
not found :  cgi
not found :  sonda
not found :  china state construction engrg. corp. ltd(cscec)
not found :  tieto
not found :  ccc
not found :  asl
not found :  infosys
not found :  engineering ingegneria informatica spa
not found :  libero professionista geometra
not found :  ntt data italia
not found :  myself (as an independent consultant)
not found :  sinsilog's network and marketing
not found :  gym s.a. - grupo graña y montero
not found :  samsung sds
not found :  indra
not found :  everis
not found :  tecnocom
not found :  ncc
not found :  sita
not found :  amaris
not found :  arabtec construction llc
not found :  oracle


In [12]:
df_sentences_template_companies.to_csv('Data/templates/sentence_templates_companies.csv', index=False)

In [13]:
df_sentences_template_companies.head()

Unnamed: 0,sentences,name
0,Globant is an IT and Software Development comp...,globant
1,Lendlease Group is an Australian multinational...,lendlease
2,"Aurecon is an engineering, management, design...",aurecon
3,STRABAG SE is an Austrian construction company...,strabag
4,In these markets STRABAG has subsidiaries or o...,strabag


In [14]:
df_sentences_template_companies.shape

(114, 2)

**Do we have to perform some additional cleaning ?**

In [15]:
def clean_template(row):
    """
    Replace every occurrence of the company name in a sentence by '$COMPANY'.

    The function is applied to each row of the templates dataframe so that the
    resulting sentence can later be filled with any company name.

    Parameters
    ----------
    row : mapping (e.g. a dataframe row) with the keys
        'sentences' : the original sentence
        'name'      : the company name to look for

    Returns
    -------
    str or None
        The sentence with all occurrences of the name (case-insensitive)
        replaced by '$COMPANY', or None when the name does not occur in the
        sentence.
    """
    # The name is matched literally: without re.escape, characters such as
    # '.', '(' or ')' in a company name would be interpreted as regex syntax.
    pattern = re.escape(row['name'])
    # A callable replacement keeps '$COMPANY' from being parsed as a replacement template.
    sent, n_replaced = re.subn(pattern, lambda match: '$COMPANY', row['sentences'], flags=re.IGNORECASE)
    if n_replaced == 0:
        # No placeholder could be inserted: the sentence is not a usable template.
        return None
    return sent

In [16]:
# Build the '$COMPANY' template of every sentence by applying clean_template row by row.
df_sentences_template_companies['clean_sentence'] = df_sentences_template_companies.apply(clean_template, axis=1)

In [17]:
df_sentences_template_companies.head()

Unnamed: 0,sentences,name,clean_sentence
0,Globant is an IT and Software Development comp...,globant,$COMPANY is an IT and Software Development com...
1,Lendlease Group is an Australian multinational...,lendlease,$COMPANY Group is an Australian multinational ...
2,"Aurecon is an engineering, management, design...",aurecon,"$COMPANY is an engineering, management, desig..."
3,STRABAG SE is an Austrian construction company...,strabag,$COMPANY SE is an Austrian construction compan...
4,In these markets STRABAG has subsidiaries or o...,strabag,In these markets $COMPANY has subsidiaries or ...


In [18]:
# Save the templates together with their cleaned version.
# NOTE(review): this is written before the dropna of the next cell, so rows with a
# missing clean_sentence (if any) are still present in the file - confirm intended.
df_sentences_template_companies.to_csv('Data/templates/sentence_templates_companies_clean.csv', index=False)

In [19]:
# Drop, in place, the rows that contain a missing value (clean_template yields None
# when no company name was replaced in the sentence).
df_sentences_template_companies.dropna(inplace=True)

## Utils 

In [20]:
# Load the four English spaCy pipelines compared in this notebook.
# predict_NER_spacy selects one of these module-level names from its `model` argument.
nlp_sm = spacy.load("en_core_web_sm")
nlp_md = spacy.load("en_core_web_md")
nlp_lg = spacy.load("en_core_web_lg")
nlp_trf = spacy.load("en_core_web_trf")

In [21]:
def predict_NER_spacy(text, model):
    """
    Run one of the loaded spaCy pipelines on `text` and list the entities it found.

    text: str
        the sentence to analyse
    model: str
        key of the pipeline to use, either:
            'sm'
            'md'
            'lg'
            'trf'

    Returns the list of the stripped entity texts, or np.nan when `model` is
    not one of the keys above.
    """
    # The pipelines are wrapped in lambdas so that only the requested one is looked up.
    pipeline_getters = (
        ('sm', lambda: nlp_sm),
        ('md', lambda: nlp_md),
        ('lg', lambda: nlp_lg),
        ('trf', lambda: nlp_trf),
    )
    for key, get_pipeline in pipeline_getters:
        if model == key:
            entities = get_pipeline()(text).ents
            return [str(entity).strip() for entity in entities]
    return np.nan

In [22]:
def check_names_prediction_company(row):
    """Return 1 when row['name'] is contained in row['spacy_prediction'], 0 otherwise."""
    return 1 if row['name'] in row['spacy_prediction'] else 0

In [23]:
def compute_stats_names(df_test_company, model):
    """
    Average the 'found' flag of the test sentences for each company name.

    df_test_company: dataframe with (at least) the columns 'found' and 'name'
    model: str, name of the model; the score column is called 'score_' + model

    Returns a dataframe with one row per name and the columns 'name' and
    'score_<model>' (mean of 'found' for that name).
    """
    score_column = 'score_' + model
    per_sentence = pd.DataFrame({
        score_column: df_test_company['found'],
        'name': df_test_company['name'],
    })
    # One mean score per name; 'name' is moved back from the index to a column.
    per_name = per_sentence.groupby('name').agg({score_column: 'mean'})
    return per_name.reset_index(drop=False)

### Constructing the sentences

In [24]:
def compute_random_sentences_companies(row, arr_names, template):
    """
    Build one test sentence from a random template and a random company name.

    row: unused; present so that the function can be called through DataFrame.apply
    arr_names: sequence of company names to draw from
    template: dataframe with a 'clean_sentence' column containing '$COMPANY' placeholders

    Returns the tuple (sentence, company name inserted in it).
    """
    # Draw the template first, then the name that fills its placeholders.
    drawn_template = template.sample(1)['clean_sentence'].iat[0]
    drawn_name = random.choice(arr_names)
    return drawn_template.replace('$COMPANY', drawn_name), drawn_name

### Main

In [25]:
def main_companies(n_sentences, arr_names, template):
    """
    Generate random test sentences and score each spaCy model on every company name.

    n_sentences: int, number of random sentences to generate
    arr_names: sequence of company names inserted in the templates
    template: dataframe of templates, passed to compute_random_sentences_companies

    Returns a dataframe with a 'name' column and one 'score_<model>' column per
    model ('sm', 'md', 'lg', 'trf'): the mean, over the sentences generated for
    that name, of the 0/1 flag computed by check_names_prediction_company.
    """
    # Computing the sentences: one (sentence, name) pair per row.
    df_test_company = pd.DataFrame(index=range(n_sentences))
    # Temporary dummy column, removed right after the apply
    # (presumably needed so that the row-wise apply has something to iterate over - TODO confirm).
    df_test_company['tt']= 1 
    df_test_company['sentence'], df_test_company['name'] = zip(*df_test_company.apply(lambda x: compute_random_sentences_companies(x, arr_names, template), axis=1))
    del df_test_company['tt']
    
    
    # We define the dataframe of the results, starting with one row per name:
    df_results=pd.DataFrame(data={'name':arr_names})
    # Then, we apply the models one after the other on the same sentences:
    for model in ['sm', 'md', 'lg', 'trf']:
        
        print('Testing the model: '+model)
        
        # Registers progress_apply on pandas objects.
        tqdm.pandas()
        df_test_company['spacy_prediction'] = df_test_company['sentence'].progress_apply(lambda x: predict_NER_spacy(x, model))
        # The list of entities is joined into a single string so that the check below is a
        # substring test: with a list, an entity that merely contains the name
        # (e.g. a longer span around it) would not count as a hit.
        df_test_company['spacy_prediction'] = df_test_company['spacy_prediction'].apply(lambda l : ' '.join(l))
        # We flag (0/1) whether the inserted name appears among the predicted entities:
        df_test_company['found'] = df_test_company.apply(check_names_prediction_company, axis=1)
        
        
        # Computing the per-name mean score and merging it with the results.
        # how='right' keeps only the names present in the computed stats, i.e. the
        # names that were drawn in at least one sentence.
        df_results = df_results.merge(compute_stats_names(df_test_company, model), left_on='name', right_on='name', how='right')
        

    return df_results

## Defining the company names to be used

We are only taking the tech companies

In [27]:
# Restrict the full dataset (all company sizes) to the 'information technology and services' industry.
df_companies_tech = df_companies[df_companies['industry']=='information technology and services']

In [29]:
# We are only taking the countries with at least 5 tech companies.
country_counts = df_companies_tech['country'].value_counts()
countries = country_counts[country_counts >= 5].keys().to_list()

In [30]:
# Keep only the tech companies located in one of the countries selected above.
df_companies_tech = df_companies_tech[df_companies_tech['country'].isin(countries)]

In [32]:
# We are only taking 5 companies for each country - the largest by current employee estimate.
# The columns are selected with a list: selecting them with a bare tuple of keys
# (groupby(...)['a', 'b']) is deprecated and is rejected by pandas >= 2.0.
df_companies_tech_largest = (
    df_companies_tech
    .groupby('country')[['current employee estimate', 'name']]
    .apply(lambda group: group.nlargest(5, columns=['current employee estimate']))
    .reset_index()
)

# reset_index() exposes the original row labels as 'level_1'; they are not used afterwards.
del df_companies_tech_largest['level_1']

  


In [33]:
# Distinct company names that will be inserted in the sentence templates.
arr_names_companies = df_companies_tech_largest['name'].unique()

## Applying the models

We can apply the models to each company and each sentence. 

In [None]:
# Generate 10000 random sentences and score the four spaCy models on them (see main_companies).
df_results_companies = main_companies(10000, arr_names_companies, df_sentences_template_companies)

### Merging to have the results over the countries

In [None]:
# Attach the country (and employee estimate) of each company to its scores: left join on 'name'.
df_results_companies = df_results_companies.merge(df_companies_tech_largest, right_on='name', left_on='name', how='left')

In [None]:
# Average each model's score over the companies of every country.
score_columns = ['score_sm', 'score_md', 'score_lg', 'score_trf']
df_results_companies = df_results_companies.groupby('country').agg(dict.fromkeys(score_columns, 'mean'))
# Compute the average score over the four models:
df_results_companies['avg_score'] = df_results_companies[score_columns].mean(axis=1)

### Final results

In [None]:
df_results_companies.sort_values('avg_score', ascending=False)

In [None]:
# Save the per-country results (the 'country' index is written as the first column).
# NOTE(review): unlike the other files written by this notebook, this path has no
# '.csv' extension - confirm intended.
df_results_companies.to_csv('Data/results_companies_clean')

#### Plots

In [None]:
# Load the low-resolution Natural Earth world map bundled with geopandas.
# NOTE(review): geopandas.datasets appears to be deprecated in recent geopandas releases
# (and removed in 1.0) - confirm against the installed version.
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))

In [None]:
world.head()

## Other tests to avoid biases

Maybe the test is biased because some countries have companies with very low employee estimates. 