In [5]:
import pandas as pd
import numpy as np
import os

In [6]:
# The data folder sits one level above the kernel's working directory;
# abspath collapses the "../" so DATA_DIR is a clean absolute path.
DATA_DIR = os.path.abspath(
    os.path.join(os.getcwd(), "../", "data"),
)
# Parquet file with the preprocessed business descriptions.
NAICS_DATA = os.path.join(
    DATA_DIR,
    "processed/coverwallet_preprocessed.parquet",
)

In [7]:
# Load the preprocessed descriptions into memory.
df = pd.read_parquet(path=NAICS_DATA)

In [8]:
# Peek at the first rows to check which columns were loaded.
df.head()

Unnamed: 0,NAICS,BUSINESS_DESCRIPTION,NAICS_2,PREPROCESSED_DESCRIPTION
0,722511.0,Zenyai Viet Cajun & Pho Restaurant is dedicate...,72,Zenyai Viet Cajun Pho Restaurant dedicate offe...
1,541330.0,"Kilduff Underground Engineering, Inc. (KUE) is...",54,Kilduff Underground Engineering Inc. KUE geote...
2,453998.0,024™ is a premium home fragrance brand that de...,45,024 ™ premium home fragrance brand design elev...
3,561720.0,Our Services include Office Cleaning Carpet cl...,56,service include office cleaning carpet cleanin...
4,621610.0,NYS Licensed Home Health Agency,62,NYS license Home Health Agency


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary on the preprocessed descriptions and encode each one as
# a TF-IDF row; row i of X corresponds to row i of df.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(raw_documents=df["PREPROCESSED_DESCRIPTION"])

Let's calculate the cosine similarity between the texts in naics 11 and the rest of the descriptions.


In [10]:
# Keep only the TF-IDF rows whose NAICS_2 code is 11.
X_11 = X[df["NAICS_2"].eq(11).to_numpy()]

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# One row per naics 11 description, one column per description in the corpus.
cosine_sim = cosine_similarity(X=X_11, Y=X)

In [12]:
# Sanity check on the size of the similarity matrix computed from X_11 and X.
cosine_sim.shape

(26, 14175)

In [35]:
def get_similar_docs(df, cosine_sim, doc_id, top_n=10, query_index=None):
    """Return the rows of ``df`` that are most similar to one query document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose positional row order matches the columns of ``cosine_sim``.
    cosine_sim : array-like of shape (n_queries, n_docs)
        Similarity of each query document (rows) to every document (columns).
    doc_id : int
        Row of ``cosine_sim`` to rank. This is the position of the query inside
        the query subset, not its position inside ``df``.
    top_n : int, default 10
        Number of neighbours to return.
    query_index : int, optional
        Positional index of the query document inside ``df``. When given, that
        row is removed from the candidates by identity. When omitted, the
        single best-ranked hit is dropped instead, which assumes that hit is
        the query itself; that assumption does not hold for duplicated
        descriptions (ties at the top) or all-zero rows.

    Returns
    -------
    pandas.DataFrame
        At most ``top_n`` rows of ``df``, ordered by decreasing similarity.
    """
    # argsort is ascending, so reverse it to get the best matches first.
    ranked = np.argsort(cosine_sim[doc_id])[::-1]
    if query_index is None:
        # Legacy behaviour: skip rank 0, presumed to be the query document.
        neighbours = ranked[1 : top_n + 1]
    else:
        # Drop the query wherever it landed in the ranking, then truncate.
        neighbours = ranked[ranked != query_index][:top_n]
    return df.iloc[neighbours]

In [36]:
# Gather the 10 nearest neighbours of every naics 11 description, then stack
# the per-document frames into a single table.
similar_11 = [
    get_similar_docs(df, cosine_sim, row, top_n=10)
    for row in range(len(cosine_sim))
]

df_similar_11 = pd.concat(similar_11)

In [37]:
# Count how many of the retrieved neighbours carry each NAICS_2 code.
df_similar_11["NAICS_2"].value_counts()

NAICS_2
56    64
54    54
42    35
23    29
11    13
33     8
31     8
48     7
45     6
72     5
53     5
81     5
61     5
44     3
32     3
49     3
22     2
51     2
62     1
52     1
21     1
Name: count, dtype: int64

In [25]:
# Rank every description against row 1 of the naics 11 similarity matrix and
# keep the 11 best hits. Nothing is skipped here, so the query itself is
# normally part of the listing.
similar_docs = np.flip(np.argsort(cosine_sim[1]))[:11]

df.iloc[similar_docs]

Unnamed: 0,NAICS,BUSINESS_DESCRIPTION,NAICS_2,PREPROCESSED_DESCRIPTION
611,115310.0,Allison Tree Consulting directs and monitors w...,11,Allison Tree Consulting direct monitor work tr...
9084,541990.0,Samudaworth provides tree services such as tre...,54,Samudaworth provide tree service tree removal ...
12886,561730.0,"Tree planting, tree removal, tree support syst...",56,tree planting tree removal tree support system...
3709,561730.0,"Tree pruning, tree removal, stump grinding and...",56,tree pruning tree removal stump grinding tree ...
2239,561730.0,Full Service Company specializing in tree serv...,56,Service company specialize tree service landsc...
7913,561730.0,"Tree removal, tree pruning, stump grinding, pl...",56,tree removal tree pruning stump grinding plant...
7612,238990.0,NYC Tree Pit Services is a social enterprise d...,23,NYC Tree Pit Services social enterprise dedica...
6426,115310.0,Certified Arborists (Tree Consultants) on-site...,11,Certified Arborists Tree Consultants site dire...
13249,561730.0,Parshall Tree Service is a full service tree c...,56,Parshall Tree Service service tree company spe...
12829,561730.0,We are a full-service professional tree servic...,56,service professional tree service company fami...


Each document has a couple of keywords that carry most of the weight in its description. For example, for the naics 11 document shown above (row 1 of the similarity matrix), the word "tree" appears in all of the most similar documents.

The problem here is that not all businesses that are similar to each other are classified in the same category. For example, the descriptions in naics 11 (Agriculture, Forestry, Fishing and Hunting) are quite similar in general to the descriptions in naics 56 (Administrative and Support and Waste Management and Remediation Services). This shows that the descriptions in naics 56 are not very specific and can be applied to a wide range of businesses or that these businesses are wrongly classified.


Let's do the same for naics 54.


In [39]:
# Keep only the TF-IDF rows whose NAICS_2 code is 54.
X_54 = X[df["NAICS_2"].eq(54).to_numpy()]

In [40]:
# One row per naics 54 description, one column per description in the corpus.
cosine_sim_54 = cosine_similarity(X=X_54, Y=X)

We now use the top 5 instead of the top 10 most similar documents because naics 54 has many more instances.


In [42]:
# Gather the 5 nearest neighbours of every naics 54 description, then stack
# the per-document frames into a single table.
similar_54 = [
    get_similar_docs(df, cosine_sim_54, row, top_n=5)
    for row in range(len(cosine_sim_54))
]

df_similar_54 = pd.concat(similar_54)

In [43]:
# Count how many of the retrieved neighbours carry each NAICS_2 code.
df_similar_54["NAICS_2"].value_counts()

NAICS_2
54    14615
23     1522
56      880
61      790
51      545
81      396
71      384
53      375
62      327
42      238
52      195
33      151
32      124
48      114
45       75
44       68
92       60
72       47
31       42
49       28
55       13
22       12
11        5
21        4
Name: count, dtype: int64

Unlike the previous case, the descriptions in naics 54 (Professional, Scientific, and Technical Services) are, in most cases, most similar to other descriptions in the same naics. This suggests that the descriptions in naics 54 are more specific and apply to a narrower range of businesses.

The next most similar to this naics is naics 23 (Construction).


Now let's try with any naics that was wrongly classified in the baseline model, for example, naics 44.


In [47]:
# Keep only the TF-IDF rows whose NAICS_2 code is 44.
X_44 = X[df["NAICS_2"].eq(44).to_numpy()]

In [49]:
# One row per naics 44 description, one column per description in the corpus.
cosine_sim_44 = cosine_similarity(X=X_44, Y=X)

In [50]:
# Gather the 10 nearest neighbours of every naics 44 description, then stack
# the per-document frames into a single table.
similar_44 = [
    get_similar_docs(df, cosine_sim_44, row, top_n=10)
    for row in range(len(cosine_sim_44))
]

df_similar_44 = pd.concat(similar_44)

In [51]:
# Count how many of the retrieved neighbours carry each NAICS_2 code.
df_similar_44["NAICS_2"].value_counts()

NAICS_2
23    313
42    305
44    214
54    144
33    103
81     70
72     68
56     46
45     43
31     30
53     28
48     27
51     26
32     20
61     15
49     11
71      6
62      5
52      2
22      2
11      1
92      1
Name: count, dtype: int64

In this case, the descriptions in naics 44 (Retail Trade) are most similar to the descriptions in naics 23 (Construction) and naics 42 (Wholesale Trade), even more than to other naics 44 descriptions. It is strange that the descriptions in naics 44 are not similar to the descriptions in naics 45 (also Retail Trade), which belongs to the same sector as naics 44.


Finally, let's see what happens with naics 33.


In [52]:
# Keep only the TF-IDF rows whose NAICS_2 code is 33.
X_33 = X[df["NAICS_2"].eq(33).to_numpy()]

In [53]:
# One row per naics 33 description, one column per description in the corpus.
cosine_sim_33 = cosine_similarity(X=X_33, Y=X)

In [61]:
# Gather the 10 nearest neighbours of every naics 33 description, then stack
# the per-document frames into a single table.
similar_33 = [
    get_similar_docs(df, cosine_sim_33, row, top_n=10)
    for row in range(len(cosine_sim_33))
]

df_similar_33 = pd.concat(similar_33)

In [62]:
# Count how many of the retrieved neighbours carry each NAICS_2 code.
df_similar_33["NAICS_2"].value_counts()

NAICS_2
33    2297
23    1530
54     670
42     660
81     256
56     238
62     193
32     184
44     173
45     106
31      81
53      81
48      72
61      71
51      70
71      36
72      28
22      24
49      21
52      12
92       8
21       5
55       2
11       2
Name: count, dtype: int64

For naics 33 (Manufacturing), the descriptions are similar to the descriptions in naics 23 (Construction) primarily. This shows that the descriptions in naics 33 are not very specific and can be applied to a wide range of businesses or that these businesses are wrongly classified.

In addition, this naics belongs to the same sector as naics 31 and naics 32, but their descriptions do not seem to be very similar to each other.


NAICS_2
54    4202
23    2976
56    1159
61     751
33     682
42     650
62     517
81     488
51     380
72     333
53     331
71     300
48     270
32     241
45     240
31     184
52     162
44     148
49      45
22      32
92      30
11      26
55      15
21      13
Name: count, dtype: int64