In [2]:
import numpy as np
import pandas as pd

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
import tensorflow_hub as hub

In [11]:
use_model_version = '4'
module_url = "https://tfhub.dev/google/universal-sentence-encoder/" + use_model_version
get_use_embd = hub.load(module_url)

In [4]:
df = pd.read_csv("train_set.csv",encoding='latin-1')

In [5]:
df_test = pd.read_csv("test_set.csv")

In [6]:
df = df.sample(frac=1).reset_index(drop=True)

In [7]:
df.label.value_counts()

85389000    2936
85177090    2581
85369090    2438
39269099    2189
73181500    2033
85366990    1944
85238090    1720
85364900    1714
87089900    1673
33041000    1605
87082900    1451
84713010    1331
Name: label, dtype: int64

In [8]:
label_dict = {85389000:0,
85177090:1,
85369090:2,
39269099:3,
73181500:4,
85366990:5,
85238090:6,
85364900:7,
87089900:8,
33041000:9,
87082900:10,
84713010:11}

In [9]:
df['label'] = df['label'].map(label_dict)

In [10]:
df.head()

Unnamed: 0,label,text
0,3,frit adapter mm pk
1,4,screw scrmc bin mc x cr m car audio parts actu...
2,10,vd wc lining pillar c automotive parts for cap...
3,2,tkt st terminal connection and contact elem...
4,0,ra operating mechanism assembly t parts for ci...


In [12]:
df_use_embed = pd.DataFrame(columns=range(512))

In [13]:
# Vectorization sometimes gives resourceExhaustedError here due to high dimensionality(512) of USE features
for i in range(df.shape[0]):
    use_embeddings=get_use_embd(np.array([df.text[i]]))
    df_use_embed=df_use_embed.append(pd.Series(use_embeddings.numpy().ravel()),ignore_index=True)

In [14]:
df = pd.concat([df,df_use_embed],axis=1)

In [17]:
dftrain, dtest = train_test_split(df,train_size=0.85)

In [18]:
model = RandomForestClassifier()

In [19]:
model.fit(dftrain.iloc[:,2:],dftrain['label'])

RandomForestClassifier()

In [20]:
model.score(dtest.iloc[:,2:],dtest['label'])

0.9178662150719729

In [22]:
y_test_pred = model.predict(dtest.iloc[:,2:])

In [23]:
from sklearn.metrics import classification_report
classification_report(dtest['label'], y_test_pred, output_dict=True)

{'0': {'precision': 0.8154761904761905,
  'recall': 0.9405034324942791,
  'f1-score': 0.8735387885228479,
  'support': 437},
 '1': {'precision': 0.9667458432304038,
  'recall': 0.9760191846522782,
  'f1-score': 0.9713603818615751,
  'support': 417},
 '2': {'precision': 0.9,
  'recall': 0.8421052631578947,
  'f1-score': 0.8700906344410877,
  'support': 342},
 '3': {'precision': 0.7938144329896907,
  'recall': 0.8825214899713467,
  'f1-score': 0.8358208955223881,
  'support': 349},
 '4': {'precision': 0.9283154121863799,
  'recall': 0.8519736842105263,
  'f1-score': 0.8885077186963979,
  'support': 304},
 '5': {'precision': 0.9479553903345725,
  'recall': 0.8585858585858586,
  'f1-score': 0.9010600706713782,
  'support': 297},
 '6': {'precision': 1.0,
  'recall': 0.9844961240310077,
  'f1-score': 0.9921875,
  'support': 258},
 '7': {'precision': 0.9752066115702479,
  'recall': 0.9365079365079365,
  'f1-score': 0.9554655870445344,
  'support': 252},
 '8': {'precision': 0.9009009009009009,

In [24]:
print(classification_report(dtest['label'], y_test_pred))

              precision    recall  f1-score   support

           0       0.82      0.94      0.87       437
           1       0.97      0.98      0.97       417
           2       0.90      0.84      0.87       342
           3       0.79      0.88      0.84       349
           4       0.93      0.85      0.89       304
           5       0.95      0.86      0.90       297
           6       1.00      0.98      0.99       258
           7       0.98      0.94      0.96       252
           8       0.90      0.88      0.89       227
           9       1.00      0.99      1.00       243
          10       0.97      0.91      0.94       224
          11       0.98      0.98      0.98       193

    accuracy                           0.92      3543
   macro avg       0.93      0.92      0.92      3543
weighted avg       0.92      0.92      0.92      3543

