In [10]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

import csv

import numpy as np 
import pandas as pd 

In [2]:
#If you need to download the dataset

import opendatasets as od
od.download("https://www.kaggle.com/datasets/weipengfei/ohr8r52")

Skipping, found downloaded files in ".\ohr8r52" (use force=True to force download)


In [3]:
import os
for dirname, _, filenames in os.walk('./ohr8r52'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./ohr8r52\oh\oh-dev-stemmed.csv
./ohr8r52\oh\oh-test-stemmed.csv
./ohr8r52\oh\oh-train-stemmed.csv
./ohr8r52\r52\r52-dev-stemmed.csv
./ohr8r52\r52\r52-test-stemmed.csv
./ohr8r52\r52\r52-train-stemmed.csv
./ohr8r52\r8\r8-dev-stemmed.csv
./ohr8r52\r8\r8-test-stemmed.csv
./ohr8r52\r8\r8-train-stemmed.csv


In [4]:
train_r8 = pd.read_csv('./ohr8r52/r8/r8-train-stemmed.csv')
test_r8 = pd.read_csv('./ohr8r52/r8/r8-test-stemmed.csv')
train_r8.head()

Unnamed: 0,text,edge,intent
0,champion product approv stock split champion p...,champion product approv stock split champion p...,earn
1,comput termin system cpml complet sale comput ...,comput termin system cpml complet sale comput ...,acq
2,cobanco inc cbco year net shr ct dlr net asset...,cobanco inc cbco year net shr ct dlr net asset...,earn
3,intern inc qtr jan oper shr loss two ct profit...,intern inc qtr jan oper shr loss two ct profit...,earn
4,brown forman inc bfd qtr net shr dlr ct net ml...,brown forman inc bfd qtr net shr dlr ct net ml...,earn


In [5]:
class GloveVectorizer:
  def __init__(self):
    # load in pre-trained word vectors
    print('Loading word vectors...')
    word2vec = {}
    embedding = []
    idx2word = []
    with open('glove.6B.50d.txt',encoding="utf8") as f:
      # is just a space-separated text file in the format:
      # word vec[0] vec[1] vec[2] ...
      for line in f:
        values = line.split()
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec
        embedding.append(vec)
        idx2word.append(word)
    print('Found %s word vectors.' % len(word2vec))

    self.word2vec = word2vec
    self.embedding = np.array(embedding)
    self.word2idx = {v:k for k,v in enumerate(idx2word)}
    self.V, self.D = self.embedding.shape

  def fit(self, data):
    pass

  def transform(self, data):
    X = np.zeros((len(data), self.D))
    n = 0
    emptycount = 0
    for sentence in data:
      tokens = sentence.lower().split()
      vecs = []
      for word in tokens:
        if word in self.word2vec:
          vec = self.word2vec[word]
          vecs.append(vec)
      if len(vecs) > 0:
        vecs = np.array(vecs)
        X[n] = vecs.mean(axis=0)
      else:
        emptycount += 1
      n += 1
    print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
    return X

  def fit_transform(self, data):
    self.fit(data)
    return self.transform(data)

In [6]:
vectorizer = GloveVectorizer()
X_train = vectorizer.fit_transform(train_r8.text)
Y_train = train_r8.intent

X_test = vectorizer.transform(test_r8.text)
Y_test = test_r8.intent

Loading word vectors...
Found 400000 word vectors.
Numer of samples with no words found: 0 / 4937
Numer of samples with no words found: 0 / 2189


In [11]:
text_clf = LinearSVC()
text_clf.fit(X_train, Y_train)

In [12]:
predicted = text_clf.predict(X_test)
print(metrics.classification_report(Y_test, predicted))

              precision    recall  f1-score   support

         acq       0.92      0.94      0.93       696
       crude       0.91      0.88      0.90       121
        earn       0.96      0.98      0.97      1083
       grain       1.00      0.60      0.75        10
    interest       0.78      0.83      0.80        81
    money-fx       0.76      0.47      0.58        87
        ship       0.78      0.69      0.74        36
       trade       0.79      0.91      0.84        75

    accuracy                           0.92      2189
   macro avg       0.86      0.79      0.81      2189
weighted avg       0.92      0.92      0.92      2189



In [14]:
#USING OH

train_r8 = pd.read_csv('./ohr8r52/oh/oh-train-stemmed.csv')
test_r8 = pd.read_csv('./ohr8r52/oh/oh-test-stemmed.csv')
train_r8.head()

#Vectorize the new dataset entries
X_train = vectorizer.fit_transform(train_r8.text)
Y_train = train_r8.intent
X_test = vectorizer.transform(test_r8.text)
Y_test = test_r8.intent

#Compute a new model with the new dataset
text_clf = LinearSVC()
text_clf.fit(X_train, Y_train)

#Predict
predicted = text_clf.predict(X_test)
print(metrics.classification_report(Y_test, predicted))

Numer of samples with no words found: 0 / 3021
Numer of samples with no words found: 0 / 4043
              precision    recall  f1-score   support

         C01       0.38      0.35      0.36       102
         C02       0.75      0.06      0.11        50
         C03       0.69      0.31      0.43        29
         C04       0.52      0.78      0.62       600
         C05       0.51      0.26      0.35       140
         C06       0.34      0.29      0.31       178
         C07       0.75      0.09      0.16        34
         C08       0.44      0.06      0.11       129
         C09       0.40      0.07      0.12        28
         C10       0.44      0.37      0.41       342
         C11       0.50      0.22      0.31        76
         C12       0.24      0.18      0.20       187
         C13       0.49      0.38      0.43       103
         C14       0.50      0.85      0.63       590
         C15       0.33      0.03      0.05        79
         C16       0.00      0.00      0.

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
#USING r52

train_r8 = pd.read_csv('./ohr8r52/r52/r52-train-stemmed.csv')
test_r8 = pd.read_csv('./ohr8r52/r52/r52-test-stemmed.csv')
train_r8.head()

#Vectorize the new dataset entries
X_train = vectorizer.fit_transform(train_r8.text)
Y_train = train_r8.intent
X_test = vectorizer.transform(test_r8.text)
Y_test = test_r8.intent

#Compute a new model with the new dataset
text_clf = LinearSVC()
text_clf.fit(X_train, Y_train)

#Predict
predicted = text_clf.predict(X_test)
print(metrics.classification_report(Y_test, predicted))

Numer of samples with no words found: 0 / 5879
Numer of samples with no words found: 0 / 2568
                 precision    recall  f1-score   support

            acq       0.86      0.95      0.90       696
           alum       0.67      0.32      0.43        19
            bop       0.50      0.22      0.31         9
        carcass       1.00      0.20      0.33         5
          cocoa       0.82      0.60      0.69        15
         coffee       0.64      0.73      0.68        22
         copper       0.60      0.23      0.33        13
         cotton       0.88      0.78      0.82         9
            cpi       0.50      0.65      0.56        17
            cpu       0.00      0.00      0.00         1
          crude       0.74      0.88      0.80       121
            dlr       0.00      0.00      0.00         3
           earn       0.94      0.98      0.96      1083
           fuel       1.00      0.29      0.44         7
            gas       1.00      0.12      0.22    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
