In [3]:
import pandas as pd
import numpy as np
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm

# from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import mean_absolute_error, mean_squared_error, median_absolute_error

from gensim.models import Word2Vec, KeyedVectors

In [4]:
data = pd.read_csv('CLEAN.csv',index_col=0)
# print(data['misarticulation_index'].values[:15])
data = data.loc[data['first_lang_english']==1]
data.drop(columns=['first_lang_english'],inplace=True)
data = data[['response_text','misarticulation_index']]

data['misarticulation_index'] = (data['misarticulation_index']/0.33333).astype(int).astype(float)
# print(data['misarticulation_index'].values[:15])

In [5]:
sentences = []
for text in tqdm(data['response_text']):
    t_tokens = nltk.word_tokenize(text)
    sentences.append(t_tokens)

100%|███████████████████████████████████████| 504/504 [00:00<00:00, 2110.31it/s]


In [6]:
model = Word2Vec(sentences=sentences, vector_size=300, window=5, min_count=1, workers=8)
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [7]:
# wv.key_to_index

In [8]:
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x7ff6a164f040>

In [9]:
text_embs = []
for text in tqdm(data['response_text']):
    emb = np.array(list(map(lambda x:wv[x], nltk.word_tokenize(text)))).mean(0)
    text_embs.append(emb)

100%|███████████████████████████████████████| 504/504 [00:00<00:00, 1686.43it/s]


In [52]:
X = np.array(text_embs)
y = data['misarticulation_index'].values

split_ = np.random.RandomState(seed=0).permutation(X.shape[0])
num_train = int(X.shape[0]*0.7)
X_train, X_test, y_train, y_test = X[:num_train,:], X[num_train:,:], y[:num_train], y[num_train:]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=0)

In [50]:
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html 参数可以自己调
regr = svm.SVR(kernel='linear', epsilon=1)
regr.fit(X_train, y_train)

SVR(epsilon=1, kernel='linear')

In [14]:
#default parameters
ms = [mean_absolute_error, mean_squared_error, median_absolute_error]
for m in ms:
    print(m, m(y_test/3., regr.predict(X_test)/3.))

<function mean_absolute_error at 0x0000015C48C6A438> 0.8796021936060312
<function mean_squared_error at 0x0000015C48C6A168> 1.1963322089436903
<function median_absolute_error at 0x0000015C48C645E8> 0.6676408580712971


In [56]:
#tuned model

ms = [mean_absolute_error, mean_squared_error, median_absolute_error]
for m in ms:
    print(m, m(y_test/3., regr.predict(X_test)/3.))


<function mean_absolute_error at 0x7ff6d9c49940> 0.8596460255896925
<function mean_squared_error at 0x7ff6d9c54310> 1.1724932169336217
<function median_absolute_error at 0x7ff6d9c54550> 0.666695209903245
