In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

import string

from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize  

import pickle

Функция предварительной обработки текста:

In [2]:
def preprocess(doc):               
    if type(doc) is str:
        return doc.lower().translate(string.maketrans("",""), string.punctuation)    
    else:
        return doc.lower().translate({ord(c): None for c in string.punctuation})    

Загрузка обучающих данных:

In [4]:
train_data = pd.read_csv('data/products_sentiment_train.tsv', sep='\t', header=None, names=['text', 'label'])
train_data.text = train_data.text.apply(lambda s: preprocess(s))
train_data.head()

Unnamed: 0,text,label
0,2 take around 10000 640x480 pictures,1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail and it worked ok...,1


Класс для выполнения лемматизации:

In [5]:
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

Строим массив признаков, который будет включать частоты слов, биграмм и триграмм:

In [7]:
count_vectorizer = CountVectorizer(ngram_range=(1,3), tokenizer=LemmaTokenizer())
X_train = count_vectorizer.fit_transform(train_data.text)
y_train = train_data.label.values

Обучаем логистическую регрессию с поправкой на несбалансированные классы:

In [8]:
clf = LogisticRegression(class_weight='balanced', C=5)
clf.fit(X_train, y_train)

LogisticRegression(C=5, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [10]:
test = pd.read_csv('data/products_sentiment_test.tsv', delimiter='\t', index_col=None)

In [13]:
predct = clf.predict(test)
result = {}
for indx, elm in enumerate(predct):
    result[indx] = elm
    
subm = pd.Series(result, name='y')
subm.index.name = 'Id'
subm = subm.reset_index()

ValueError: could not convert string to float: everything shines of quality .

Сохраняем модель в файл:

In [9]:
pickle.dump(clf, open('fitted_logregression.sav', 'wb'))
pickle.dump(count_vectorizer, open('fitted_count_vectorizer.sav', 'wb'))

Восстановить можно так:


In [9]:
loaded_model = pickle.load(open('fitted_logregression.sav', 'rb'))
loaded_model.score(X_train, y_train)

1.0