In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

Посчитаем сколько упоминаний каждого типа тональности содержит каждая категория. Для этого нам понадобится файл с размеченными упоминаниями категорий и тональностью каждого упоминания

In [5]:
def count_sentiments(df_asp, df_sent):
    text_ids = df_asp['text_id'].unique()
    frames = []
    for j in text_ids:
        neutral = {'Whole':0, 'Service':0, 'Food':0, 'Interior':0, 'Price':0}
        frequency = {'Whole':0, 'Service':0, 'Food':0, 'Interior':0, 'Price':0}
        both = {'Whole':0, 'Service':0, 'Food':0, 'Interior':0, 'Price':0}
        positive = {'Whole':0, 'Service':0, 'Food':0, 'Interior':0, 'Price':0}
        negative = {'Whole':0, 'Service':0, 'Food':0, 'Interior':0, 'Price':0}
        asp_slice = df_asp.loc[df_asp['text_id']==j]
        asp_slice = asp_slice.reset_index()
        for i in range(len(asp_slice)):
            if asp_slice['sentiment'][i] == 'neutral':
                neutral[asp_slice['category'][i]] +=1
            if asp_slice['sentiment'][i] == 'positive':
                positive[asp_slice['category'][i]] +=1
            if asp_slice['sentiment'][i] == 'negative':
                negative[asp_slice['category'][i]] +=1
            if asp_slice['sentiment'][i] == 'both':
                both[asp_slice['category'][i]] +=1
        sent_slice = df_sent.loc[df_sent['text_id']==j]
        sent_slice = sent_slice.reset_index()
        sent_slice['positive'] = 0
        sent_slice['negative'] = 0
        sent_slice['both'] = 0
        sent_slice['neutral'] = 0
        for i in range(5):
            sent_slice.at[i,'positive'] = positive[sent_slice['category'][i]]
            sent_slice.at[i, 'negative'] = negative[sent_slice['category'][i]]
            sent_slice.at[i, 'both'] = both[sent_slice['category'][i]]
            sent_slice.at[i, 'neutral'] = neutral[sent_slice['category'][i]]
        frames.append(sent_slice)
    return pd.concat(frames)

In [6]:
train_asp = pd.read_csv(
    'train_split_aspects.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
train_sent = pd.read_csv(
    'train_split_cats.txt',
    delimiter='\t',
    names=['text_id', 'category', 'sentiment'])
result = count_sentiments(train_asp, train_sent)

In [10]:
dev_cats = pd.read_csv('dev_cats.txt', delimiter='\t', names=['text_id', 'category', 'sentiment'])
d_asp = pd.read_csv('devpred_aspects_ours.txt',  delimiter='\t', names=['text_id', 'category', 'text', 'start', 'end', 'sentiment'])
dev_cats = count_sentiments(d_asp, dev_cats)

In [11]:
train = result
test = dev_cats
dev_cats

Unnamed: 0,index,text_id,category,sentiment,positive,negative,both,neutral
0,210,785,Food,both,1,1,0,0
1,211,785,Interior,positive,1,0,0,0
2,212,785,Price,negative,0,0,0,0
3,213,785,Whole,both,1,0,0,0
4,214,785,Service,positive,2,0,0,0
...,...,...,...,...,...,...,...,...
0,315,38299,Food,positive,8,0,0,0
1,316,38299,Interior,positive,3,0,0,0
2,317,38299,Price,absence,0,0,0,0
3,318,38299,Whole,positive,1,0,0,1


In [12]:
result

Unnamed: 0,index,text_id,category,sentiment,positive,negative,both,neutral
0,0,30808,Food,positive,2,0,0,0
1,1,30808,Interior,positive,1,0,0,1
2,2,30808,Price,positive,1,0,0,0
3,3,30808,Whole,positive,2,0,0,1
4,4,30808,Service,positive,4,0,0,0
...,...,...,...,...,...,...,...,...
0,1060,16630,Food,positive,1,0,0,0
1,1061,16630,Interior,positive,1,0,0,0
2,1062,16630,Price,absence,0,0,0,0
3,1063,16630,Whole,positive,2,0,0,0


In [13]:
le = preprocessing.LabelEncoder()
y_train = le.fit_transform(train['sentiment'])

In [14]:
features_columns = ['positive', 'negative', 'both', 'neutral']

Будем использовать гридсерч на SVM, Decision Tree и KNN классификаторах

In [22]:
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import LinearSVC
clf = svm.SVC()
parameters_svm = {'kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
                  'gamma': ('scale', 'auto'),
                  'shrinking': (True, False),
               'class_weight': ('balanced', None),
                 'decision_function_shape': ('ovo', 'ovr')}


grid_search_SVM = GridSearchCV(clf, parameters_svm, n_jobs=-1, scoring = "f1_weighted")
grid_search_SVM = grid_search_SVM.fit(train[features_columns], y_train)


print(grid_search_SVM.best_score_)
print(grid_search_SVM.best_params_)

0.8134237550305681
{'class_weight': 'balanced', 'decision_function_shape': 'ovo', 'gamma': 'scale', 'kernel': 'linear', 'shrinking': False}


In [23]:
y_test = le.transform(test['sentiment'])
y_pred = grid_search_SVM.predict(test[features_columns])

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7070422535211267

In [25]:
from sklearn.tree import DecisionTreeClassifier
param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [1, .1, .01, 0.12, 0.15, 0.17, 0.2, .001],
              'max_depth' : [3, 5, 6, 7, 8, 9, 10],
              'criterion' :['gini', 'entropy']
             }
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search_DTC = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=5, verbose=True)
grid_search_DTC.fit(train[features_columns], y_train)
print(grid_search_DTC.best_params_)
print(grid_search_DTC.best_score_)

Fitting 5 folds for each of 336 candidates, totalling 1680 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


{'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 9, 'max_features': 'auto'}
0.8178403755868546


[Parallel(n_jobs=1)]: Done 1680 out of 1680 | elapsed:    1.3s finished


In [26]:
y_pred = grid_search_DTC.predict(test[features_columns])
accuracy_score(y_test, y_pred)

0.7098591549295775

In [27]:
from sklearn.neighbors import KNeighborsClassifier
estimator_KNN = KNeighborsClassifier(algorithm='auto')
parameters_KNN = {
    'n_neighbors': (1,10, 1),
    'leaf_size': (20,40,1),
    'p': (1,2),
    'weights': ('uniform', 'distance'),
    'metric': ('minkowski', 'chebyshev')}

grid_search_KNN = GridSearchCV(
    estimator=estimator_KNN,
    param_grid=parameters_KNN,
    scoring = 'accuracy',
    n_jobs = -1,
    cv = 5)

grid_search_KNN.fit(train[features_columns], y_train)

print(grid_search_KNN.best_params_)
print(grid_search_KNN.best_score_)

{'leaf_size': 20, 'metric': 'minkowski', 'n_neighbors': 10, 'p': 2, 'weights': 'uniform'}
0.8131455399061032


In [28]:
y_pred = grid_search_KNN.predict(test[features_columns])
accuracy_score(y_test, y_pred)

0.7323943661971831

Лучшие результаты у KNN

In [32]:
model = grid_search_KNN.best_estimator_

In [33]:
model.fit(train[features_columns], y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=20, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

In [34]:
y_pred = model.predict(test[features_columns])
accuracy_score(y_test, y_pred)

0.7323943661971831

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['absence', 'both', 'negative', 'neutral', 'positive']))

              precision    recall  f1-score   support

     absence       0.72      0.93      0.81        59
        both       0.33      0.18      0.24        38
    negative       0.67      0.06      0.10        36
     neutral       0.00      0.00      0.00         8
    positive       0.77      0.92      0.84       214

    accuracy                           0.73       355
   macro avg       0.50      0.42      0.40       355
weighted avg       0.69      0.73      0.68       355



  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(y_test, y_pred, labels=[0,1,2,3,4]))

[[[275  21]
  [  4  55]]

 [[303  14]
  [ 31   7]]

 [[318   1]
  [ 34   2]]

 [[347   0]
  [  8   0]]

 [[ 82  59]
  [ 18 196]]]


In [41]:
mark_asp = pd.read_csv(
    'dev_aspects_sentiment_pred.txt', 
    delimiter='\t', 
    names=['text_id', 'category', 'mention', 'start', 'end', 'sentiment']
)
mark_cats = pd.read_csv('dev_cats.txt', delimiter='\t', names=['text_id', 'category', 'sentiment'])
makring = count_sentiments(mark_asp, mark_cats)

In [42]:
mark_asp['sentiment'].value_counts()

positive    1005
neutral       89
negative      43
Name: sentiment, dtype: int64

In [43]:
y_pred = model.predict(makring[features_columns])

In [44]:
def get_markings(pred_df, y_pred):
    in_df = pred_df
    y_pred = le.inverse_transform(y_pred)
    pred_df['sentiment'] = 0
    pred_df['sentiment'] = y_pred
    pred_df.drop(['positive', 'negative', 'both', 'neutral', 'index'], axis=1, inplace=True)
    return pred_df

In [45]:
output = get_markings(makring, y_pred)

In [46]:
output

Unnamed: 0,text_id,category,sentiment
0,785,Food,both
1,785,Interior,positive
2,785,Price,absence
3,785,Whole,positive
4,785,Service,positive
...,...,...,...
0,38299,Food,positive
1,38299,Interior,positive
2,38299,Price,absence
3,38299,Whole,positive


In [47]:
output.to_csv('dev_cats_output.txt', sep='\t', header=False, index=False)