In [75]:
import multiprocessing
import itertools
import warnings
from tqdm import tqdm_notebook, tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
import string
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
import pymorphy2
import operator
pd.set_option('max_colwidth', 999)
max_cpu_count = multiprocessing.cpu_count()
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Boris
[nltk_data]     Feldman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [42]:
def drop_punctuation(row):
    row['speech'] = row['speech'].translate(str.maketrans('', '', string.punctuation))
    return row

def drop_stop_words(row):
    tokens = row['speech'].split()
    dropped_tokens = [word for word in tokens if word.lower() not in stop_words]
    row['speech'] = ' '.join(dropped_tokens)
    return row

def labels_transform(row):
    if row['evaluation'] == '+':
        row['evaluation'] = 1
    elif row['evaluation'] == '-':
        row['evaluation'] = 2
    else:
        row['evaluation'] = 3
    return row

def not_join(row):
    row['speech'] = row['speech'].replace('не ', 'не')
    row['speech'] = row['speech'].replace('Не ', 'Не')
    return row

morph = pymorphy2.MorphAnalyzer()
def normalization(row):
    tokens = row['speech'].split()
    new_speech = []
    for token in tokens:
        try:
            new_speech.append(morph.parse(token)[0].normal_form)
        except:
            new_speech.append(token) 
    row['speech'] = ' '.join(new_speech)
    return row
    


stop_words = set(stopwords.words('russian'))

In [98]:
train = pd.read_excel('train.xlsx')[['speech', 'evaluation']]
train = train.loc[(train['evaluation'] == '+') | (train['evaluation'] == '-') | (train['evaluation'] == '0')]

test = pd.read_excel('test.xlsx')[['speech', 'evaluation']]
test = test.loc[(test['evaluation'] == '+') | (test['evaluation'] == '-') | (test['evaluation'] == '0')]

normalized_train = train.apply(normalization, axis=1)
normalized_test = test.apply(normalization, axis=1)

train = pd.read_excel('train.xlsx')[['speech', 'evaluation']]
train = train.loc[(train['evaluation'] == '+') | (train['evaluation'] == '-') | (train['evaluation'] == '0')]

test = pd.read_excel('test.xlsx')[['speech', 'evaluation']]
test = test.loc[(test['evaluation'] == '+') | (test['evaluation'] == '-') | (test['evaluation'] == '0')]

In [5]:
# data = pd.concat((train, test))
# data_size = len(data)
# split_coef = int(data_size * 0.8)
# train = data.iloc[:split_coef]
# test = data.iloc[split_coef:]

In [102]:
class Model:
    def __init__(self, train, test, 
                 drop_punctuation_=True, 
                 drop_stop_words_=True,
                 not_join_=True,
                 data_model_type=None, 
                 classifier='Naive'):
        self.train = train
        self.test = test     
        if drop_punctuation_:
            self.train = self.train.apply(drop_punctuation, axis=1)
            self.test = self.test.apply(drop_punctuation, axis=1) 
#         if normalization_:
#             self.train = self.train.apply(normalization, axis=1)
#             self.test = self.test.apply(normalization, axis=1)
        if not_join_:
            self.train = self.train.apply(not_join, axis=1)
            self.test = self.test.apply(not_join, axis=1)
        if drop_stop_words_:
            self.train = self.train.apply(drop_stop_words, axis=1)        
            self.test = self.test.apply(drop_stop_words, axis=1)
        self.classifier = classifier
        if data_model_type is None or classifier is None:
            raise ValueError('None prameters are forbidden')       
        self.data_model_type = data_model_type
        self.data_model = None
        self.model = None
        
    def data_model_processing(self):
        self.train_y = np.array(self.train.apply(labels_transform, axis=1)['evaluation'])
        self.test_y = np.array(self.test.apply(labels_transform, axis=1)['evaluation'])
        if self.data_model_type in ['cv', 'tfidf', 'boolean']:
            CV = CountVectorizer()
            self.train_X = CV.fit_transform(self.train['speech'])
            self.test_X = CV.transform(self.test['speech'])
            if self.data_model_type == 'boolean':
                self.train_X = csr_matrix(np.sign(self.train_X.toarray()))
                self.test_X = csr_matrix(np.sign(self.test_X.toarray()))
                
            if self.data_model_type == 'tfidf':
                tfidf = TfidfTransformer()
                self.train_X = tfidf.fit_transform(self.train_X)
                self.test_X = tfidf.transform(self.test_X)
    def fit_model(self):
        if self.classifier == 'Naive':
            parameters = {
                'alpha' : np.arange(0.1, 1.1, 0.1),
                'fit_prior' : [True, False]
            }
            
            self.nai_clf = MultinomialNB()
            self.model = GridSearchCV(self.nai_clf, parameters, cv=2, verbose=0, n_jobs=max_cpu_count)
            self.model.fit(self.train_X, self.train_y)

        if self.classifier =='SVM':
            parameters = {
                'kernel' : ('linear', 'rbf'), 
                'C' : [1, 10],
                'gamma' : [0.1, 0.4]
            }
            
            self.svm_clf = SVC()
            self.model = GridSearchCV(self.svm_clf, parameters, cv=2, verbose=0, n_jobs=max_cpu_count)
            
            self.model.fit(self.train_X, self.train_y)
                  
    def predict_model(self):
        self.classification_results = {}
        self.classification_results['prediction'] = self.model.predict(self.test_X)
        if self.classifier == 'Naive':
            self.classification_results['naive_best_model'] = self.model.best_params_
        if self.classifier == 'SVM':
            self.classification_results['svm_best_model'] = self.model.best_params_
     
    def prediction(self):
        return self.classification_results['prediction']
    
    def classification_summary(self):
        target_names = ['positive', 'negative', 'neutral']
        d = classification_report(self.test_y, self.prediction(), target_names=target_names, output_dict=True)
        s = classification_report(self.test_y, self.prediction(), target_names=target_names, output_dict=False)
        return d, s


In [103]:
class Executor:
    def __init__(self):
        self.drop_stop_words = [True, False]
        self.not_join = [True, False]
        self.data_model_type = ['cv', 'tfidf', 'boolean']
        self.classifier = ['Naive', 'SVM']
        self.normalization = [True, False]
        self.product = []
        for element in itertools.product(self.drop_stop_words, 
                                         self.data_model_type, 
                                         self.not_join, 
                                         self.classifier, 
                                         self.normalization):
            self.product.append(element)
        self.report = pd.DataFrame(columns=['parameters', 'accuracy', 'summary', 'model_details', 'dict_summary'])
    def create_models(self):
        print('Available model parameters:')
        print('    drop_stop_words with values: {}'.format(self.drop_stop_words))
        print('    not_join with values: {}'.format(self.not_join))
        print('    data_model_type with values: {}'.format(self.data_model_type))
        print('    classifier with values: {}'.format(self.classifier))
        print('    normalization with values: {}'.format(self.normalization))
        print('creating models...')
        self.models = []
        for param_set in tqdm_notebook(self.product):
            if param_set[4]:
                self.models.append(Model(normalized_train, normalized_test, 
                                     drop_punctuation_=True, 
                                     drop_stop_words_=param_set[0],
                                     data_model_type=param_set[1],
                                     not_join_=param_set[2],
                                     classifier=param_set[3]))
            else:
                self.models.append(Model(train, test, 
                                     drop_punctuation_=True, 
                                     drop_stop_words_=param_set[0],
                                     data_model_type=param_set[1],
                                     not_join_=param_set[2],
                                     classifier=param_set[3]))
        print('{} models created'.format(len(self.product)))
    def fit_models(self):
        print('fitting models...')
        for i, model in tqdm_notebook(enumerate(self.models)):
            model.data_model_processing()
            model.fit_model()
            model.predict_model()
            
            acc = accuracy_score(model.prediction(), model.test_y)
            to_insert = pd.Series(index=self.report.columns)
            to_insert['parameters'] = self.product[i]
            to_insert['accuracy'] = acc
            to_insert['summary'] = model.classification_summary()[1]
            to_insert['dict_summary'] = model.classification_summary()[0]
            to_insert['model_details'] = '-'
            if self.product[i][3] == 'Naive':
                to_insert['model_details'] = model.classification_results['naive_best_model']
            if self.product[i][3] == 'SVM':
                to_insert['model_details'] = model.classification_results['svm_best_model']
            self.report = self.report.append(to_insert, ignore_index=True)
        self.report = self.report.sort_values(by=['accuracy'], ascending=False)
        
    def summary(self):
        return self.report

In [104]:
e = Executor()
e.create_models()
e.fit_models()

Available model parameters:
    drop_stop_words with values: [True, False]
    not_join with values: [True, False]
    data_model_type with values: ['cv', 'tfidf', 'boolean']
    classifier with values: ['Naive', 'SVM']
    normalization with values: [True, False]
creating models...


HBox(children=(IntProgress(value=0, max=48), HTML(value='')))

48 models created
fitting models...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [112]:
e.summary()[['parameters', 'accuracy', 'model_details']]

Unnamed: 0,parameters,accuracy,model_details
28,"(False, cv, False, Naive, True)",0.624535,"{'alpha': 0.9, 'fit_prior': False}"
44,"(False, boolean, False, Naive, True)",0.623005,"{'alpha': 0.7000000000000001, 'fit_prior': False}"
40,"(False, boolean, True, Naive, True)",0.622786,"{'alpha': 1.0, 'fit_prior': False}"
36,"(False, tfidf, False, Naive, True)",0.621474,"{'alpha': 0.30000000000000004, 'fit_prior': False}"
24,"(False, cv, True, Naive, True)",0.619943,"{'alpha': 0.7000000000000001, 'fit_prior': True}"
32,"(False, tfidf, True, Naive, True)",0.618412,"{'alpha': 0.4, 'fit_prior': False}"
16,"(True, boolean, True, Naive, True)",0.617975,"{'alpha': 1.0, 'fit_prior': True}"
0,"(True, cv, True, Naive, True)",0.616007,"{'alpha': 0.8, 'fit_prior': True}"
4,"(True, cv, False, Naive, True)",0.61382,"{'alpha': 1.0, 'fit_prior': False}"
20,"(True, boolean, False, Naive, True)",0.613383,"{'alpha': 0.9, 'fit_prior': True}"


In [110]:
f1_sort = {}
for row in e.summary().values:
    f1_sort[row[0]] =  row[4]['micro avg']['f1-score']
f1_sort = sorted(f1_sort.items(), key=operator.itemgetter(1), reverse=True)
f1_sort[:]

[((False, 'cv', False, 'Naive', True), 0.6245353159851301),
 ((False, 'boolean', False, 'Naive', True), 0.6230045921714411),
 ((False, 'boolean', True, 'Naive', True), 0.622785917340914),
 ((False, 'tfidf', False, 'Naive', True), 0.621473868357752),
 ((False, 'cv', True, 'Naive', True), 0.619943144544063),
 ((False, 'tfidf', True, 'Naive', True), 0.618412420730374),
 ((True, 'boolean', True, 'Naive', True), 0.6179750710693199),
 ((True, 'cv', True, 'Naive', True), 0.6160069975945769),
 ((True, 'cv', False, 'Naive', True), 0.6138202492893068),
 ((True, 'boolean', False, 'Naive', True), 0.6133828996282528),
 ((True, 'tfidf', True, 'Naive', True), 0.6094467526787667),
 ((False, 'tfidf', False, 'SVM', True), 0.6074786792040237),
 ((True, 'tfidf', False, 'Naive', True), 0.6063853050513885),
 ((False, 'tfidf', True, 'SVM', True), 0.6022304832713755),
 ((True, 'tfidf', True, 'SVM', True), 0.6002624097966324),
 ((False, 'tfidf', True, 'Naive', False), 0.5974196369997813),
 ((False, 'cv', False

In [111]:
f1_sort = {}
for row in e.summary().values:
    f1_sort[row[0]] =  row[4]['macro avg']['f1-score']
f1_sort = sorted(f1_sort.items(), key=operator.itemgetter(1), reverse=True)
f1_sort[:]



[((False, 'boolean', False, 'Naive', True), 0.5842290012957877),
 ((False, 'cv', False, 'Naive', True), 0.5770866681368355),
 ((False, 'tfidf', False, 'Naive', True), 0.5766449234169919),
 ((True, 'cv', False, 'Naive', True), 0.574294435231017),
 ((False, 'cv', True, 'Naive', True), 0.5736930222044978),
 ((True, 'cv', True, 'Naive', True), 0.573190571653205),
 ((False, 'boolean', True, 'Naive', True), 0.571773242029494),
 ((True, 'boolean', False, 'Naive', True), 0.565955389195074),
 ((True, 'boolean', True, 'Naive', True), 0.5646032912711813),
 ((False, 'tfidf', True, 'Naive', True), 0.5633633344104384),
 ((False, 'tfidf', False, 'SVM', True), 0.5611761231339824),
 ((False, 'tfidf', True, 'SVM', True), 0.5611619426162013),
 ((True, 'tfidf', True, 'SVM', True), 0.5599068525208737),
 ((True, 'tfidf', True, 'Naive', True), 0.5581977599277891),
 ((True, 'tfidf', False, 'SVM', True), 0.5555112984930588),
 ((True, 'boolean', True, 'Naive', False), 0.5498701622770469),
 ((True, 'tfidf', Fals

In [61]:
print(e.summary().iloc[0]['summary'])

              precision    recall  f1-score   support

    positive       0.59      0.70      0.64      1448
    negative       0.66      0.80      0.72      1890
     neutral       0.59      0.27      0.37      1235

   micro avg       0.62      0.62      0.62      4573
   macro avg       0.61      0.59      0.58      4573
weighted avg       0.62      0.62      0.60      4573

