In [42]:
import multiprocessing
import itertools
import warnings
from tqdm import tqdm_notebook, tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
import string
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix

pd.set_option('max_colwidth', 999)
max_cpu_count = multiprocessing.cpu_count()
warnings.filterwarnings('ignore')

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Владимир\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def drop_punctuation(row):
    row['speech'] = row['speech'].translate(str.maketrans('', '', string.punctuation))
    return row

def drop_stop_words(row):
    tokens = row['speech'].split()
    dropped_tokens = [word for word in tokens if word.lower() not in stop_words]
    row['speech'] = ' '.join(dropped_tokens)
    return row

def labels_transform(row):
    if row['evaluation'] == '+':
        row['evaluation'] = 1
    elif row['evaluation'] == '-':
        row['evaluation'] = 2
    else:
        row['evaluation'] = 3
    return row

def not_join(row):
    row['speech'] = row['speech'].replace('не ', 'не')
    row['speech'] = row['speech'].replace('Не ', 'Не')
    return row
    


stop_words = set(stopwords.words('russian'))

In [37]:
train = pd.read_excel('train.xlsx')[['speech', 'evaluation']]
train = train.loc[(train['evaluation'] == '+') | (train['evaluation'] == '-') | (train['evaluation'] == '0')]

test = pd.read_excel('test.xlsx')[['speech', 'evaluation']]
test = test.loc[(test['evaluation'] == '+') | (test['evaluation'] == '-') | (test['evaluation'] == '0')]

In [38]:
data = pd.concat((train, test))
data_size = len(data)
split_coef = int(data_size * 0.8)
train = data.iloc[:split_coef]
test = data.iloc[split_coef:]

In [54]:
class Model:
    def __init__(self, train, test, 
                 drop_punctuation_=True, 
                 drop_stop_words_=True,
                 not_join_=True,
                 data_model_type=None, 
                 classifier='Naive'):
        self.train = train
        self.test = test     
        if drop_punctuation_:
            self.train = self.train.apply(drop_punctuation, axis=1)
            self.test = self.test.apply(drop_punctuation, axis=1)       
        if drop_stop_words_:
            self.train = self.train.apply(drop_stop_words, axis=1)        
            self.test = self.test.apply(drop_stop_words, axis=1)  
        if not_join_:
            self.train = self.train.apply(not_join, axis=1)
            self.test = self.test.apply(not_join, axis=1)
        self.classifier = classifier
        if data_model_type is None or classifier is None:
            raise ValueError('None prameters are forbidden')       
        self.data_model_type = data_model_type
        self.data_model = None
        self.model = None
        
    def data_model_processing(self):
        self.train_y = np.array(self.train.apply(labels_transform, axis=1)['evaluation'])
        self.test_y = np.array(self.test.apply(labels_transform, axis=1)['evaluation'])
        if self.data_model_type in ['cv', 'tfidf', 'boolean']:
            CV = CountVectorizer()
            self.train_X = CV.fit_transform(self.train['speech'])
            self.test_X = CV.transform(self.test['speech'])
            if self.data_model_type == 'boolean':
                self.train_X = csr_matrix(np.sign(self.train_X.toarray()))
                self.test_X = csr_matrix(np.sign(self.test_X.toarray()))
                
            if self.data_model_type == 'tfidf':
                tfidf = TfidfTransformer()
                self.train_X = tfidf.fit_transform(self.train_X)
                self.test_X = tfidf.transform(self.test_X)
    def fit_model(self):
        if self.classifier == 'Naive':
            self.model = MultinomialNB()
            self.model.fit(self.train_X, self.train_y)

        if self.classifier =='SVM':
#             parameters = {
#                 'kernel' : ('linear', 'rbf', 'poly'), 
#                 'C' : [1, 10],
#                 'gamma' : np.arange(0.1, 1, 0.1),
#                 'degree' : [2, 3, 4]
#             }
            parameters = {
                'kernel' : ('linear', 'rbf'), 
                'C' : [1, 10],
                'gamma' : [0.1, 0.4]
            }
            
            self.svm_clf = SVC()
            self.model = GridSearchCV(self.svm_clf, parameters, cv=None, verbose=1, n_jobs=max_cpu_count)
            
            self.model.fit(self.train_X, self.train_y)
                  
    def predict_model(self):
        self.classification_results = {}
        self.classification_results['prediction'] = self.model.predict(self.test_X)
        if self.classifier == 'SVM':
            self.classification_results['svm_best_model'] = self.model.best_params_
     
    def prediction(self):
        return self.classification_results['prediction']
    
    def classification_summary(self):
        target_names = ['positive', 'negative', 'neutral']
        d = classification_report(self.test_y, self.prediction(), target_names=target_names, output_dict=True)
        s = classification_report(self.test_y, self.prediction(), target_names=target_names, output_dict=False)
        return d, s


In [55]:
class Executor:
    def __init__(self):
        self.drop_stop_words = [True, False]
        self.not_join = [True, False]
        self.data_model_type = ['cv', 'tfidf', 'boolean']
        self.classifier = ['Naive', 'SVM']
        self.product = []
        for element in itertools.product(self.drop_stop_words, self.data_model_type, self.not_join, self.classifier):
            self.product.append(element)
        self.report = pd.DataFrame(columns=['parameters', 'accuracy', 'summary', 'model_details'])
    def create_models(self):
        print('Available model parameters:')
        print('    drop_stop_words with values: {}'.format(self.drop_stop_words))
        print('    not_join with values: {}'.format(self.not_join))
        print('    data_model_type with values: {}'.format(self.data_model_type))
        print('    classifier with values: {}'.format(self.classifier))
        print('creating models...')
        self.models = []
        for param_set in tqdm_notebook(self.product):
            self.models.append(Model(train, test, 
                                     drop_punctuation_=True, 
                                     drop_stop_words_=param_set[0],
                                     data_model_type=param_set[1],
                                     not_join_=param_set[2],
                                     classifier=param_set[3]))
        print('{} models created'.format(len(self.product)))
    def fit_models(self):
        print('fitting models...')
        for i, model in tqdm_notebook(enumerate(self.models)):
            model.data_model_processing()
            model.fit_model()
            model.predict_model()
            
            acc = accuracy_score(model.prediction(), model.test_y)
            to_insert = pd.Series(index=self.report.columns)
            to_insert['parameters'] = self.product[i]
            to_insert['accuracy'] = acc
            to_insert['summary'] = model.classification_summary()[1]
            to_insert['model_details'] = '-'
            
            if self.product[i][3] == 'SVM':
                to_insert['model_details'] = model.classification_results['svm_best_model']
            self.report = self.report.append(to_insert, ignore_index=True)
        self.report = self.report.sort_values(by=['accuracy'], ascending=False)
        
    def summary(self):
        return self.report

In [56]:
e = Executor()
e.create_models()
e.fit_models()
e.summary()

Available model parameters:
    drop_stop_words with values: [True, False]
    not_join with values: [True, False]
    data_model_type with values: ['cv', 'tfidf', 'boolean']
    classifier with values: ['Naive', 'SVM']
creating models...



24 models created
fitting models...


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   33.6s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   33.1s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   32.8s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   33.1s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   33.1s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   33.3s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   40.6s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   41.0s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   39.9s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   40.1s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   40.4s finished


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  24 out of  24 | elapsed:   40.8s finished





Unnamed: 0,parameters,accuracy,summary,model_details
14,"(False, cv, False, Naive)",0.643447,precision recall f1-score support\n\n positive 0.56 0.73 0.63 475\n negative 0.71 0.82 0.76 766\n neutral 0.62 0.26 0.36 453\n\n micro avg 0.64 0.64 0.64 1694\n macro avg 0.63 0.60 0.59 1694\nweighted avg 0.64 0.64 0.62 1694\n,-
12,"(False, cv, True, Naive)",0.637544,precision recall f1-score support\n\n positive 0.54 0.73 0.62 475\n negative 0.72 0.80 0.76 766\n neutral 0.60 0.26 0.36 453\n\n micro avg 0.64 0.64 0.64 1694\n macro avg 0.62 0.60 0.58 1694\nweighted avg 0.64 0.64 0.61 1694\n,-
22,"(False, boolean, False, Naive)",0.635773,precision recall f1-score support\n\n positive 0.54 0.73 0.62 475\n negative 0.71 0.80 0.75 766\n neutral 0.63 0.25 0.35 453\n\n micro avg 0.64 0.64 0.64 1694\n macro avg 0.62 0.60 0.58 1694\nweighted avg 0.64 0.64 0.61 1694\n,-
20,"(False, boolean, True, Naive)",0.631641,precision recall f1-score support\n\n positive 0.54 0.73 0.62 475\n negative 0.71 0.80 0.75 766\n neutral 0.61 0.24 0.34 453\n\n micro avg 0.63 0.63 0.63 1694\n macro avg 0.62 0.59 0.57 1694\nweighted avg 0.63 0.63 0.61 1694\n,-
19,"(False, tfidf, False, SVM)",0.62928,precision recall f1-score support\n\n positive 0.56 0.68 0.61 475\n negative 0.71 0.78 0.75 766\n neutral 0.53 0.31 0.39 453\n\n micro avg 0.63 0.63 0.63 1694\n macro avg 0.60 0.59 0.58 1694\nweighted avg 0.62 0.63 0.61 1694\n,"{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}"
17,"(False, tfidf, True, SVM)",0.626919,precision recall f1-score support\n\n positive 0.55 0.68 0.61 475\n negative 0.71 0.78 0.74 766\n neutral 0.53 0.31 0.39 453\n\n micro avg 0.63 0.63 0.63 1694\n macro avg 0.60 0.59 0.58 1694\nweighted avg 0.62 0.63 0.61 1694\n,"{'C': 1, 'gamma': 0.1, 'kernel': 'linear'}"
10,"(True, boolean, False, Naive)",0.626328,precision recall f1-score support\n\n positive 0.54 0.75 0.63 475\n negative 0.72 0.75 0.73 766\n neutral 0.56 0.28 0.37 453\n\n micro avg 0.63 0.63 0.63 1694\n macro avg 0.61 0.60 0.58 1694\nweighted avg 0.63 0.63 0.61 1694\n,-
2,"(True, cv, False, Naive)",0.626328,precision recall f1-score support\n\n positive 0.54 0.75 0.63 475\n negative 0.72 0.75 0.73 766\n neutral 0.54 0.28 0.37 453\n\n micro avg 0.63 0.63 0.63 1694\n macro avg 0.60 0.60 0.58 1694\nweighted avg 0.62 0.63 0.61 1694\n,-
8,"(True, boolean, True, Naive)",0.623967,precision recall f1-score support\n\n positive 0.53 0.74 0.62 475\n negative 0.72 0.76 0.74 766\n neutral 0.56 0.28 0.37 453\n\n micro avg 0.62 0.62 0.62 1694\n macro avg 0.60 0.59 0.58 1694\nweighted avg 0.62 0.62 0.61 1694\n,-
0,"(True, cv, True, Naive)",0.620425,precision recall f1-score support\n\n positive 0.54 0.74 0.63 475\n negative 0.71 0.75 0.73 766\n neutral 0.53 0.27 0.35 453\n\n micro avg 0.62 0.62 0.62 1694\n macro avg 0.59 0.59 0.57 1694\nweighted avg 0.61 0.62 0.60 1694\n,-


In [None]:
print(e.summary().loc[...]['summary'])