In [1]:
%matplotlib inline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.cross_validation import KFold
import scipy as sp
import pandas as pd
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
import pymystem3

In [2]:
# В этой клетке задается функция part_speech, которая по входному массиву-результату работы mystem analyse возвращает тупл,
# к котром первый элемент векторм из 9 элементов, каждому из которого соответствует количество одной из восьми частей речи
# в массиве, а второй - это сам массив, в котором слова заменяны на пронумерованные части речи
# Функции part_vec и part_seq возвращают первый и второй элементы тупла

def part_speech(a):
    v = [0] * 9
    res = []
    for i in range(0, len(a), 2):
        try:
            name = a[i]['analysis'][0]['gr']
        except:
            name = 'OOOU'
            res.append('OOOU')
        if name[0] == 'S':
            res.append(0)
            v[0] += 1
        elif name[:3] == 'ADV' or name[:6] == 'ADVPRO':           
            res.append(1)
            v[1] += 1
        elif name[0] == 'A' or name[:4] == 'ANUM' or name[:4] == 'APRO':            
            res.append(2)
            v[2] += 1
        elif name[:2] == 'PR':            
            res.append(3)
            v[3] += 1
        elif name[:4] == 'CONJ':            
            res.append(4)
            v[4] += 1
        elif name[:4] == 'PART':            
            res.append(5)
            v[5] += 1
        elif name[0] == 'V':            
            res.append(6)
            v[6] += 1
        elif name.find('NUM') != -1:            
            res.append(7)
            v[7] += 1
        elif name[:4] == 'INTJ':            
            res.append(8)
            v[8] += 1
    
    return (v, res)
        
def part_vec(a):
    return part_speech(a)[0]

def part_seq(a):
    return part_speech(a)[1]

In [3]:
# В этой клетке считывается датасет, все предложения превращаются в строку, разделенную запятыми по исходным предложениям,
# применяется майстем анализ, результат которого дальше разбивается опять на предложения
# далее формируются столбцы датафрейма - предложение-массив со словами, заменными частями речи, массив количества частей речи
# в предложении, количество слов в предложении


df = pd.read_csv("text.csv")
text = ''
for i in range(len(df)):
    text = text + df.iat[i,0] + ','

ms = pymystem3.Mystem()    
text_analysis = ms.analyze(text)

m = []

for i in range(len(text_analysis)):
    if str(text_analysis[i]) == '{\'text\': \',\'}':
        m.append(i)
        
text_analysis_array = [''] * len(df)
text_analysis_array[0] = text_analysis[:m[0]] 

for i in range(1, len(df) - 1):
    text_analysis_array[i] = text_analysis[m[i - 1] + 1 : m[i]] 
  
s = pd.Series(text_analysis_array, name='prvec_prev')

df['prvec_prev'] = s
df['pvec'] = df['prvec_prev'].to_frame().applymap(part_vec)
df['pseq'] = df['prvec_prev'].to_frame().applymap(part_seq)
del df['prvec_prev']

for i in range(9):
    df[i] = df['pvec'].to_frame().applymap(lambda x: x[i])
    
df['len'] = df['pseq'].to_frame().applymap(lambda x: len(x))

In [4]:
# в этой клетке задается функция, подчитывающая количество встречающихся пар в предложении
def find_neib(a, n, m):
    res = 0
    for k in range(len(a) - 1):
        if a[k : k + 2] == [n, m]:
            res += 1
    return res

In [5]:
def find_neib3(a, n, m, l):
    res = 0
    for k in range(len(a) - 2):
        
        if a[k : k + 3] == [n, m, l]:
            res += 1
    return res

In [6]:
#здесь заводятся столбцы датафрейма - количество соседствствующих пар частей речи в предложении
feat_pair = [0, 1, 2, 3, 4, 5, 6, 7, 8]
for i in feat_pair:
    for j in feat_pair:
        df[str(i) + str(j)] = df['pseq'].to_frame().applymap(lambda x: find_neib(x, i, j))

In [7]:
df.head()

Unnamed: 0,txt,is_natural,pvec,pseq,0,1,2,3,4,5,...,78,80,81,82,83,84,85,86,87,88
0,вдогонку спрашивать он,1.0,"[1, 1, 0, 0, 0, 0, 1, 0, 0]","[1, 6, 0]",1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,почему вы всегда всегда правый,1.0,"[1, 3, 1, 0, 0, 0, 0, 0, 0]","[1, 0, 1, 1, 2]",1,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,же,0.0,"[0, 0, 0, 0, 0, 1, 0, 0, 0]",[5],0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,зависть не зависть но нечто болезненный шевели...,1.0,"[4, 0, 1, 1, 1, 1, 1, 0, 0]","[0, 5, 0, 4, 0, 2, 6, 3, 0]",4,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,с обратимый пленка,1.0,"[1, 0, 1, 1, 0, 0, 0, 0, 0]","[3, 2, 0]",1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# в этой клетке указываются фичи, которые мы будет использовать при классификации
frames = []
for i in range(9):
    frames.append(df[i])
    
for i in feat_pair:
    for j in feat_pair:
        frames.append(df[str(i) + str(j)])
X_text = pd.concat(frames, axis=1)
y = df['is_natural']

In [9]:
# тестирование модели svm
model = svm.SVC(random_state=42)

pipeline = Pipeline([
    ('classification', model),
])

n_fold = 3
param_grid = {'classification__C':[0.5, 1, 2, 5, 10, 100, 200], 'classification__degree':[2,3],'classification__kernel':['linear','poly','rbf']}     
cv = KFold(n=df.shape[0], n_folds=n_fold, shuffle=True, random_state=42)
    

clf = GridSearchCV(estimator=pipeline, cv=cv, param_grid=param_grid, n_jobs=4, verbose=1, scoring="accuracy")
clf.fit(X_text, y) 
print("\nBest parameters set found on development set:\n")
print(clf.best_params_)
print("\n\nGrid scores on development set:\n")
means = clf.grid_scores_
print(means)
scorer = clf.scorer_
print("\nUsed scorer:\t", scorer)
print("\nBest score:\t", clf.best_score_)

Fitting 3 folds for each of 42 candidates, totalling 126 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.7s
[Parallel(n_jobs=4)]: Done 126 out of 126 | elapsed:  1.6min finished



Best parameters set found on development set:

{'classification__degree': 2, 'classification__C': 2, 'classification__kernel': 'linear'}


Grid scores on development set:

[mean: 0.78339, std: 0.01317, params: {'classification__degree': 2, 'classification__kernel': 'linear', 'classification__C': 0.5}, mean: 0.70031, std: 0.03647, params: {'classification__degree': 2, 'classification__C': 0.5, 'classification__kernel': 'poly'}, mean: 0.75388, std: 0.01646, params: {'classification__degree': 2, 'classification__C': 0.5, 'classification__kernel': 'rbf'}, mean: 0.78339, std: 0.01317, params: {'classification__degree': 3, 'classification__C': 0.5, 'classification__kernel': 'linear'}, mean: 0.65528, std: 0.02947, params: {'classification__degree': 3, 'classification__kernel': 'poly', 'classification__C': 0.5}, mean: 0.75388, std: 0.01646, params: {'classification__degree': 3, 'classification__C': 0.5, 'classification__kernel': 'rbf'}, mean: 0.78261, std: 0.00867, params: {'classification__d

In [10]:
# тестирование модели градиентного бустинга
model = GradientBoostingClassifier(random_state=42)

pipeline = Pipeline([
 
    ('classification', model),
])

n_fold = 3
param_grid = {'classification__n_estimators':[50, 100, 150, 200, 250, 300], 'classification__max_depth':[2,3,4]}     
cv = KFold(n=df.shape[0], n_folds=n_fold, shuffle=True, random_state=42)
    

clf = GridSearchCV(estimator=pipeline, cv=cv, param_grid=param_grid, n_jobs=4, verbose=1, scoring="accuracy")
clf.fit(X_text, y) 
print("\nBest parameters set found on development set:\n")
print(clf.best_params_)
print("\n\nGrid scores on development set:\n")
means = clf.grid_scores_
print(means)
scorer = clf.scorer_
print("\nUsed scorer:\t", scorer)
print("\nBest score:\t", clf.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   17.0s
[Parallel(n_jobs=4)]: Done  54 out of  54 | elapsed:   24.0s finished



Best parameters set found on development set:

{'classification__n_estimators': 250, 'classification__max_depth': 2}


Grid scores on development set:

[mean: 0.77096, std: 0.00878, params: {'classification__n_estimators': 50, 'classification__max_depth': 2}, mean: 0.77950, std: 0.01618, params: {'classification__n_estimators': 100, 'classification__max_depth': 2}, mean: 0.79037, std: 0.01063, params: {'classification__n_estimators': 150, 'classification__max_depth': 2}, mean: 0.78882, std: 0.01058, params: {'classification__n_estimators': 200, 'classification__max_depth': 2}, mean: 0.79270, std: 0.01000, params: {'classification__n_estimators': 250, 'classification__max_depth': 2}, mean: 0.79270, std: 0.00877, params: {'classification__n_estimators': 300, 'classification__max_depth': 2}, mean: 0.78028, std: 0.01899, params: {'classification__n_estimators': 50, 'classification__max_depth': 3}, mean: 0.78416, std: 0.01792, params: {'classification__n_estimators': 100, 'classification__

In [11]:
# тестирование модели логистической регрессии
model = LogisticRegression(random_state=42)

pipeline = Pipeline([
 
    ('classification', model),
])

n_fold = 3
param_grid = {'classification__C':[50, 100, 150, 200, 250, 300],'classification__class_weight':['balanced', None], 'classification__penalty':['l1','l2']}     
cv = KFold(n=df.shape[0], n_folds=n_fold, shuffle=True, random_state=42)
    

clf = GridSearchCV(estimator=pipeline, cv=cv, param_grid=param_grid, n_jobs=4, verbose=1, scoring="accuracy")
clf.fit(X_text, y) 
print("\nBest parameters set found on development set:\n")
print(clf.best_params_)
print("\n\nGrid scores on development set:\n")
means = clf.grid_scores_
print(means)
scorer = clf.scorer_
print("\nUsed scorer:\t", scorer)
print("\nBest score:\t", clf.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.2s
[Parallel(n_jobs=4)]: Done  72 out of  72 | elapsed:   23.7s finished



Best parameters set found on development set:

{'classification__class_weight': 'balanced', 'classification__C': 200, 'classification__penalty': 'l1'}


Grid scores on development set:

[mean: 0.79658, std: 0.01597, params: {'classification__class_weight': 'balanced', 'classification__C': 50, 'classification__penalty': 'l1'}, mean: 0.79658, std: 0.01696, params: {'classification__class_weight': 'balanced', 'classification__C': 50, 'classification__penalty': 'l2'}, mean: 0.79503, std: 0.01606, params: {'classification__class_weight': None, 'classification__C': 50, 'classification__penalty': 'l1'}, mean: 0.79503, std: 0.01692, params: {'classification__class_weight': None, 'classification__C': 50, 'classification__penalty': 'l2'}, mean: 0.79658, std: 0.01597, params: {'classification__class_weight': 'balanced', 'classification__C': 100, 'classification__penalty': 'l1'}, mean: 0.79581, std: 0.01598, params: {'classification__class_weight': 'balanced', 'classification__C': 100, 'classific