In [1]:
import pandas as pd
import numpy as np
import os
%matplotlib inline
import matplotlib.pyplot as plt
import IPython.display as ipd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go
import seaborn as sns
from numpy.random import seed
from numpy.random import randn
from scipy.stats import mannwhitneyu
import warnings
# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides pandas / scikit-learn deprecation warnings.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator.
seed(1)

# Show up to 500 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 500)

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split

In [2]:
# Load 'news.words': tab-separated, no header row (columns are numbered 0..N).
df = pd.read_csv('news.words', sep='\t', header=None)

In [3]:
# Add derived features.
def make_preparations (df):
    """Name the raw columns and add per-word timing features.

    The frame is modified in place and also returned, so both
    ``make_preparations(df)`` and ``df = make_preparations(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns ``0..3`` hold, in order, the word start time,
        end time, the word itself and a probability value.

    Returns
    -------
    pandas.DataFrame
        The same object with the columns renamed to ``Start``, ``End``,
        ``Word``, ``Probability`` and these columns added:

        * ``Duration``      - ``End - Start``
        * ``Length``        - number of characters in the word
        * ``Syllable``      - number of Russian vowels (case-insensitive), at least 1
        * ``Rate``          - characters per unit of duration
        * ``Rate_syllable`` - syllables per unit of duration
        * ``Pause``         - gap between this word's end and the next word's
          start; NaN for the last row
        * ``Sign``          - empty string, to be filled with punctuation labels
    """
    df.rename(columns={0: 'Start', 1: 'End', 2: 'Word', 3: 'Probability'}, inplace=True)

    # Word duration.
    df['Duration'] = df['End'] - df['Start']
    # Word length in characters.
    df['Length'] = df['Word'].str.len().astype('int64')

    # Syllable count approximated by the number of vowels. Lower-casing first
    # makes capitalised words count correctly; words without any vowel
    # (digits, single consonants) are treated as one syllable.
    vowel_count = df['Word'].str.lower().str.count('[аиуэоыяюеё]')
    df['Syllable'] = vowel_count.clip(lower=1).astype('int64')

    # Speaking rate in characters and in syllables per unit of duration.
    df['Rate'] = df['Length'] / df['Duration']
    df['Rate_syllable'] = df['Syllable'] / df['Duration']

    # Pause after the word: next word's start minus this word's end.
    # shift(-1) is positional, so it works for any index and leaves NaN in the
    # last row; it also avoids chained assignment (df['Pause'][i] = ...).
    df['Pause'] = df['Start'].shift(-1) - df['End']

    # Column for the manually assigned punctuation marks.
    df['Sign'] = ''

    return df



In [4]:
def get_mov_av (df, field, n=5):
    """Add a trailing moving average of ``field`` and its residual.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the numeric column ``field``.
    field : str
        Name of the column to smooth.
    n : int, default 5
        Window size; must be at least 1.

    Returns
    -------
    pandas.DataFrame
        The same object with two added columns:

        * ``'Mov_avr_' + field`` - for the first ``n`` rows, the mean of those
          first ``n`` values; for every later row, the mean of the ``n`` values
          immediately before it (the row itself is excluded).
        * ``'Mov_avr_res_' + field`` - ``field`` minus the moving average, with
          the first ``n`` rows forced to 0.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer')

    values = df[field]
    # Rows that do not yet have n predecessors (positional, index-agnostic).
    warmup = np.arange(len(df)) < n

    # Mean of the previous n rows: rolling mean ending at the current row,
    # shifted down by one. min_periods=1 skips NaNs the way Series.mean does.
    trailing = values.rolling(n, min_periods=1).mean().shift(1)
    moving_avg = trailing.mask(warmup, values.iloc[:n].mean())

    # Whole-column assignment avoids chained indexing (df[col][0:n] = ...).
    df['Mov_avr_' + field] = moving_avg
    df['Mov_avr_res_' + field] = (values - moving_avg).mask(warmup, 0)
    return df

In [5]:
# Add the derived feature columns to the first file's frame.
df = make_preparations (df)
# Optional moving-average feature on the syllable rate; currently disabled.
#df = get_mov_av (df, 'Rate_syllable', n=4)


In [6]:
# Manually assign the punctuation mark that follows each word (row labels).
# One .loc call per mark: indexing the column with a bare tuple
# (df['Sign'][7, 24, ...] = '.') is chained assignment that relies on legacy
# tuple-key handling.
df.loc[[7, 24, 33, 41, 54], 'Sign'] = '.'
df.loc[[12, 17, 18, 19, 29, 30, 31, 38, 39, 40, 48], 'Sign'] = ','
# Binary target: 0 for "no mark" or a comma, 1 for any other mark.
df['Sign_fit'] = (~df['Sign'].isin(['', ','])).astype('int64')

In [7]:
# The last word has no following word, so its pause is NaN; fill it with the
# mean pause of the rows labelled as sentence ends (Sign_fit == 1).
# A single .loc call replaces the chained assignment df['Pause'][len(df)-1] = ...
df.loc[df.index[-1], 'Pause'] = df.loc[df['Sign_fit'] == 1, 'Pause'].mean()

In [8]:
# Display the prepared frame.
df

Unnamed: 0,Start,End,Word,Probability,Duration,Length,Syllable,Rate,Rate_syllable,Pause,Sign,Sign_fit
0,0.66,0.78,с,0.98,0.12,1,1,8.333333,8.333333,0.0,,0
1,0.78,1.14,4-ёх,1.0,0.36,4,1,11.111111,2.777778,0.0,,0
2,1.14,1.56,столичных,1.0,0.42,9,3,21.428571,7.142857,0.0,,0
3,1.56,2.04,вокзалов,1.0,0.48,8,3,16.666667,6.25,0.0,,0
4,2.04,2.34,начали,1.0,0.3,6,3,20.0,10.0,0.0,,0
5,2.34,2.7,ходить,0.99,0.36,6,2,16.666667,5.555556,0.11,,0
6,2.81,3.3,зимние,0.67,0.49,6,3,12.244898,6.122449,0.0,,0
7,3.3,3.96,экспрессы,0.43,0.66,9,3,13.636364,4.545455,0.03,.,1
8,3.99,4.68,москвичей,0.99,0.69,9,3,13.043478,4.347826,0.0,,0
9,4.68,5.28,приглашают,0.95,0.6,10,4,16.666667,6.666667,0.0,,0


In [9]:
# Use the dataset built from this file as the training set.
# Columns are dropped by keyword: passing the axis positionally (drop(cols, 1))
# is no longer accepted by pandas 2.x.
X_train = df.drop(columns=['Start', 'End', 'Word', 'Probability', 'Sign', 'Sign_fit', 'Duration'])
y_train = df['Sign_fit']

In [10]:
# Standardize the features; the scaler is fitted on the training data here.
scale_features_std = StandardScaler() 
X_train = scale_features_std.fit_transform(X_train) 

In [11]:
# Fit a logistic regression on the scaled training features.
clf = LogisticRegression(C=1, random_state=0).fit(X_train, y_train)

In [12]:
# Accuracy on the training set.
clf.score(X_train, y_train)

0.9636363636363636

In [13]:
# Predicted class for every training row.
clf.predict(X_train)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int64)

In [14]:
# Store the predictions as punctuation marks: '.' for class 1, '' otherwise,
# then display every row of the frame.
df['Sign_predict'] = ['.' if label == 1 else '' for label in clf.predict(X_train)]
df.head(n=len(df))

Unnamed: 0,Start,End,Word,Probability,Duration,Length,Syllable,Rate,Rate_syllable,Pause,Sign,Sign_fit,Sign_predict
0,0.66,0.78,с,0.98,0.12,1,1,8.333333,8.333333,0.0,,0,
1,0.78,1.14,4-ёх,1.0,0.36,4,1,11.111111,2.777778,0.0,,0,
2,1.14,1.56,столичных,1.0,0.42,9,3,21.428571,7.142857,0.0,,0,
3,1.56,2.04,вокзалов,1.0,0.48,8,3,16.666667,6.25,0.0,,0,
4,2.04,2.34,начали,1.0,0.3,6,3,20.0,10.0,0.0,,0,
5,2.34,2.7,ходить,0.99,0.36,6,2,16.666667,5.555556,0.11,,0,
6,2.81,3.3,зимние,0.67,0.49,6,3,12.244898,6.122449,0.0,,0,
7,3.3,3.96,экспрессы,0.43,0.66,9,3,13.636364,4.545455,0.03,.,1,
8,3.99,4.68,москвичей,0.99,0.69,9,3,13.043478,4.347826,0.0,,0,
9,4.68,5.28,приглашают,0.95,0.6,10,4,16.666667,6.666667,0.0,,0,


In [15]:
# Take the next file and apply the same preparation to it.
df1 = pd.read_csv('news2.words', sep='\t', header=None)
df1 = make_preparations(df1)


In [16]:
# Manually assign the punctuation mark that follows each word (row labels).
# One .loc call per mark replaces the tuple-key chained assignment
# (df1['Sign'][20, 34, ...] = '.'), which relies on legacy indexing behaviour.
df1.loc[[20, 34, 37, 52, 55, 101, 109, 129, 159, 199, 216, 232, 252, 276, 285, 303, 313], 'Sign'] = '.'
df1.loc[[7, 24, 44, 80, 92, 118, 137, 148, 154, 160, 165, 174, 189, 204, 208, 227, 244, 257, 267, 277, 293, 304, 306], 'Sign'] = ','
df1.loc[[71, 196, 240], 'Sign'] = ':'
df1.loc[106, 'Sign'] = '-'
df1.loc[142, 'Sign'] = '('
df1.loc[144, 'Sign'] = ')'
# Binary target: 1 only for a full stop, 0 for every other mark or no mark.
df1['Sign_fit'] = df1['Sign'].eq('.').astype('int64')


In [17]:
# The last word has no following word, so its pause is NaN; fill it with the
# mean pause of the rows labelled as sentence ends. .loc replaces the chained
# assignment df1['Pause'][len(df1)-1] = ...
df1.loc[df1.index[-1], 'Pause'] = df1.loc[df1['Sign_fit'] == 1, 'Pause'].mean()
# Columns are dropped by keyword: the positional axis form drop(cols, 1) is no
# longer accepted by pandas 2.x.
X_test_1 = df1.drop(columns=['Start', 'End', 'Word', 'Probability', 'Sign', 'Sign_fit', 'Duration'])
y_test_1 = df1['Sign_fit']

In [18]:
# NOTE(review): fit_transform re-fits the shared scaler on this file, replacing
# the statistics it learned from X_train. Confirm that per-file standardization
# is intended; otherwise this should be scale_features_std.transform(...).
X_test_1 = scale_features_std.fit_transform(X_test_1) 

In [19]:
# Accuracy on this file.
clf.score(X_test_1, y_test_1)

0.9299363057324841

In [20]:
# Store the predictions as punctuation marks: '.' for class 1, '' otherwise,
# then display every row of the frame.
df1['Sign_predict'] = ['.' if label == 1 else '' for label in clf.predict(X_test_1)]
df1.head(n=len(df1))

Unnamed: 0,Start,End,Word,Probability,Duration,Length,Syllable,Rate,Rate_syllable,Pause,Sign,Sign_fit,Sign_predict
0,0.11,0.29,в,1.0,0.18,1,1,5.555556,5.555556,0.0,,0,
1,0.29,0.95,пентагоне,1.0,0.66,9,4,13.636364,6.060606,0.0,,0,
2,0.95,1.31,сегодня,1.0,0.36,7,3,19.444444,8.333333,0.0,,0,
3,1.31,1.91,жаловались,1.0,0.6,10,4,16.666667,6.666667,0.0,,0,
4,1.91,2.03,на,1.0,0.12,2,1,16.666667,8.333333,0.0,,0,
5,2.03,2.57,российский,0.73,0.54,10,3,18.518519,5.555556,0.0,,0,
6,2.57,3.08,истребитель,0.79,0.51,11,4,21.568627,7.843137,0.0,,0,
7,3.08,3.89,су-27,0.64,0.81,5,1,6.17284,1.234568,0.0,",",0,
8,3.89,4.19,который,0.58,0.3,7,3,23.333333,10.0,0.0,,0,
9,4.19,4.25,в,1.0,0.06,1,1,16.666667,16.666667,0.0,,0,


In [21]:
# Take the third file and apply the same preparation to it.
df2 = pd.read_csv('speech.words', sep='\t', header=None)
df2 = make_preparations(df2)


In [22]:
# Manually assign the punctuation mark that follows each word (row labels).
# One .loc call per mark replaces the tuple-key chained assignment
# (df2['Sign'][8, 12, ...] = '.'), which relies on legacy indexing behaviour.
df2.loc[[8, 12, 19, 25], 'Sign'] = '.'
df2.loc[21, 'Sign'] = ','
df2.loc[[1, 27, 28], 'Sign'] = '!'
# Binary target: 1 for a sentence-ending mark ('.' or '!'), 0 otherwise.
df2['Sign_fit'] = df2['Sign'].isin(['.', '!']).astype('int64')

In [23]:
# The last word has no following word, so its pause is NaN; fill it with the
# mean pause of the rows labelled as sentence ends. .loc replaces the chained
# assignment df2['Pause'][len(df2)-1] = ...
df2.loc[df2.index[-1], 'Pause'] = df2.loc[df2['Sign_fit'] == 1, 'Pause'].mean()
# Columns are dropped by keyword: the positional axis form drop(cols, 1) is no
# longer accepted by pandas 2.x.
X_test_2 = df2.drop(columns=['Start', 'End', 'Word', 'Probability', 'Sign', 'Sign_fit', 'Duration'])
y_test_2 = df2['Sign_fit']
# NOTE(review): fit_transform re-fits the shared scaler on this file instead of
# reusing the statistics from the training data. Confirm that per-file
# standardization is intended; otherwise use scale_features_std.transform(...).
X_test_2 = scale_features_std.fit_transform(X_test_2)

In [24]:
# Accuracy on this file.
clf.score(X_test_2, y_test_2)

0.896551724137931

In [25]:
# Store the predictions as punctuation marks: '.' for class 1, '' otherwise,
# then display every row of the frame.
df2['Sign_predict'] = ['.' if label == 1 else '' for label in clf.predict(X_test_2)]
df2.head(n=len(df2))

Unnamed: 0,Start,End,Word,Probability,Duration,Length,Syllable,Rate,Rate_syllable,Pause,Sign,Sign_fit,Sign_predict
0,2.09,2.57,доброе,1.0,0.48,6,3,12.5,6.25,0.0,,0,
1,2.57,3.05,утро,1.0,0.48,4,2,8.333333,4.166667,0.53,!,1,
2,3.58,4.47,московское,0.98,0.89,10,4,11.235955,4.494382,0.0,,0,
3,4.47,4.95,время,1.0,0.48,5,2,10.416667,4.166667,0.12,,0,
4,5.07,5.46,8,1.0,0.39,1,1,2.564103,2.564103,0.0,,0,
5,5.46,5.88,часов,1.0,0.42,5,2,11.904762,4.761905,0.0,,0,
6,5.88,6.42,утра,1.0,0.54,4,2,7.407407,3.703704,0.49,,0,
7,6.91,8.11,28,1.0,1.2,2,1,1.666667,0.833333,0.18,,0,
8,8.29,8.89,минут,1.0,0.6,5,2,8.333333,3.333333,2.14,.,1,.
9,11.03,11.48,я,1.0,0.45,1,1,2.222222,2.222222,0.0,,0,
