In [1]:
from matplotlib.colors import ListedColormap
from sklearn import cross_validation, datasets, linear_model, metrics
from pandas import read_csv, concat, DataFrame
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp

In [2]:
#%pylab inline

In [3]:
allIndic = ["zcr", "mfcc", "energy", "autoc"]
type  = "max"
all = True
useSilent = True
if useSilent:
    dataSets = ["noice", "silent", "voice"]
else:
    dataSets = ["noice", "voice"]

In [4]:
data = None
data_answer = np.array([])
for dataSet in dataSets:
    currentData = DataFrame({})
    for indic in allIndic:
        file = dataSet + "_anal/anal_" + indic + ".txt"
        csv = read_csv(file, "\t")
        currentData = concat([currentData, csv], axis=1, join='inner')
        #print(csv)
    if data is None:
        data = currentData
    else:
        data = concat([data,currentData], axis=0)
        #print(data)
    if dataSet == "voice":
        #print("voice")
        data_answer = np.concatenate((data_answer, np.ones(currentData["zcr_sum"].size)))
    else:
        #print("not voice")
        data_answer = np.concatenate((data_answer, np.zeros(currentData["zcr_sum"].size)))
    
    
  
        

In [5]:
#print(data.keys)
print (data_answer.size)

3784


In [6]:
print(data[:5])

   zcr_sum   zcr_avg   zcr_min   zcr_max  mfcc_sum  mfcc_avg  mfcc_min  \
0    202.0  0.197266  0.197266  0.197266 -0.002441 -0.000005 -0.497131   
1    316.0  0.308594  0.308594  0.308594  0.002319  0.000002 -0.018280   
2    118.0  0.115234  0.115234  0.115234  0.003296  0.000003 -0.003510   
3    369.0  0.360352  0.360352  0.360352 -0.002869 -0.000003 -0.001343   
4    294.0  0.287109  0.287109  0.287109  0.002472  0.000002 -0.001038   

   mfcc_max   energy_sum  energy_avg  energy_min  energy_max  energy_value  \
0  0.210632     0.000000    0.000000    0.000000    0.000000      0.000000   
1  0.018188  1271.336925    0.620770    0.620770    0.620770      0.385355   
2  0.002716    33.460710    0.032676    0.032676    0.032676      0.001068   
3  0.001801    12.694195    0.012397    0.012397    0.012397      0.000154   
4  0.001190    12.144274    0.011860    0.011860    0.011860      0.000141   

   autoc_sum     autoc_avg  autoc_min  autoc_max  
0   0.188753  1.843295e-04  -0.0689

In [7]:
print(data["zcr_sum"].size)

3784


In [8]:
np.random.RandomState(seed =1)

<mtrand.RandomState at 0x4c3b0894e0>

In [9]:
if all:
    train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(data, data_answer, 
                                                                                    test_size = 0.3,
                                                                                    random_state = 1)
else:
    train_data, test_data, train_labels, test_labels = cross_validation.train_test_split(data[type], data_answer, 
                                                                                    test_size = 0.3,
                                                                                    random_state = 1)

### Свой классификатор

In [10]:
def energy_result(x_in):
    x = np.array([x_in[8], x_in[9], x_in[10], x_in[11]])
    if x_in[12] < 0.02:
        return -10
    add = 0
    #if abs(x[0]) < 0.02:
#            add -= 10
    return (-0.43582075 + np.dot(x, [-5.19400273e-05, 7.44976464e-01, 7.44976464e-01, 7.44976464e-01]) > 0) +  add

In [11]:
def zcr_result(x_in):
    x = np.array([x_in[0], x_in[1], x_in[2], x_in[3]])
    add = 1.2667888
    return (np.dot(x, [6.03061087e-04, -1.87091677e+00, -1.87091677e+00, -1.87091677e+00]) +  add) > 0

In [12]:
def mfcc_result(x_in):
    x = np.array([x_in[4], x_in[5], x_in[6], x_in[7]])
    add = 0.61336651
    return (np.dot(x,  [-8.70000408e-02, -4.49398273e-04, 5.31164313e+00, 2.36476296e+00]) +  add) > 0

In [13]:
def autoc_result(x_in):
    x = np.array([x_in[13], x_in[14], x_in[15], x_in[16]])
    add = 0.59867863
    return (np.dot(x,  [-0.15975306,  0.0412668 , -0.41195572, -0.15356962]) +  add) > 0

In [14]:
def summary(x):
    return energy_result(x) + mfcc_result(x) + zcr_result(x) >= 2

In [15]:
def summary_score(x):
    res = energy_result(x) + mfcc_result(x) + zcr_result(x)
    if res <= 0:
        return -1.0
    elif res == 1:
        return -0.25
    elif res == 2:
        return 0.25
    else:
        return 1.0

In [16]:
#list(map(lambda x:  summary(x), test_data.values))

In [17]:
myown_predictions = np.array(list(map(lambda x: 1 if summary(x) else 0, test_data.values)))

In [18]:
myown_score = np.array(list(map(lambda x: summary_score(x), test_data.values)))

In [19]:
#print(myown_score)

In [20]:
#print(myown_predictions[0:50])

In [21]:
#print (test_labels[0:50])

In [22]:
metrics.accuracy_score(test_labels, myown_predictions)

0.85915492957746475

In [23]:
metrics.roc_auc_score(test_labels, myown_predictions)

0.86113886113886107

In [24]:
#metrics.pr_auc_score(test_labels, myown_predictions)

In [25]:
metrics.precision_score(test_labels, myown_predictions, pos_label = 0) #шум

0.8214285714285714

In [26]:
metrics.precision_score(test_labels, myown_predictions, pos_label = 1) #речь

0.89583333333333337

In [27]:
print(type(myown_predictions))

TypeError: 'str' object is not callable

In [28]:
# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

#fpr, tpr, thresholds = metrics.roc_curve(test_labels, myown_predictions)
#roc_auc = auc(y_true, y_score)
for i in range(2):
    print(i)
    fpr[i], tpr[i], _ = metrics.roc_curve(test_labels, myown_predictions)
    roc_auc[i] = metrics.auc(fpr[i], tpr[i])

0
1


In [29]:
##############################################################################
# Plot of a ROC curve for a specific class
plt.figure()
plt.plot(fpr[1], tpr[1], label='ROC curve (area = %0.2f)' % roc_auc[1])
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
#plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()