In [None]:
# -*- coding: utf-8 -*-
"""
@author: dib_n
"""
#################################################################
#Imports
#################################################################
import pandas as pd
import numpy as np

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Learning
from sklearn import svm

#SearchGrid
from sklearn.model_selection import GridSearchCV

#Saving
from sklearn.externals import joblib

#################################################################
#Loading Models
#################################################################
# Fitted preprocessing objects persisted by an earlier training step.
# NOTE: joblib.load unpickles its input, so only load files from a trusted source.
encoder = joblib.load('../models/encoder.pkl')
scaler = joblib.load('../models/scaler.pkl')

# One classifier per target: 'geral' is stored without a suffix, every other
# target is stored as SVCrbf_<target>.pkl.
models = {}
for target in ['geral', 'a', 'b', 'c', 'd', 'h', 'm', 'n', 'x', '6', '7']:
    models[target] = joblib.load(
        '../models/SVCrbf.pkl' if target == 'geral'
        else '../models/SVCrbf_' + target + '.pkl'
    )

#################################################################
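#To stack multiple models, a heuristic is used.
#Heuristic: if exactly one per-label model votes 1, that label is the answer;
#otherwise the prediction of the general ('geral') model is used.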
class Voter():
    """Combine a general model with per-label models through a voting rule.

    For every sample, each model listed in ``targets`` is asked for a
    prediction. If the per-label votes (everything except ``'geral'``) sum to
    exactly 1, the label whose vote is 1 wins; otherwise the decoded
    prediction of the ``'geral'`` model is used.

    Parameters
    ----------
    models : dict
        Maps each target name to an object exposing ``predict(X)``.
    targets : sequence of str
        Names of the models to consult; must include ``'geral'``.
    encoder : object
        Exposes ``inverse_transform`` and is applied to the ``'geral'``
        predictions to turn them back into labels.
    """

    def __init__(self,models,targets,encoder):
        self.targets=targets
        self.models=models
        self.encoder = encoder

    def check_line(self,x):
        """Resolve the final label for one row of model outputs.

        Parameters
        ----------
        x : pandas.Series
            Indexed by target name; ``x['geral']`` holds the decoded label of
            the general model, the remaining entries hold the per-label votes.

        Returns
        -------
        The index label of the single vote equal to 1 when the votes sum to 1,
        otherwise ``x['geral']``.
        """
        votes = x.drop('geral')
        if votes.sum() == 1:
            for label, vote in votes.items():
                if vote == 1:
                    return label
        # No unambiguous single vote (including the case where the votes sum
        # to 1 without any single vote being 1): trust the general model.
        return x['geral']

    def Score(self,X,y):
        """Return the fraction of samples in ``X`` whose prediction equals ``y``.

        The comparison is positional, so ``y`` may be a list, an array or a
        Series with any index.

        Raises
        ------
        ValueError
            If ``y`` is empty or its length differs from the number of
            predictions.
        """
        predicted = np.asarray(self.predict(X), dtype=object)
        expected = np.asarray(y, dtype=object)
        if predicted.shape != expected.shape:
            raise ValueError('y must have exactly one label per sample in X')
        if expected.size == 0:
            raise ValueError('cannot score an empty set of samples')
        return float(np.mean(predicted == expected))

    def predict(self,X):
        """Predict one label per row of ``X`` using the voting rule.

        Returns
        -------
        numpy.ndarray
            Object array with one resolved label per sample.
        """
        # Use the instance's own targets/encoder rather than same-named
        # notebook globals, so the object works wherever it is created.
        df_predicoes = pd.DataFrame(
            {t: self.models[t].predict(X) for t in self.targets},
            columns=list(self.targets),
        )
        df_predicoes['geral'] = self.encoder.inverse_transform(df_predicoes['geral'])
        resolved = [self.check_line(row) for _, row in df_predicoes.iterrows()]
        return np.array(resolved, dtype=object)
#################################################################

#################################################################
#Preparing the data with an external script
#################################################################
# Execute the external data-preparation script inside this notebook's
# namespace (presumably it produces the test data read by the cells below;
# confirm against the script itself).
# The source is read inside a `with` block so the file handle is closed, and
# it is compiled with its path so tracebacks point at the script.
_dataprep_path = '../scripts/dataprep_teste.py'
with open(_dataprep_path) as _dataprep_file:
    _dataprep_source = _dataprep_file.read()
exec(compile(_dataprep_source, _dataprep_path, 'exec'))

In [37]:
#################################################################
#Predicting all files
#################################################################
# Keep the untouched table (including 'filename') for the assembling step.
df_raw = pd.read_csv('../data/data_teste.csv')

# Features only: drop the file name column, then apply the fitted scaler.
df_teste = scaler.transform(df_raw.drop('filename', axis=1))

# Combine the general model and the per-label models through the voter.
targets = ['geral', 'a', 'b', 'c', 'd', 'h', 'm', 'n', 'x', '6', '7']
vt = Voter(models, targets, encoder)
predictions = vt.predict(df_teste)

#predictions = svmrbf.predict(df_teste)
#predictions = encoder.inverse_transform(predictions)
#print(df_raw['filename'])

#################################################################
#Assembling
#################################################################

In [40]:
# One row per predicted segment: the segment's file name and its label.
df_respostas = pd.DataFrame({'filename_extended': df_raw['filename']})
df_respostas['predictions'] = predictions

# Segment names are split on '_': the first part identifies the source file,
# the third part (minus the extension) is the segment's position.
df_respostas['filename'] = df_respostas['filename_extended'].apply(lambda name: name.split('_')[0])
df_respostas['file_pos'] = df_respostas['filename_extended'].apply(lambda name: name.split('_')[2])
# regex=False: remove the literal '.wav'; as a regex the '.' would match any
# character (e.g. 'xwav').
df_respostas['file_pos'] = df_respostas['file_pos'].str.replace('.wav', '', regex=False)

# Concatenate the per-segment labels of each file into a single string.
# NOTE(review): labels are joined in row order; 'file_pos' is computed but is
# not used to sort the segments - confirm the rows already arrive ordered.
df_final = (
    df_respostas
    .groupby('filename')['predictions']
    .agg(''.join)
    .reset_index()
)

print(df_final)
df_final.to_csv('../data/resultado_predicoes.csv',index=False)
print('Resultados salvos no arquivo "resultado_predicoes.csv"')

    filename predictions
0       66a6        66a6
1       66ax        66ax
2       66hh        66hh
3       66hn        66hn
4       6767        6767
..       ...         ...
262     xhcc        xhc6
263     xmbc        xnbc
264     xmm7        xnn7
265     xnc7        xmc7
266     xxxh        xxxh

[267 rows x 2 columns]
Resultados salvos no arquivo "resultado_predicoes.csv"


In [109]:
df_valid = pd.read_csv('../data/data_validacao.csv')

df_valid.drop('filename',axis=1,inplace=True)

X_valid = scaler.transform(df_valid.iloc[:,:-1])

results = pd.DataFrame()
predictions = vt.predict(X_valid)
results['real'] = df_valid['label'].copy()
results['predict'] = predictions
results['result'] = (predictions==df_valid['label'])
mistakes=results.loc[results['result']==False]

mistakes['par']=mistakes['predict']+mistakes['real']
mistakes['par']=mistakes['par'].apply(lambda x: ''.join(sorted(x)))

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(15,7))
plt.plot(mistakes['par'].value_counts().cumsum()/mistakes['par'].value_counts().sum())

In [74]:
# Training table with the file name column removed.
df_treino = pd.read_csv('../data/data.csv').drop('filename', axis=1)

In [75]:
# Apply the already-fitted scaler to every column except the last one
# (the last column is left out of the feature matrix).
X_treino = scaler.transform(df_treino.iloc[:,:-1])

In [78]:
# Scaled features as a DataFrame (columns become 0..n-1), with the original
# labels re-attached as the last column.
df_treino2 = pd.DataFrame(X_treino.copy()).assign(label=df_treino['label'].copy())

In [94]:
# b vs d experiment.
# NOTE(review): despite its name, SVCClassifier is a RandomForestClassifier
# with default settings, and none of the cells visible here use it - confirm
# whether it is still needed.
from sklearn.ensemble import RandomForestClassifier
SVCClassifier = RandomForestClassifier()

In [95]:
# Keep only the training rows labelled 'b' or 'd'.
bd_train_rows = df_treino2.loc[df_treino2['label'].isin(['b', 'd'])]

# Features are every column but the last; the target is 1 for 'b', 0 for 'd'.
X_treino2 = bd_train_rows.iloc[:, :-1]
y_treino2 = bd_train_rows.iloc[:, -1].map(lambda label: int(label == 'b'))

In [103]:
#Param grid
Cs = np.arange(0.5,100,0.5)
gammas = [0.001, 0.01, 0.1, 1]

param_grid = {'C':Cs,'gamma':gammas}

# Hold-out b/d subset used for the final score. It is built here so this cell
# does not depend on variables that are only defined in a cell further down
# (which raises NameError on Restart & Run All).
bd_valid_rows = df_valid.loc[df_valid['label'].isin(['b', 'd'])]
X_valid2 = scaler.transform(bd_valid_rows.iloc[:,:-1])
y_valid = bd_valid_rows.iloc[:,-1].apply(lambda x: 1 if x=='b' else 0)

#################################################################
#Grid Search
print('Tuning RBF Kernel parameters')
grid_search = GridSearchCV(svm.SVC(kernel='rbf'),param_grid)
grid_search.fit(X_treino2,y_treino2)
print('Search grid for RBF returned parameters:')
print(grid_search.best_params_)
#Get model
SVCrbf = grid_search.best_estimator_
print('Params:')
print(SVCrbf.get_params())
print('Accuracy score of RBF Kernel:',SVCrbf.score(X_valid2,y_valid))
# NOTE(review): this file name differs only in letter case from
# '../models/SVCrbf.pkl', which the model-loading cell reads as the general
# model. On a case-insensitive filesystem this dump would replace that model
# with the b-vs-d classifier - confirm the intended file name.
joblib.dump(SVCrbf, '../models/svcrbf.pkl')

Tuning RBF Kernel parameters
Search grid for RBF returned parameters:
{'C': 38.0, 'gamma': 0.001}
Params:
{'C': 38.0, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 0.001, 'kernel': 'rbf', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}
Accuracy score of RBF Kernel: 0.5416666666666666


['../models/svcrbf.pkl']

In [97]:
# Keep only the validation rows labelled 'b' or 'd'.
bd_valid_subset = df_valid.loc[df_valid['label'].isin(['b', 'd'])]

# Scaled features (every column but the last) and a 1-for-'b' / 0-for-'d' target.
X_valid2 = scaler.transform(bd_valid_subset.iloc[:, :-1])
y_valid = bd_valid_subset.iloc[:, -1].map(lambda label: int(label == 'b'))