In [76]:
import numpy as np
import librosa
import matplotlib.pyplot as plt 
import spectrum
import os
import pandas as pd
from sklearn.mixture import GaussianMixture
import joblib
import pickle

In [2]:
frame_size = 32/1000 # seconds
hop_size = 10/1000 # seconds
pre_emphasis_value = 0.97
lpc_dimention = 39

# preprocess functions

In [3]:
def read_voice(path,rate=None):
    return librosa.load(path,sr=rate)

def pre_emphasis(signal,pre_emphasis_value):
    return np.append(signal[0], signal[1:] - pre_emphasis_value * signal[:-1])

def framming(signal,frame_length,hop_size,sample_rate):
    frame_step = hop_size * sample_rate
    signal_lenght = len(signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    num_frames = int(np.ceil(float(np.abs(signal_lenght - frame_length)) / frame_step))
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_lenght))
    pad_signal = np.append(signal, z) 
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    return frames

def windowing(frames , frame_length , window_type=np.hamming):
    if window_type is not None:
        return frames*window_type(frame_length)
    else:
        return frames
    
def preprocess(signal,sample_rate,frame_size,hop_size,pre_emphasis_value,window_type=np.hamming , use_emphasis=True):
    frame_length = frame_size*sample_rate
    if use_emphasis:
        signal = pre_emphasis(signal,pre_emphasis_value)
    framing_signals = framming(signal,frame_length,hop_size,sample_rate)
    return windowing(framing_signals,frame_length,window_type)



# VAD function (part 2)

In [4]:
def log_short_time_energy(frame):
    return np.log10(np.sum(frame**2)/len(frame))

def VAD(frames):
    log_short_time_energies = np.apply_along_axis(log_short_time_energy,axis=1,arr=frames)
    return frames[np.where(log_short_time_energies> -6)]


# LPC and MFCC features functions (part 3)

In [5]:
def get_lpc(frame):
    return spectrum.lpc(frame, lpc_dimention)[0]

def get_mfcc(frame):
    mfcc_0 = librosa.feature.mfcc(frame, sr = rate, n_mfcc = 13).reshape(13)
    mfcc_1 = librosa.feature.delta(mfcc_0, order = 1)
    mfcc_2 = librosa.feature.delta(mfcc_0, order = 2)
    return np.concatenate((mfcc_0,mfcc_1,mfcc_2),axis=0)


def lpc_feature_of_frames(frames):
    return np.apply_along_axis(get_lpc,axis=1,arr=frames)

def mfcc_feature_of_frames(frames):
    return np.apply_along_axis(get_mfcc,axis=1,arr=frames) 


# read data and make dataframe

In [6]:
def get_pathes(path):  
    pathes = []  
    for root, directories, files in os.walk(path):
        for filename in files:
            pathes.append(os.path.join(root, filename))  
    return pathes 

In [7]:
train_dict = {
    "speaker":[],
    "data":[],
    "lpc":[],
    "mfcc":[]
}

test_dict={
    "speaker":[],
    "data":[],
    "lpc":[],
    "mfcc":[]
}

for path in get_pathes("Dataset"):
    data_type,speaker = path.split("\\")[1:3]
    data,rate = read_voice(path)
    frames = preprocess(data,rate,frame_size,hop_size,pre_emphasis_value)
    vad_frames = VAD(frames)
    lpc_features = lpc_feature_of_frames(vad_frames)
    mfcc_features = mfcc_feature_of_frames(vad_frames)
    
    if data_type.strip()=="test":
        current_dict = test_dict        
    else:
        current_dict = train_dict
        
    current_dict["speaker"].append(speaker.strip())
    current_dict["data"].append(data)
    current_dict["lpc"].append(lpc_features)
    current_dict["mfcc"].append(mfcc_features)
    


    



In [8]:
train_df=pd.DataFrame(train_dict)
train_df

Unnamed: 0,speaker,data,lpc,mfcc
0,F(1),"[6.1035156e-05, 9.1552734e-05, 3.0517578e-05, ...","[[1.4563812458243126, 0.9734564058865107, 0.69...","[[-512.902942539292, -87.21679914300745, 8.808..."
1,F(1),"[6.1035156e-05, 0.00012207031, 9.1552734e-05, ...","[[1.9527878888746457, 2.3622633769180195, 2.12...","[[-588.3390532024175, -141.80973402670332, 10...."
2,F(1),"[0.0, -3.0517578e-05, 0.00021362305, 9.1552734...","[[1.3863004580810285, 0.9387189543468455, 0.41...","[[-536.0236193710593, -84.11102957795637, 14.7..."
3,F(1),"[0.0, 9.1552734e-05, 3.0517578e-05, 6.1035156e...","[[1.5528975801683351, 1.0661701144546913, 0.56...","[[-527.2012083712265, -101.0299424222678, 10.6..."
4,F(1),"[3.0517578e-05, 6.1035156e-05, 9.1552734e-05, ...","[[1.5127889366789529, 1.7014116413871434, 1.48...","[[-562.9820656752108, -124.09638714589191, -22..."
...,...,...,...,...
135,M(9),"[0.00012207031, 0.00015258789, 0.00018310547, ...","[[1.0208700098013677, 0.5222866061128987, 0.64...","[[-508.3579199041774, -69.69675374277344, -8.9..."
136,M(9),"[3.0517578e-05, 6.1035156e-05, 0.0, 3.0517578e...","[[-0.37339625780308533, 0.28436555985673795, 0...","[[-446.086760858977, 1.6695129308415528, -27.0..."
137,M(9),"[-0.0002746582, 0.0, 0.00024414062, 0.00030517...","[[0.4627896588547296, 0.06577095523537509, 0.1...","[[-353.9932404123551, -32.971926438511275, -4...."
138,M(9),"[0.0002746582, 3.0517578e-05, -0.00030517578, ...","[[-0.1651857554359213, 0.489007158464017, 0.36...","[[-369.6840809416954, -23.08354287016411, -62...."


In [9]:
test_df= pd.DataFrame(test_dict)
test_df

Unnamed: 0,speaker,data,lpc,mfcc
0,F(1),"[3.0517578e-05, 0.00015258789, 3.0517578e-05, ...","[[1.0385847993279997, 0.9015660629198482, 0.98...","[[-530.3385136994322, -92.79224347386912, -41...."
1,F(1),"[3.0517578e-05, 9.1552734e-05, 3.0517578e-05, ...","[[0.9331270221486008, 0.7723660735682877, 0.81...","[[-490.4131468386102, -76.97502548944233, -23...."
2,F(1),"[6.1035156e-05, 6.1035156e-05, 3.0517578e-05, ...","[[1.0912180978019623, 0.8953002077887737, 0.42...","[[-491.59157175866096, -80.57506971512494, 2.7..."
3,F(10),"[-0.00021362305, 0.00012207031, 0.0, 9.1552734...","[[0.08745493438970442, 0.7138918077358266, 0.1...","[[-458.8177575229361, -42.404706157823526, -46..."
4,F(10),"[-0.00021362305, -0.00015258789, -0.0001525878...","[[-1.4182757734046054, 0.0322659635792153, 0.9...","[[-571.4247579946723, 105.09460132577883, -37...."
5,F(10),"[0.0002746582, 6.1035156e-05, 3.0517578e-05, 6...","[[-1.1267224871833483, 0.9149485885327264, 0.0...","[[-333.1661285420802, 40.090356109885846, -99...."
6,F(2),"[0.00012207031, 3.0517578e-05, -0.00015258789,...","[[0.712516810690293, 0.3407226209705845, -0.39...","[[-428.64906688570363, -50.04104583712864, 23...."
7,F(2),"[3.0517578e-05, 9.1552734e-05, 0.00015258789, ...","[[-0.7064209001706356, 0.3074517223669166, 0.4...","[[-369.46508947536535, 30.477844632237456, -76..."
8,F(2),"[9.1552734e-05, 6.1035156e-05, 0.0, 3.0517578e...","[[-0.31898008994374993, 0.08422592151683711, -...","[[-495.92921482698506, 3.932843398296052, -4.0..."
9,F(3),"[3.0517578e-05, 6.1035156e-05, -6.1035156e-05,...","[[-0.28885154306594885, 0.15398866492722693, 0...","[[-377.5515104957849, -11.89491087295962, -74...."


# GMM (part 4)

In [28]:
def get_LPC_GMM_model_of_speaker(df):
    lpc_list = [item for item in  df["lpc"]]
    lpc_arr = np.concatenate(lpc_list,axis=0)
    lpc_gmm_model = GaussianMixture(n_components=32,covariance_type='diag',max_iter=100)
    lpc_gmm_model.fit(lpc_arr)
    return lpc_gmm_model

def get_MFCC_GMM_model_of_speaker(df):
    mfcc_list = [item for item in  df["mfcc"]]
    mfcc_arr = np.concatenate(mfcc_list,axis=0)
    lpc_gmm_model = GaussianMixture(n_components=32,covariance_type='diag',max_iter=100)
    lpc_gmm_model.fit(mfcc_arr)
    return lpc_gmm_model


def get_GMMs():
    lpc_GMMs = []
    mfcc_GMMs=[]
    for gp in train_df.groupby("speaker"):
        lpc_GMMs.append(get_LPC_GMM_model_of_speaker(gp[1]))
        mfcc_GMMs.append(get_MFCC_GMM_model_of_speaker(gp[1]))
    return lpc_GMMs,mfcc_GMMs


In [29]:
lpc_GMMs , mfcc_GMMs = get_GMMs()

# predict speakers (part 5 and 6)

In [30]:
def predict(row,lpc_GMMs,mfcc_GMMs):
    lpc_gmm_scores = np.array([gmm.score(row["lpc"]) for gmm in lpc_GMMs])
    mfcc_gmm_scores = np.array([gmm.score(row["mfcc"]) for gmm in mfcc_GMMs])
    
    return list(reversed(lpc_gmm_scores.argsort()[-3:])) ,list(reversed(mfcc_gmm_scores.argsort()[-3:]))

In [31]:
def predict_top_3(row):
    lpc_labels , mfcc_labels = predict(row,lpc_GMMs,mfcc_GMMs)
    return lpc_labels,mfcc_labels

    

In [32]:
def accuracy(pred_df , top_count=1 , feature_name="lpc"):
    bool_df = pred_df.apply(lambda row:row["real"] in row[f"{feature_name}_pred"][:top_count],axis=1)
    return bool_df[bool_df].count()/bool_df.count()
    

In [33]:
preds = test_df.apply(predict_top_3,axis=1)

In [34]:
pred_df = pd.DataFrame(
{
    "real":test_df["speaker"].map({item:i for i,item in enumerate(train_df["speaker"].unique())}),
    "lpc_pred":[item[0] for item in preds],
    "mfcc_pred":[item[1] for item in preds]
}
)


In [35]:
pred_df

Unnamed: 0,real,lpc_pred,mfcc_pred
0,0,"[0, 1, 9]","[0, 1, 9]"
1,0,"[0, 3, 1]","[0, 1, 3]"
2,0,"[0, 1, 6]","[0, 1, 3]"
3,1,"[1, 0, 6]","[1, 0, 9]"
4,1,"[1, 0, 6]","[1, 0, 6]"
5,1,"[1, 0, 6]","[1, 0, 9]"
6,2,"[9, 2, 6]","[2, 6, 9]"
7,2,"[0, 9, 2]","[6, 9, 7]"
8,2,"[2, 1, 0]","[2, 8, 5]"
9,3,"[3, 12, 0]","[3, 0, 16]"


In [36]:
for feature_name in ("lpc","mfcc"):
    for top_count in (1,3):
        print(f"accuracy of top_{top_count} for GMM with {feature_name} featuers= {accuracy(pred_df,top_count,feature_name)}")


accuracy of top_1 for GMM with lpc featuers= 0.85
accuracy of top_3 for GMM with lpc featuers= 0.9166666666666666
accuracy of top_1 for GMM with mfcc featuers= 0.9333333333333333
accuracy of top_3 for GMM with mfcc featuers= 0.9666666666666667


# save matrixes (part 7)

In [71]:
def save_matrxies(df,feature_name,file_name):
    feature_list = [item for item in  df[feature_name]]
    arr = np.concatenate(feature_list,axis=0)
    joblib.dump(arr,file_name)
    

### LPC train Matrix

In [72]:
file_name = "train_lpc_matrixes.joblib"
save_matrxies(train_df,"lpc",file_name)
load_arr = joblib.load(file_name)
load_arr , load_arr.shape

(array([[ 1.45638125,  0.97345641,  0.6965763 , ...,  0.08847128,
         -0.00296551, -0.06194059],
        [ 1.67237081,  1.36064226,  0.84380919, ..., -0.24112262,
         -0.17305928, -0.08647555],
        [ 1.38485229,  1.31950735,  1.26179414, ..., -0.16230371,
         -0.14958606, -0.0485752 ],
        ...,
        [ 1.2980477 ,  1.08224382,  1.17869319, ...,  0.4942878 ,
          0.37372362,  0.09982694],
        [ 1.22934132,  1.01139746,  1.11472236, ...,  0.23906175,
          0.22039954,  0.0546037 ],
        [ 0.86129695,  0.60052327,  0.82701984, ..., -0.00847739,
          0.02648754,  0.02309436]]),
 (28935, 39))

### LPC test Matrix

In [73]:
file_name = "test_lpc_matrixes.joblib"
save_matrxies(test_df,"lpc",file_name)
load_arr = joblib.load(file_name)
load_arr , load_arr.shape

(array([[ 1.03858480e+00,  9.01566063e-01,  9.84579594e-01, ...,
          8.76845488e-02, -8.28831495e-03, -3.78775893e-02],
        [ 1.09756988e+00,  8.52303312e-01,  9.46819374e-01, ...,
          1.28031639e-01,  4.54901831e-02, -1.27056434e-02],
        [ 1.13704107e+00,  7.89506692e-01,  9.27490639e-01, ...,
         -5.98275486e-02, -1.01569181e-01, -7.96305138e-02],
        ...,
        [ 1.28727915e+00,  9.40020417e-01,  9.62774458e-01, ...,
          1.43989645e-02, -4.50037066e-02,  1.01148388e-03],
        [ 1.40427245e+00,  1.06557558e+00,  1.04520841e+00, ...,
         -1.02041323e-01, -1.32481044e-01, -2.22421820e-02],
        [ 1.19906378e+00,  6.46148649e-01,  6.93280871e-01, ...,
         -4.31041293e-02,  7.32788887e-02,  6.64696258e-02]]),
 (10419, 39))

### mfcc train Matrix

In [74]:
file_name = "train_mfcc_matrixes.joblib"
save_matrxies(train_df,"mfcc",file_name)
load_arr = joblib.load(file_name)
load_arr , load_arr.shape

(array([[-5.12902943e+02, -8.72167991e+01,  8.80804212e+00, ...,
         -2.00761337e+00, -2.00761337e+00, -2.00761337e+00],
        [-5.13906789e+02, -1.05959417e+02,  1.99925822e+01, ...,
         -1.04211833e+00, -1.04211833e+00, -1.04211833e+00],
        [-5.04415708e+02, -1.07171103e+02, -7.25560961e+00, ...,
         -2.50033375e+00, -2.50033375e+00, -2.50033375e+00],
        ...,
        [-4.50604007e+02, -1.07643706e+02, -3.12042886e+01, ...,
         -3.10311537e-01, -3.10311537e-01, -3.10311537e-01],
        [-4.91794392e+02, -9.84509029e+01, -2.21476009e+01, ...,
         -2.26401296e-01, -2.26401296e-01, -2.26401296e-01],
        [-5.31343255e+02, -7.57084510e+01, -3.44707698e+01, ...,
         -7.10078208e-02, -7.10078208e-02, -7.10078208e-02]]),
 (28935, 39))

### mfcc test Matrix

In [75]:
file_name = "test_mfcc_matrixes.joblib"
save_matrxies(test_df,"mfcc",file_name)
load_arr = joblib.load(file_name)
load_arr , load_arr.shape

(array([[-5.30338514e+02, -9.27922435e+01, -4.18397669e+01, ...,
         -2.96875484e+00, -2.96875484e+00, -2.96875484e+00],
        [-5.10103549e+02, -8.69210377e+01, -3.11612560e+01, ...,
         -2.98077819e+00, -2.98077819e+00, -2.98077819e+00],
        [-5.37543552e+02, -8.86438253e+01, -2.14471278e+01, ...,
         -2.63150226e+00, -2.63150226e+00, -2.63150226e+00],
        ...,
        [-5.09529717e+02, -9.55593535e+01, -1.30083329e+01, ...,
          7.14435452e-01,  7.14435452e-01,  7.14435452e-01],
        [-5.36736661e+02, -1.02597825e+02, -1.46770484e+01, ...,
         -3.15541257e-01, -3.15541257e-01, -3.15541257e-01],
        [-5.46799977e+02, -8.50984942e+01, -1.45326517e+01, ...,
          2.88243688e-01,  2.88243688e-01,  2.88243688e-01]]),
 (10419, 39))

# save models (part 8)

In [78]:
for i,model in enumerate(lpc_GMMs):
    with open(f"lpc_model_{i}.pkl" , "wb") as f:
        pickle.dump(model,f)

In [82]:
with open("lpc_model_0.pkl","rb") as f:
    model = pickle.load(f)
    
model

GaussianMixture(covariance_type='diag', n_components=32)

In [83]:
for i,model in enumerate(mfcc_GMMs):
    with open(f"mfcc_model_{i}.pkl" , "wb") as f:
        pickle.dump(model,f)

In [84]:
with open("mfcc_model_0.pkl","rb") as f:
    model = pickle.load(f)
    
model

GaussianMixture(covariance_type='diag', n_components=32)