In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Data is from a dataset called "CatMeows", here is a link: https://zenodo.org/records/4008297

# I used a script to gather data about the F0 and convert
# that to a CSV, here is a link to the GitHub I took it from: https://github.com/YoeriNijs/PraatPitch
# I changed that script slightly, the one I used is at the bottom of this notebook
df = pd.read_csv("results.csv")
print(df.head(10))

                  Filename      Min      Max     Mean   Median     Std   Dur
0  B_ANI01_MC_FN_SIM01_101   390.20  2167.83   918.45   761.19  484.03  2.28
1  B_ANI01_MC_FN_SIM01_102   469.00  1972.95  1147.84   694.61  660.56  1.42
2  B_ANI01_MC_FN_SIM01_103   690.26  2036.39   935.35   747.81  447.14  1.80
3  B_ANI01_MC_FN_SIM01_301   610.33  2045.01  1212.04   750.90  592.02  1.74
4  B_ANI01_MC_FN_SIM01_302   520.31   649.53   583.96   601.32   41.37  1.27
5  B_ANI01_MC_FN_SIM01_303   617.02  2017.49   725.67   696.72  193.95  1.92
6  B_BAC01_MC_MN_SIM01_101   601.82  1426.68   790.44   739.94  229.81  1.16
7  B_BAC01_MC_MN_SIM01_102   342.36  1898.81   944.36   351.84  770.40  1.23
8  B_BAC01_MC_MN_SIM01_103   289.96   292.72   291.48   291.77    1.12  1.22
9  B_BAC01_MC_MN_SIM01_202  1785.14  1888.33  1836.73  1832.66   31.39  1.25


In [5]:
# The breed is the third part of the filename so we add a new column that extracts it
df['Breed'] = df['Filename'].apply(lambda x : x.split('_')[2])
print(df['Breed'].value_counts())

# We need it as a boolean for the classifier
df['Breed'] = df['Breed'].apply(lambda x : 0 if x == 'EU' else 1)
print(df['Breed'].value_counts())

EU    252
MC    188
Name: Breed, dtype: int64
0    252
1    188
Name: Breed, dtype: int64


In [6]:
forest = RandomForestClassifier(random_state=0)

model_df = df.drop(columns=['Filename'])
train, test = train_test_split(model_df, test_size=0.1, random_state=0)
train_y = train['Breed']
train_X = train.drop(columns='Breed')

test_y = test['Breed']
test_X = test.drop(columns='Breed')

forest.fit(train_X, train_y)
print(forest.score(test_X, test_y) * 100)

77.27272727272727


In [7]:
max_name = ""
max_importance = 0.0
for (col_name, importance) in zip(train_X.columns, forest.feature_importances_):
  print("Name: " + col_name + ", importance: ", importance)
  if importance > max_importance:
    max_importance = importance
    max_name = col_name

print("Most important column: " + max_name)

Name: Min, importance:  0.18387563702788284
Name: Max, importance:  0.14807906041129543
Name: Mean, importance:  0.1389484286183741
Name: Median, importance:  0.16204789790186166
Name: Std, importance:  0.16305076947764977
Name: Dur, importance:  0.20399820656293624
Most important column: Dur


([link to GitHub of script](https://github.com/YoeriNijs/PraatPitch))
I slightly changed the Praat script I got from GitHub so here it is in case you try to replicate my data exactly. Don't forget to add the backslash at the end of the "sound directory" when you run it:
```
# Praat script to compute mean pitch of audio files in a specific directory
# Outcome is stored in a csv file
#
# Yoeri Nijs
# Master Thesis Human Aspects of Information Technology
# Tilburg University
#
# Based on the work of Jonas Lindh, 2005-2006
# http://www.ling.gu.se/~jonas/sounds/
# GNU General Public License

# Script form
form Compute pitch of audio files
	comment Directory of sound files
	text sound_directory Full path of audio directory
	sentence Sound_extension .mp3
	comment Full path of text file with results
	text resultfile Full path of results directory\results.csv
	comment Pitch analysis parameters
	positive Time_step 0.01
	positive Minimum_pitch_(Hz) 200
	positive Maximum_pitch_(Hz) 600
endform

# Create listing of all sound files
Create Strings as file list... list 'sound_directory$'*'sound_extension$'
numberFiles = Get number of strings

# Check if the result file already exists
if fileReadable (resultfile$)
	pause The resultfile 'resultfile$' already exists! Overwrite?
	filedelete 'resultfile$'
endif


# Create row with column titles to the result file
titleline$ = "Filename,Min,Max,Mean,Median,Std,Dur'newline$'"
fileappend "'resultfile$'" 'titleline$'

# Compute all the sound files
for ifile to numberFiles

	# Open audio file
	filename$ = Get string... ifile

	# Read file and retrieve details
	Read from file... 'sound_directory$''filename$'
	soundname$ = selected$ ("Sound", 1)
	dur = Get total duration
	To Pitch... time_step minimum_pitch maximum_pitch
	max = Get maximum... 0 0 Hertz None
	min = Get minimum... 0 0 Hertz None
	mean = Get mean... 0 0 Hertz
	median = Get quantile... 0 0 0.5 Hertz
	stdev = Get standard deviation... 0 0 Hertz
	altqb = Get quantile... 0 0 0.0764 Hertz
	baseline = mean - (1.43 * stdev)
	To PointProcess
	points = Get number of points
	Remove

	# Save result to csv file
	resultline$ = "'soundname$','min:2','max:2','mean:2','median:2','stdev:2','dur:2''newline$'"
	fileappend "'resultfile$'" 'resultline$'

	# Remove temp objects from object's list
	select Sound 'soundname$'
	plus Pitch 'soundname$'
	Remove
	select Strings list

	# Next audio file
endfor
Remove
```

