Importing the GPU libraries

In [99]:
#GPU libraries
import cudf as pd
import cupy as cp
import cuml as np
from cuml import LinearRegression
from cuml.linear_model import LinearRegression
from cuml import Ridge
from cuml.linear_model import Ridge
from cuml.model_selection import train_test_split
from cuml.linear_model import Lasso
from cuml.ensemble import RandomForestRegressor

Evaluation Metrics

In [100]:
#Evaluation Metrics
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, Normalizer, StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, clone
import sklearn_pandas
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score,train_test_split
from scipy import stats
# from sklearn.linear_model import LinearRegression
from scipy.special import boxcox1p

Environment Specifications

In [101]:
# Report the interpreter and library versions this notebook ran against.
import sys
import scipy

print('Environment specification:\n')
print('python', '.'.join(str(part) for part in sys.version_info[:3]))

# NOTE: in this notebook `np` and `pd` are the aliases given to cuml and cudf
# by the import cell, so those are the packages reported here.
for library in (np, scipy, sns, sklearn, pd):
    print(library.__name__, library.__version__)

In [102]:
# `pd` is the alias for cudf in this notebook (`import cudf as pd`), so the
# CSV is loaded into a cuDF DataFrame rather than a pandas one.
data = pd.read_csv('../input/voicegender/voice.csv')

In [103]:
# Preview the first rows instead of rendering the whole frame.
data.head()

* meanfreq: mean frequency of the voice audio of the person (in kHz)
* sd: standard deviation of the frequency of the voice audio
* median: median frequency of the voice audio (in kHz)
* Q25: first quartile (in kHz)
* Q75: third quartile (in kHz)
* IQR: interquartile range (in kHz)
* skew: Skewness refers to a distortion or asymmetry that deviates from the symmetrical bell curve, or normal distribution
* kurt: Kurtosis is a statistical measure that defines how heavily the tails of a distribution differ from the tails of a normal distribution.
* sp.ent: spectral entropy
* sfm: spectral flatness
* mode: mode frequency
* centroid: frequency centroid (see specprop)
* meanfun: mean fundamental frequency measured across acoustic signal
* minfun: minimum fundamental frequency measured across acoustic signal
* maxfun: maximum fundamental frequency measured across acoustic signal
* meandom: mean of dominant frequency measured across acoustic signal
* mindom: minimum of dominant frequency measured across acoustic signal
* maxdom: maximum of dominant frequency measured across acoustic signal
* dfrange: range of dominant frequency measured across acoustic signal
* modindx: modulation index

Checking for NULL values

In [104]:
# Count missing values per column (the author's run showed no nulls).
data.isna().sum()

Data Frame Shape and Visualization

In [105]:
# Summary of the frame's columns and their dtypes.
data.info()

In [106]:
# (number of rows, number of columns)
data.shape

In [107]:
# One label per row, so the row count is the number of labelled samples.
total_rows = data.shape[0]
print(f"Total number of labels: {total_rows}")

In [108]:
# Row count for each class in the 'label' column.
n_male = data[data['label'] == 'male'].shape[0]
n_female = data[data['label'] == 'female'].shape[0]
print(f"Number of male: {n_male}")
print(f"Number of female: {n_female}")

Correlation among Features

In [109]:
import seaborn
import matplotlib.pyplot as plt
import pandas

# `pd` is bound to cudf in this notebook, so plain pandas is imported under its
# full name and the CSV is read a second time for plotting.
df_pandas = pandas.read_csv('../input/voicegender/voice.csv')

# 'label' still holds the 'male'/'female' strings here. Restrict the
# correlation to numeric columns so corr() is never asked to convert them
# (recent pandas versions raise instead of silently skipping such columns).
numeric_features = df_pandas.select_dtypes(include='number')

plt.figure(figsize=(21, 21))
# `linewidths` is seaborn's documented keyword for the cell separator width.
seaborn.heatmap(numeric_features.corr(), annot=True, cmap='viridis', linewidths=0.5)

### Kernel Density Estimate Plot 
#### It is analogous to a histogram, but represents the data with a continuous probability density curve.

In [110]:
from matplotlib import pyplot as plt

# Convert the string 'label' column to integer codes. The plots below rely on
# 'female' being encoded as 0 and 'male' as 1 (LabelEncoder orders classes
# alphabetically).
label_encode = LabelEncoder()
df_pandas['label'] = label_encode.fit_transform(df_pandas['label'])
print(df_pandas.head())

# One density panel per feature: the first 20 columns, laid out on a 4x5 grid.
feature_columns = df_pandas.columns[:20]
fig, axes = plt.subplots(4, 5, figsize=(30, 30))
for ax, column in zip(axes.ravel(), feature_columns):
    ax.set_title(column)
    sns.kdeplot(df_pandas.loc[df_pandas['label'] == 0, column], color='pink', label='female', ax=ax)
    sns.kdeplot(df_pandas.loc[df_pandas['label'] == 1, column], color='blue', label='male', ax=ax)
    # Draw the legend explicitly; otherwise the `label=` values above are
    # not guaranteed to appear on the panel.
    ax.legend()

https://github.com/amueller/mglearn

In [111]:
!pip install mglearn

In [112]:
import mglearn

In [131]:
# Per-feature histograms, split by gender, drawn from a pandas DataFrame.
#
# `data` is the cuDF frame loaded from the CSV and still carries the original
# 'male'/'female' strings in 'label', so it is copied to pandas here. The
# previous version read from an undefined `df` and built the female mask from
# `df_pandas`, whose 'label' column has already been integer-encoded.
df = data.to_pandas()
print(df.head(10))

male = df.loc[df['label'] == 'male']
female = df.loc[df['label'] == 'female']

fig, axes = plt.subplots(10, 2, figsize=(10, 20))
ax = axes.ravel()
# The first 20 columns are plotted, one panel each.
for i in range(20):
    ax[i].hist(male.iloc[:, i], bins=20, color=mglearn.cm3(0), alpha=.5)
    ax[i].hist(female.iloc[:, i], bins=20, color=mglearn.cm3(2), alpha=.5)
    ax[i].set_title(male.columns[i])
    ax[i].set_yticks(())
    ax[i].set_xlabel("Feature magnitude")
    ax[i].set_ylabel("Frequency")
    ax[i].legend(["male", "female"], loc="best")

fig.tight_layout()

#### From the plots above, some features can be dropped, either because they are highly correlated with other features or because their distributions differ little between the male and female samples. These features are sfm, kurt, meandom, meanfreq, dfrange, and modindx.

In [114]:
# pandas copy of the data without the six features judged redundant above.
ND = df_pandas.drop(
    columns=['sfm', 'kurt', 'meandom', 'meanfreq', 'dfrange', 'modindx']
)
ND

In [115]:
# Same column removal on the cuDF frame; `new_data` feeds the models below.
new_data = data.drop(
    columns=['sfm', 'kurt', 'meandom', 'meanfreq', 'dfrange', 'modindx']
)
new_data

In [116]:
# Correlation heatmap of the reduced pandas frame.
plt.figure(figsize=(16, 16))
# `linewidths` is seaborn's documented keyword for the cell separator width.
seaborn.heatmap(ND.corr(), annot=True, cmap='viridis', linewidths=0.5)

#### After dropping these features, relatively few of the remaining ones are highly correlated

### Split the data into train and test sets using cuml.model_selection

In [117]:
# Re-import explicitly: the sklearn import cell also binds `train_test_split`,
# which shadows the cuml version imported at the top of the notebook.
from cuml.model_selection import train_test_split

# Encode the target only while it still holds the original strings. Mapping an
# already-encoded column would find no matching keys and wipe every label, so
# this guard keeps the cell safe to re-run.
if new_data['label'].dtype == 'object':
    new_data['label'] = new_data['label'].map({'male': 1, 'female': 0})
print(new_data)

# Every column except the last is a feature; the last column is the target.
X_train, X_test, y_train, y_test = train_test_split(
    new_data.iloc[:, :-1].values,
    new_data.iloc[:, -1].values,
    test_size=0.2,
    random_state=42,
)


### Applying different models to see which one gives the highest accuracy

#### Random Forest Classifier

In [118]:
from cuml.ensemble import RandomForestClassifier as cuRF


In [119]:
# NOTE(review): these three values are not referenced by any visible cell.
n_samples, n_features, n_classes = 1000, 10, 2

# Random forest size and depth.
n_estimators, max_depth = 25, 10

model = cuRF(n_estimators=n_estimators, max_depth=max_depth, random_state=0)
trained_RF = model.fit(X_train, y_train)


In [120]:
# Predict with `trained_RF` rather than `model`: the xgboost cell further down
# rebinds `model`, so this cell would otherwise break when re-run afterwards.
predictions = trained_RF.predict(X_test)

In [121]:
from cuml.datasets.classification import make_classification
# Import both implementations under distinct names. Previously the cuml import
# shadowed sklearn's `accuracy_score`, so the "sklearn accuracy" line below was
# computed by cuml as well.
from cuml.metrics import accuracy_score as cu_accuracy_score
from sklearn.metrics import accuracy_score as sk_accuracy_score

cu_score = cu_accuracy_score(y_test, predictions)
# sklearn works on host arrays, so copy labels and predictions off the device.
sk_score = sk_accuracy_score(y_test.get(), predictions.get())

print( " cuml accuracy: ", cu_score )
print( " sklearn accuracy : ", sk_score )

In [122]:
# Predicted class for each test sample from the random forest.
print(predictions)

In [123]:
from cuml.svm import SVC

# Support vector classifier with an RBF kernel.
classifier1 = SVC(kernel='rbf')
classifier1.fit(X_train, y_train)

print('Using SVM classifier:')
print(f'Accuracy of training set: {classifier1.score(X_train, y_train):.2f}')
print(f'Accuracy of test set: {classifier1.score(X_test, y_test):.2f}')


In [124]:
from cuml.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier using the Minkowski metric with p=2.
classifier2 = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier2.fit(X_train, y_train)

print('Using K nearest Classifier:')
print(f'Accuracy of training set: {classifier2.score(X_train, y_train):.2f}')
print(f'Accuracy of test set: {classifier2.score(X_test, y_test):.2f}')

In [125]:
from cuml.svm import SVC

# Support vector classifier again, this time with a linear kernel.
classifier3 = SVC(kernel='linear')
classifier3.fit(X_train, y_train)

print('Using SVM classifier:')
print(f'Accuracy of training set: {classifier3.score(X_train, y_train):.2f}')
print(f'Accuracy of test set: {classifier3.score(X_test, y_test):.2f}')

In [126]:
from cuml.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
classifier5 = GaussianNB()
classifier5.fit(X_train, y_train)

print('Using Naive Bayes classifier:')
print(f'Accuracy of training set: {classifier5.score(X_train, y_train):.2f}')
print(f'Accuracy of test set: {classifier5.score(X_test, y_test):.2f}')

In [127]:
from sklearn.tree import DecisionTreeClassifier

# scikit-learn needs host arrays. Copy each split off the device once instead
# of calling .get() again for every fit/score call.
# (assumes the splits are CuPy arrays, as the original .get() calls imply)
X_train_host, y_train_host = X_train.get(), y_train.get()
X_test_host, y_test_host = X_test.get(), y_test.get()

# Fixed seed so the tree, and therefore the reported accuracy, is reproducible.
classifier6 = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier6.fit(X_train_host, y_train_host)

print('Using Decision tree classifier:')
print('Accuracy of training set: {:.2f}'.format(classifier6.score(X_train_host, y_train_host)))
print('Accuracy of test set: {:.2f}'.format(classifier6.score(X_test_host, y_test_host)))

#### xgboost

In [128]:
import xgboost as xgb

In [129]:
# DMatrix over the full dataset: every column but the last as features,
# the last column as the label.
train = xgb.DMatrix(
    data=new_data.iloc[:, :-1].values,
    label=new_data.iloc[:, -1].values,
)

In [130]:
from sklearn.metrics import roc_auc_score

# Full feature matrix and labels, kept so these names stay defined.
X = new_data.iloc[:, :-1].values
y = new_data.iloc[:, -1].values

# Train on the training split and score on the held-out split, so the AUC
# measures generalisation rather than fit to data the model has already seen.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

params = {
    'objective': 'binary:logistic',  # binary target, so predict probabilities
    'max_depth': 4,
    'max_leaves': 2**4,
    'tree_method': 'gpu_hist',
}
# The number of boosting rounds is an argument of xgb.train, not a booster
# parameter; the former 'nround' entry in the params dict had no effect.
model = xgb.train(params, dtrain=dtrain, num_boost_round=100)

y_hat = model.predict(dtest)
auc = roc_auc_score(y_test.get(), y_hat)
print(auc)