In [None]:
# importing necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
# Read the voice-gender CSV and wrap the parsed table in the working frame `df`.
data = pd.read_csv(
    "../input/voicegender/voice.csv"
)
df = pd.DataFrame(data=data)

# (rows, columns) of the loaded table
df.shape

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Percentage of missing (NaN) entries in each column

for column_name in df.columns:
  missing_pct = df[column_name].isna().mean() * 100
  print(column_name, "\t-\t", missing_pct)


In [None]:
# Bar chart of how many rows carry each value of 'label'.
sns.countplot(data=df, x='label')

In [None]:
# The target column 'label' holds text categories (male/female), so map it to
# integer codes before any numeric processing.

from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()
label_text = df['label'].astype('str')
df['label'] = enc.fit_transform(label_text)

df.head(n=5)

In [None]:
# Plot the feature values of one sampled recording per encoded class.
# Titles are looked up by encoded class value: label[0] for class 0, label[1] for class 1.
# NOTE(review): assumes the encoder mapped female -> 0 and male -> 1 — confirm via enc.classes_.
label = ['Female', 'Male']

fig, ax = plt.subplots(nrows = 2, ncols = 1, figsize=(20,7))

for i in range(2):
  # Fixed random_state so a fresh "Restart & Run All" redraws the same two rows.
  sampled_row = df[df['label'] == i].sample(1, random_state=42)
  # First 20 columns of the sampled row — presumably exactly the feature columns; TODO confirm.
  ax[i].plot(sampled_row.iloc[0,:20])
  ax[i].set_title(label[i])


In [None]:
# Pairwise correlation of every column, drawn as an annotated heat map.
cormap = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(data=cormap, annot=True, ax=ax)

In [None]:
# Helper that selects the most strongly correlated attributes from one correlation column

def get_corelated_col(cor_dat, threshold):
  """Return the entries of ``cor_dat`` whose absolute value exceeds ``threshold``.

  Parameters
  ----------
  cor_dat : pandas.Series
      Correlation values, indexed by attribute name (e.g. one column of a
      correlation matrix).
  threshold : float
      Strict lower bound on the absolute correlation. NaN entries never pass.

  Returns
  -------
  pandas.DataFrame
      One row per selected entry, in the original order and indexed by the
      same labels, with the value in a single column named 'corr value'.
  """
  # A vectorised mask replaces the per-label loop: it also works when index
  # labels repeat, and it no longer shadows the notebook-level `df`.
  strong = cor_dat[cor_dat.abs() > threshold]
  return strong.to_frame(name='corr value')


In [None]:
# Keep the attributes whose correlation with 'label' is stronger than 0.30 in magnitude.
top_corelated_values = get_corelated_col(cor_dat=cormap['label'], threshold=0.30)
top_corelated_values

In [None]:
# Scatter matrix of the selected columns, coloured by 'label'.
sns.pairplot(data=df.loc[:, top_corelated_values.index], hue='label')


In [None]:
# Separating features and labels

# Exclude the target by name instead of assuming it is the last selected entry;
# a positional [:-1] would leak 'label' into X if the ordering ever changed.
selected_features = [col for col in top_corelated_values.index if col != 'label']
# Extra columns added by hand; skip any that were already selected so X never
# ends up with duplicate column names.
extra_features = [col for col in ['kurt', 'maxdom', 'dfrange'] if col not in selected_features]

X = df[selected_features + extra_features]
Y = df['label']

In [None]:
# Standardise every feature to zero mean and unit variance.
# (StandardScaler does not bound the values to [-1, 1].)
# NOTE(review): the scaler is fitted on all rows before the train/test split
# further down, so test-set statistics influence the scaling. Fitting on the
# training split only (e.g. inside a Pipeline) would remove that leakage.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# Carry over the row index so X stays aligned with Y by label, not just by position.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
X.head()

In [None]:
# Now let's split the data into train and test sets

from sklearn.model_selection import train_test_split
# NOTE(review): LinearRegression is not used in this cell; the import is kept
# in case another cell relies on it.
from sklearn.linear_model import LinearRegression

# Fixed random_state: without it every run draws a different split, so the
# tuned hyperparameters and all reported scores change between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [None]:
# List the hyperparameters (and their default values) of an untuned SVM classifier

from sklearn.svm import SVC
from sklearn import metrics

SVC().get_params(deep=True)   # hyperparameters of SVC

In [None]:
# Now let's build the grid for tuning the hyperparameters

from sklearn.model_selection import GridSearchCV

# Candidate values 0.1, 0.2, ..., 1.9 for C. Built from an integer range so
# each entry is the exact tenth; np.arange with a 0.1 float step yields noisy
# values such as 1.8000000000000003.
C = np.arange(1, 20) / 10
kernel = ['linear', 'rbf', 'poly']
# NOTE(review): gamma is a kernel coefficient for rbf/poly; for the linear
# kernel every gamma value presumably gives the same fit, so those candidates
# are redundant work in the search — confirm against the SVC docs.
gamma = [0.01,0.02,0.03,0.04,0.05]


# Full Cartesian search space handed to the grid search.
grid = {'C': C,
        'kernel': kernel,
        'gamma': gamma }

In [None]:
# Tune the SVM classifier with a grid search over `grid`, using 10-fold cross-validation

from sklearn.svm import SVC
from sklearn import metrics

svc_grid = GridSearchCV(SVC(), param_grid=grid, cv=10)
svc_grid.fit(X_train, y=y_train)

In [None]:
svc_grid.best_params_  # Best combination of hyperparameters found by the grid search

In [None]:
# Predict the test-set labels with the best estimator found by the grid search

best_model = svc_grid.best_estimator_
y_pred = best_model.predict(X_test)

# Side-by-side view of the true and predicted labels
pred_df = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
pred_df.head(n=5)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (rows: actual class, columns: predicted class).
mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(5,5))
# fmt='d' prints the integer counts in full; the default two-significant-digit
# format renders counts of 100 or more in scientific notation (e.g. 3.9e+02).
sns.heatmap(mat, annot = True, fmt = 'd', ax = ax)
ax.set_xlabel('Predicted label')
ax.set_ylabel('Actual label')

In [None]:
from sklearn import metrics

# Generate the ROC curve from continuous decision scores. Feeding the hard 0/1
# predictions to roc_curve leaves only one usable threshold, which collapses
# the curve to a single interior point and distorts the AUC.
y_score = svc_grid.decision_function(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, pos_label=1)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.show()

# Measure the area under the curve.  The closer to 1, the "better" the predictions.
print("AUC of the predictions: {0}".format(metrics.auc(fpr, tpr)))

# Measure the Accuracy Score (arguments in the documented order: y_true, y_pred)
print("Accuracy score of the predictions: {0}".format(metrics.accuracy_score(y_test, y_pred)))


In [None]:
# Decision regions of an SVM trained on two features only ('meanfun' and
# 'centroid'), drawn over the scaled data points.

# Plot window: feature range padded by one unit on each side.
x_min, x_max = X['meanfun'].min() - 1, X['meanfun'].max() + 1
y_min, y_max = X['centroid'].min() - 1, X['centroid'].max() + 1
# Dense mesh (step 0.02) over the window; every mesh node gets classified.
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Reuse the hyperparameters chosen by the grid search instead of values pasted
# from one particular run, so the figure always matches the current search.
svc_plot = SVC(**svc_grid.best_params_)
# Fit on a plain array: the mesh passed to predict() is a bare ndarray, and
# fitting on a DataFrame would make scikit-learn warn about missing feature names.
svc_plot.fit(X[['meanfun', 'centroid']].to_numpy(), Y)
Z = svc_plot.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

# Filled contours show the predicted class per region; points show the actual labels.
plt.contourf(xx, yy, Z, cmap = plt.cm.coolwarm, alpha=0.8)
plt.scatter(X['meanfun'], X['centroid'], c = Y, cmap = plt.cm.coolwarm)
plt.xlabel('Meanfun')
plt.ylabel('Centroid')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xticks(())
plt.yticks(())
plt.title("Support Vector Machine (Classifier)")