
# Module 9: Supervised Learning II
## Case Study – 1

### Objective: 
 * Classifying Voice Samples as Male or Female. 


In [3]:
import pandas as pd

# Step 1: Load and Explore the Data
# We start by examining the dataset for missing values, data types, and general structure.
# From the given description, no missing values are present; the null counts below verify that.

# Load the dataset
data = pd.read_csv('voice-classification.csv')

# Check basic info.
# DataFrame.info() writes its report directly and returns None, so it is called
# bare here -- wrapping it in print() would emit a stray "None" line after the report.
data.info()
print(data.isnull().sum())

# Preview the data
print(data.head())

# Summary statistics of the numeric columns
print(data.describe())

# Count of each class in the target
print(data['label'].value_counts())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   obje

In [16]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score



# Encode the target variable as integers.
# LabelEncoder numbers the classes in sorted order, which is what yields the
# mapping noted below (assuming the raw labels are the strings 'female'/'male').
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])  # male=1, female=0

# Split features and target
X = data.drop(columns=['label'])
y = data['label']

# Split data into train and test sets BEFORE scaling, so that the held-out rows
# never contribute to the scaler's mean/std (avoids train/test leakage).
# The split is stratified on y with a fixed seed, so it selects the same rows as
# splitting an already-scaled copy of X would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features: fit on the training rows only, then apply the same
# transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled copy of the full feature matrix (using the train-fitted scaler),
# kept under its original name because later cells reference X_scaled.
X_scaled = scaler.transform(X)

# Initialize Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
# By default, it uses the CART (Classification and Regression Tree) algorithm, which employs
# the Gini Index as the criterion for splitting unless explicitly specified otherwise.
# Default Parameters of RandomForestClassifier
# Criterion: 'gini' (splits are based on minimizing the Gini Impurity).
# Max Depth: None (trees are grown until all leaves are pure or contain less than min_samples_split samples).
# Number of Estimators (n_estimators): 100 (the number of trees in the forest).
# Min Samples Split: 2 (minimum samples required to split a node).
# Min Samples Leaf: 1 (minimum samples required in a leaf node).
# Bootstrap: True (samples are drawn with replacement for building each tree).
# Max Features: sqrt (number of features considered for the best split).

# Train the model
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred = rf_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy: ", accuracy)
print("\nClassification report:\n", classification_rep)
print("Confusion Matrix:\n", conf_matrix)


Accuracy:  0.9842271293375394

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       317
           1       0.98      0.99      0.98       317

    accuracy                           0.98       634
   macro avg       0.98      0.98      0.98       634
weighted avg       0.98      0.98      0.98       634

Confusion Matrix:
 [[311   6]
 [  4 313]]


In [12]:
from sklearn.model_selection import GridSearchCV

# Search space for the forest's hyperparameters
# (3 * 4 * 3 * 3 * 3 = 324 candidate combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2', None],
)

# Seeded base estimator whose hyperparameters are tuned by the search
rf_base = RandomForestClassifier(random_state=42)

# Exhaustive search scored by accuracy under 5-fold cross-validation,
# with progress messages enabled and all available processors in use.
grid_search = GridSearchCV(
    estimator=rf_base,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination and its mean cross-validated accuracy
best_params, best_score = grid_search.best_params_, grid_search.best_score_

best_params, best_score


Fitting 5 folds for each of 324 candidates, totalling 1620 fits


  _data = np.array(data, dtype=dtype, copy=copy,


({'max_depth': 10,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 200},
 np.float64(0.9779038130208699))

In [15]:
# Hyperparameters for the tuned forest (the combination reported by the grid search output)
optimized_rf_params = {
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'min_samples_split': 5,
    'n_estimators': 200,
    'random_state': 42,
}

# Build the tuned forest and fit it on the training split (fit returns the estimator itself)
optimized_rf_model = RandomForestClassifier(**optimized_rf_params).fit(X_train, y_train)

# Score the tuned forest on the held-out test split
y_pred = optimized_rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

# Emit the three evaluation summaries, one heading/value pair per print call
for heading, value in (
    ("Accuracy: ", accuracy),
    ("\nClassification report:\n", classification_rep),
    ("Confusion Matrix:\n", conf_matrix),
):
    print(heading, value)

Accuracy:  0.9842271293375394

Classification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       317
           1       0.98      0.99      0.98       317

    accuracy                           0.98       634
   macro avg       0.98      0.98      0.98       634
weighted avg       0.98      0.98      0.98       634

Confusion Matrix:
 [[311   6]
 [  4 313]]


In [17]:
# Per-feature importance scores exposed by the fitted optimized forest
feature_importances = optimized_rf_model.feature_importances_

# Pair each feature name with its score, then rank from most to least important
importance_table = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})
importance_df = importance_table.sort_values(by='Importance', ascending=False)

importance_df

Unnamed: 0,Feature,Importance
12,meanfun,0.356969
5,IQR,0.223698
3,Q25,0.140238
1,sd,0.071615
8,sp.ent,0.038153
9,sfm,0.029485
0,meanfreq,0.018355
11,centroid,0.017907
10,mode,0.016404
2,median,0.01366


In [18]:
# cross_val_score is not imported by any earlier cell, so it is imported here;
# without this the cell raises NameError after Restart Kernel -> Run All.
from sklearn.model_selection import cross_val_score

# Cross-validation for default Random Forest model
# (5 folds, accuracy scoring, over the full scaled feature matrix and target)
default_rf_cv_scores = cross_val_score(rf_model, X_scaled, y, cv=5, scoring='accuracy')

# Cross-validation for optimized Random Forest model, same folds/scoring setup
optimized_rf_cv_scores = cross_val_score(optimized_rf_model, X_scaled, y, cv=5, scoring='accuracy')

# Compare mean accuracy from cross-validation
default_mean_accuracy = default_rf_cv_scores.mean()
optimized_mean_accuracy = optimized_rf_cv_scores.mean()

default_mean_accuracy, optimized_mean_accuracy

(np.float64(0.9652797504248459), np.float64(0.9662271193704808))