In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Read voice.csv into a DataFrame
df = pd.read_csv('voice.csv')

# Preview the leading rows
print(df.head())

# Swap the textual 'label' column for integer codes (female=0, male=1)
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['label'])

# Target vector, then the feature matrix (every column except 'label')
y = df['label']
X = df.drop(columns='label')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build the logistic regression classifier and train it on the training split
# in one expression (fit hands back the fitted estimator).
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Class predictions for the held-out rows
y_pred = log_reg.predict(X_test)

# Fraction of held-out rows that were predicted correctly
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
# Pairwise correlations between the feature columns
corr_matrix = X.corr()

# Render the matrix as a heatmap on an explicitly created 12x10 figure
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()


In [None]:
# Flag features that are strongly correlated with an earlier feature
def get_correlated_features(corr_matrix, threshold=0.9):
    """Return the columns that correlate strongly with an earlier column.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square matrix of pairwise values, read positionally via ``.iloc``.
    threshold : float, default 0.9
        A column is flagged when the absolute value of one of its entries
        is strictly greater than this number.

    Returns
    -------
    set
        Labels of every column whose row (at the same position) holds an
        above-threshold entry against at least one column positioned
        before it. The first column can never be flagged.
    """
    flagged = set()
    for row_pos, feature in enumerate(corr_matrix.columns):
        # Only the strictly-lower triangle is inspected, so each pair is
        # examined once and the later column of the pair is the one flagged.
        exceeds = any(
            abs(corr_matrix.iloc[row_pos, col_pos]) > threshold
            for col_pos in range(row_pos)
        )
        if exceeds:
            flagged.add(feature)
    return flagged

# Choose the features to discard from correlations measured on the training
# rows only, so the held-out rows play no part in feature selection.
# X_train comes from the earlier 80/20 split, which used the same test_size
# and random_state as the split below.
train_corr = X_train.corr()
high_corr_features = get_correlated_features(train_corr, threshold=0.9)

# A set of strings has no stable iteration order across kernel restarts;
# sort once so the printed list and the drop call are reproducible.
features_to_drop = sorted(high_corr_features)
print(f"Highly Correlated Features to Remove: {features_to_drop}")

# Drop highly correlated features (remaining columns keep their order)
X_reduced = X.drop(columns=features_to_drop)

# Split again with the same settings as the baseline split
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X_reduced, y, test_size=0.2, random_state=42
)

# Fit logistic regression on the reduced dataset
log_reg_reduced = LogisticRegression(max_iter=1000)
log_reg_reduced.fit(X_train_r, y_train_r)

# Predict on the held-out rows and measure accuracy
y_pred_r = log_reg_reduced.predict(X_test_r)
reduced_accuracy = accuracy_score(y_test_r, y_pred_r)
print(f"Reduced Model Test Accuracy: {reduced_accuracy:.4f}")
