# Supervised learning 1
## Use-Case 1: 
* Fit a model using binary classification using logistic regression. 
* Identify correlated variables and form a less complex model. 

In [1]:
# import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.linear_model import LogisticRegression

# For Evaluation 
import sklearn.metrics 

In [2]:
# Reading the dataset using pandas
data=pd.read_csv('voice.csv')
data.info()
data.head()

NameError: name 'pd' is not defined

In [None]:
data.isnull().sum()

In [None]:
# Label Encosing
le = LabelEncoder()
data['label']=le.fit_transform(data['label'])
print(data.head())

In [None]:
data.info()

In [None]:
# #Divide the dataset into independent and dependent variables
x=data.drop('label',axis=1)
y=data['label']
print(x)
print(y)

In [None]:
# Train Test Split 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2) 
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)
print(x_train.head())
print(x_test.head())

In [None]:
# Scaling the Features: Logistic regression performs better when the features are on a similar scale. 
# Standardize the features using StandardScaler before training:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Train Logistic regression model
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)

In [None]:
# Checking prediction accuracy (Known data)
print(log_reg)
y_pred=log_reg.predict(x_train)
print(y_pred)
print("Train accuracy: ", sklearn.metrics.accuracy_score(y_train,y_pred))

In [None]:
# Checking prediction accuracy (UnKnown data)
y_pred=log_reg.predict(x_test)
print(y_pred)
print("Test accuracy: ", sklearn.metrics.accuracy_score(y_test,y_pred))

In [None]:
corr = data.corr()
plt.figure(figsize=(14,14))
sns.heatmap(corr, cbar = True,  square = True,
            cmap= 'coolwarm')
plt.show()

In [None]:
corr = x.corr()
plt.figure(figsize=(14,14))
sns.heatmap(corr, cbar = True,  square = True,
            cmap= 'coolwarm')
plt.show()

In [None]:
# Features for the model ( remove collinearity)
# Removing multicollinearity helps achieve more stable, interpretable, and reliable coefficients.
# When highly correlated features are present in a logistic regression model, it can cause instability in the model’s coefficients.
# Instability increases the variance of the model, making it sensitive to small changes in the data and leading to a less generalizable model
# Consistent Feature Selection: Automate this feature selection by dropping one variable from each highly correlated pair. 
high_corr = corr[corr.abs() > 0.8]  # Using 0.8 as threshold for high correlation
correlated_features = set()
for i in range(len(high_corr.columns)):
    for j in range(i):
        if abs(high_corr.iloc[i, j]) > 0.8:  # Identify pairs above threshold
            colname = high_corr.columns[i]
            correlated_features.add(colname)

print("Correlated features: ", correlated_features)
x_reduced = x.drop(labels=correlated_features, axis=1)
print("Remaining features in x_reduced:", x_reduced.columns.tolist())

# Removing highly correlated features simplifies the model, stabilizes the logistic regression coefficients, and enhances generalization. 
# This approach ensures that the model is interpreting each feature independently, providing a clearer and more robust relationship with the target variable.

In [None]:
# Train Test Split 
x_train,x_test,y_train,y_test=train_test_split(x_reduced,y,test_size=0.2) 
print(x_train.shape,x_test.shape)
print(y_train.shape,y_test.shape)

In [None]:
# Scaling the Features: Logistic regression performs better when the features are on a similar scale. 
# Standardize the features using StandardScaler before training:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Train Logistic regression model
log_reg = LogisticRegression()
log_reg.fit(x_train,y_train)
print(log_reg.coef_)

In [None]:
# Checking prediction accuracy (Known data)
print(log_reg)
y_pred=log_reg.predict(x_train)
print(y_pred)
print("Reduced Model Train accuracy: ", sklearn.metrics.accuracy_score(y_train,y_pred))

In [None]:
# Checking prediction accuracy (UnKnown data)
y_pred=log_reg.predict(x_test)
print(y_pred)
print("Reduced Model Test accuracy: ", sklearn.metrics.accuracy_score(y_test,y_pred))

In [None]:
# Adding Cross-Validation: To further validate the model, you might consider using cross-validation to ensure that the model’s performance is consistent. 
# Use cross_val_score from sklearn.model_selection:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, x_reduced, y, cv=5)
print("Cross-validated scores on reduced model:", scores)
print("Mean cross-validation score:", scores.mean())


### Cross-validation scores
The cross-validation scores obtained represent the accuracy of the logistic regression model on the reduced feature set across each fold in a 5-fold cross-validation.
* Cross-validation helps ensure that the model’s performance is consistent across different subsets of the data, reducing the chance of overfitting or underfitting.
* It provides a more robust measure of model accuracy than a single train-test split, especially for smaller datasets or when evaluating model stability.In your case, an average score of approximately 84.25% suggests that the model is fairly accurate with the reduced feature set, though the score may be slightly lower than when using the full feature set.
* Variations between the scores (e.g., 0.7003 in one fold versus 0.9385 in another) may indicate that model performance varies depending on the data split. Consistency across scores usually indicates more stable performance.
* The mean cross-validation score, 0.8425 (or about 84.25%), is the average accuracy across all five folds.
* This value gives a good estimate of how well the model is expected to perform on unseen data, providing a more reliable measure than a single train-test split.