In [1]:
# Importing the required libraries for data analysis and visualization
# numpy: For numerical computations and array operations
# matplotlib.pyplot: For creating static visualizations and plots
# pandas: For data manipulation and analysis
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Load the dataset
# - Reads Social_data.csv (path relative to the working directory) into a DataFrame
# - X: the columns at positions 2 and 3, as a NumPy array
#   (Age and Estimated Salary per the notebook's notes -- verify against the CSV header)
# - y: the column at position 4, as a NumPy array
#   (Purchased, 0 = No / 1 = Yes per the notebook's notes)
dataset = pd.read_csv("Social_data.csv")
feature_frame = dataset.iloc[:, [2, 3]]
target_series = dataset.iloc[:, 4]
X, y = feature_frame.values, target_series.values

In [3]:
# Hold out a test set
# - train_test_split splits X and y together so rows stay aligned
# - test_size = 0.25: a quarter of the rows go to the test set, the rest to training
# - random_state = 0: fixes the shuffle so the split is the same on every run
# - Result: X_train / X_test (features) and y_train / y_test (targets)
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

In [4]:
# Standardize the features
# - StandardScaler rescales each feature to zero mean and unit variance
# - The scaler is fitted on the training features only ...
# - ... and that same fitted scaler then transforms both the training and the test features,
#   so the test set is scaled with the training set's statistics
# - X_train and X_test are overwritten with their scaled versions
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Train the classifier
# - Builds a LogisticRegression estimator (random_state = 0 pins any randomness the solver may use)
# - Fits it on the scaled training features X_train against the labels y_train
# - fit() returns the estimator itself, so construction and fitting are chained
# - The fitted estimator is the last expression of the cell, so the notebook displays it
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state = 0).fit(X_train, y_train)
log_reg

In [6]:
# Predict on the held-out test set
# - Input: X_test, the test features already transformed by the fitted scaler
# - Output: y_pred, one predicted class label per test row
# - These predictions are compared against y_test in the evaluation step
y_pred = log_reg.predict(X = X_test)

In [7]:
# Evaluate the predictions with a confusion matrix
# - confusion_matrix(y_true, y_pred) counts test samples per (actual class, predicted class) pair
# - Rows are the actual classes and columns the predicted classes, in sorted label order,
#   so for 0/1 labels the layout is [[TN, FP], [FN, TP]]
# - The matrix is the cell's last expression so the notebook displays it
#   (previously it was computed but never shown)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Visualising the Training set results
# - Builds a dense grid (step 0.01) covering the feature space, padded by 1 unit on each side
# - Classifies every grid point with log_reg; contourf fills the regions predicted as
#   class 0 (red) and class 1 (green), which makes the decision boundary visible
# - Overlays the actual training points, colored by their true class
# - X_train was standardized by the scaler, so both axes are in standardized units
#   (the axis labels say so explicitly)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
class_cmap = ListedColormap(('red', 'green'))  # built once, shared by regions and points
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Each grid point becomes one (feature 1, feature 2) row for predict(); the flat
# predictions are reshaped back to the grid so contourf can draw them.
grid_pred = log_reg.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.75, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # color= rather than c=: a single RGBA tuple passed as `c` is ambiguous (it can be
    # read as per-point values to colormap when a class has exactly 4 points) and
    # makes matplotlib emit a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age (standardized)')
plt.ylabel('Estimated Salary (standardized)')
plt.legend()
plt.show()

In [None]:
# Visualising the Test set results
# - Builds a dense grid (step 0.01) covering the feature space, padded by 1 unit on each side
# - Classifies every grid point with log_reg; contourf fills the regions predicted as
#   class 0 (red) and class 1 (green), which makes the decision boundary visible
# - Overlays the actual test points, colored by their true class, to show how the
#   boundary learned on the training data holds up on unseen data
# - X_test was transformed by the fitted scaler, so both axes are in standardized units
#   (the axis labels say so explicitly)
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
class_cmap = ListedColormap(('red', 'green'))  # built once, shared by regions and points
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Each grid point becomes one (feature 1, feature 2) row for predict(); the flat
# predictions are reshaped back to the grid so contourf can draw them.
grid_pred = log_reg.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.75, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # color= rather than c=: a single RGBA tuple passed as `c` is ambiguous (it can be
    # read as per-point values to colormap when a class has exactly 4 points) and
    # makes matplotlib emit a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age (standardized)')
plt.ylabel('Estimated Salary (standardized)')
plt.legend()
plt.show()

In [10]:
# Logistic Regression -- second pass over the same pipeline
# - Goal: binary classification of the purchase decision from two features
# - X: the columns at positions 2 and 3 of `dataset`, as a NumPy array
#   (Age and Estimated Salary per the notebook's notes -- verify against the CSV header)
# - y: the column at position 4, as a NumPy array (Purchased, 0 = No / 1 = Yes per the notes)
# - Following cells: train/test split, feature scaling, model fit, confusion matrix, plots
# - `dataset` must already be loaded by the earlier read_csv cell

feature_frame = dataset.iloc[:, [2, 3]]
target_series = dataset.iloc[:, 4]
X, y = feature_frame.values, target_series.values

In [11]:
# Split features and targets into training and test subsets
# - train_test_split splits X and y together so rows stay aligned
# - test_size = 0.25: 25% of the rows are held out for testing
# - random_state = 0: fixes the shuffle so the split is reproducible
# - Result: X_train / X_test (features) and y_train / y_test (targets)
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, test_size = 0.25, random_state = 0)
X_train, X_test, y_train, y_test = split_parts

In [12]:
# Standardize the features with StandardScaler
# - Removes each feature's mean and scales it to unit variance, which
#   keeps the larger-scaled feature from dominating and helps the optimizer converge
# - The scaler is fitted on the training features only
# - The same fitted scaler transforms both the training and the test features,
#   so the test set is scaled with the training set's statistics
# - X_train and X_test are overwritten with their scaled versions
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
# Fit a LogisticRegression classifier
# - Logistic regression models the class probability with the logistic (sigmoid) function
#   and yields a linear decision boundary in the feature space
# - random_state = 0 pins any randomness the solver may use
# - The model is fitted on the scaled training features X_train against y_train
# - fit() returns the estimator itself, so construction and fitting are chained;
#   the fitted estimator is the cell's last expression, so the notebook displays it
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state = 0).fit(X_train, y_train)
log_reg

In [14]:
# Predict class labels for the test set
# - Input: X_test, the test features already transformed by the fitted scaler
# - Output: y_pred, one predicted class label per test row
# - predict() returns the most likely class for each sample
y_pred = log_reg.predict(X = X_test)

In [15]:
# Evaluate the predictions with a confusion matrix
# - confusion_matrix(y_true, y_pred) counts test samples per (actual class, predicted class) pair
# - Rows are the actual classes and columns the predicted classes, in sorted label order,
#   so for 0/1 labels the layout is:
#       [[true negatives,  false positives],
#        [false negatives, true positives]]
# - The diagonal holds the correct predictions; off-diagonal cells are the two error types
# - The matrix is the cell's last expression so the notebook displays it
#   (previously it was computed but never shown)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [None]:
# Plot the decision regions against the training set
# - ListedColormap(('red', 'green')) is a discrete two-color map: class 0 -> red, class 1 -> green
# - A dense grid (step 0.01) covers the feature space, padded by 1 unit on each side
# - Every grid point is classified with log_reg; contourf fills the predicted regions,
#   which makes the decision boundary visible
# - The actual training points are overlaid, colored by their true class
# - X_train was standardized by the scaler, so both axes are in standardized units
#   (the axis labels say so explicitly)
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
class_cmap = ListedColormap(('red', 'green'))  # built once, shared by regions and points
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Each grid point becomes one (feature 1, feature 2) row for predict(); the flat
# predictions are reshaped back to the grid so contourf can draw them.
grid_pred = log_reg.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.75, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # color= rather than c=: a single RGBA tuple passed as `c` is ambiguous (it can be
    # read as per-point values to colormap when a class has exactly 4 points) and
    # makes matplotlib emit a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = j)
plt.title('Logistic Regression (Training set)')
plt.xlabel('Age (standardized)')
plt.ylabel('Estimated Salary (standardized)')
plt.legend()
plt.show()

In [None]:
# Plot the decision regions against the test set
# - ListedColormap(('red', 'green')) is a discrete two-color map: class 0 -> red, class 1 -> green
# - A dense grid (step 0.01) covers the feature space, padded by 1 unit on each side
# - Every grid point is classified with log_reg; contourf fills the predicted regions,
#   which makes the decision boundary visible
# - The actual test points are overlaid, colored by their true class, to show how the
#   boundary learned on the training data holds up on unseen data
# - X_test was transformed by the fitted scaler, so both axes are in standardized units
#   (the axis labels say so explicitly)
from matplotlib.colors import ListedColormap
X_set, y_set = X_test, y_test
class_cmap = ListedColormap(('red', 'green'))  # built once, shared by regions and points
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01),
                     np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01))
# Each grid point becomes one (feature 1, feature 2) row for predict(); the flat
# predictions are reshaped back to the grid so contourf can draw them.
grid_pred = log_reg.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1, X2, grid_pred, alpha = 0.75, cmap = class_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())
for i, j in enumerate(np.unique(y_set)):
    # color= rather than c=: a single RGBA tuple passed as `c` is ambiguous (it can be
    # read as per-point values to colormap when a class has exactly 4 points) and
    # makes matplotlib emit a warning.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color = class_cmap(i), label = j)
plt.title('Logistic Regression (Test set)')
plt.xlabel('Age (standardized)')
plt.ylabel('Estimated Salary (standardized)')
plt.legend()
plt.show()
