<a href="https://colab.research.google.com/github/allakoala/data_science/blob/main/colab_notebooks/Clustering_Homework_(part_2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#HW: https://drive.google.com/file/d/1o6wN90vZ6GXwbtM0I3TwdXO8NPCN9vWK/view

Dataset: https://drive.google.com/file/d/1iNwpwoIdyXJzzq16WhAG_nXp9Gdd7ZSJ/view?usp=sharing

Dataset description: https://www.dbs.ifi.lmu.de/research/outlier-evaluation/DAMI/semantic/Cardiotocography/Cardiotocography_22.html

The given data has 16880 rows and 23 columns. Each row represents a record of a cardiotocogram (CTG) examination, which is a fetal monitoring test performed during pregnancy. The columns provide various measurements and features of the CTG test, such as the baseline fetal heart rate, accelerations, decelerations, and other parameters.

Some of the important columns and their descriptions are as follows:

- LB: FHR baseline (beats per minute)
- AC: # of accelerations per second
- FM: # of fetal movements per second
- UC: # of uterine contractions per second
- ASTV: percentage of time with abnormal short-term variability
- MSTV: mean value of short-term variability
- ALTV: percentage of time with abnormal long-term variability
- MLTV: mean value of long-term variability
- DL: # of light decelerations per second
- DS: # of severe decelerations per second
- Min: minimum of FHR histogram
- Max: maximum of FHR histogram
- Nmax: # of histogram peaks
- Nzeros: # of histogram zeros
- Mode: histogram mode
- Mean: histogram mean
- Median: histogram median
- Variance: histogram variance
- Tendency: histogram tendency
- outlier: whether a record is an outlier or not

Q-s: LOF;
Reachability distance;
Local reachability density
https://towardsdatascience.com/local-outlier-factor-lof-algorithm-for-outlier-identification-8efb887d9843

In [None]:
#basics
import pandas as pd
import numpy as np
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

# Mount Google Drive inside the Colab runtime at /content/drive; the CSV path
# read by this notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the Cardiotocography CSV on the mounted Google Drive
url = "/content/drive/MyDrive/Colab Notebooks/Cardiotocography.csv"

# Load the file into a DataFrame (comma is already the default separator)
data = pd.read_csv(url)

# Leave the frame as the last expression so the notebook renders it
data

#EDA/ data preparation

In [None]:
# Flag rows that repeat an earlier row across all columns; with no `subset`
# every column is compared, and the first occurrence is not flagged.
duplicate_rows = data.duplicated(keep="first")

# Number of duplicated rows (rendered by the notebook)
duplicate_rows.sum()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the integer code of every distinct value in the 'outlier' column ...
le = LabelEncoder()
le.fit(data['outlier'])

# ... and store the encoded labels next to the original column
data['outlier_encoded'] = le.transform(data['outlier'])

# Show the raw column and its encoded counterpart
print("Original feature: ", data['outlier'])
print("Label-encoded feature: ", data['outlier_encoded'])

In [None]:
# Make sure the encoded label is stored with an integer dtype
data['outlier_encoded'] = data['outlier_encoded'].astype(int)

# The raw label and the record id are no longer needed; remove them in place
data.drop(columns=['outlier', 'id'], inplace=True)

print("Label-encoded feature: ", data['outlier_encoded'])

In [None]:
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would add a stray "None" line.
data.info()

# Summary statistics of the columns
print(data.describe())

In [None]:
# For every column print "<name>: <number of distinct values>: <the values>"
for col, series in data.items():
    print(f"{col}: {series.nunique()}: {series.unique()}")

In [None]:
# Number of rows per encoded outlier label (rendered by the notebook)
data['outlier_encoded'].value_counts()

#PLOTS

In [None]:
# Missing data per column: absolute count and share of rows.
# Missing values matter because dropping them reduces the sample size.

# Compute the null mask once and reuse it for both statistics
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
# The mean of a boolean mask is the fraction of missing rows (sum / row count)
percent = null_mask.mean().sort_values(ascending=False)

# DataFrame.isna() is an alias of DataFrame.isnull(), so the "na" columns are
# identical by definition; they are kept so the table keeps its four columns.
total_2 = total.copy()
percent_2 = percent.copy()

missing_data = pd.concat(
    [total, percent, total_2, percent_2],
    axis=1,
    keys=['Total_null', 'Percent_null', 'Total_na', 'Percent_na'],
)
missing_data

In [None]:
# Histogram with a KDE overlay for every column
for col in data.columns:
    sns.histplot(data=data, x=col, kde=True)
    plt.show()

# Lists of columns with specified characteristics, derived from describe().
# The summary is deliberately NOT called `stats`: that name is the alias of
# the `scipy.stats` import at the top of the notebook and must stay usable.
desc_stats = data.describe()
mean_row = desc_stats.loc['mean']
median_row = desc_stats.loc['50%']
std_row = desc_stats.loc['std']

# Distance between the quartiles and the extremes of each column
upper_gap = (desc_stats.loc['75%'] - desc_stats.loc['max']).abs()
lower_gap = (desc_stats.loc['25%'] - desc_stats.loc['min']).abs()

print(f"\nColumns with mean < median:\n{desc_stats.columns[mean_row < median_row]}")
print(f"\nColumns with mean > median:\n{desc_stats.columns[mean_row > median_row]}")
print(f"\nColumns with big difference between (75th %tile and max) or/and (25th %tile and min values):\n{desc_stats.columns[(upper_gap > 100) | (lower_gap > 100)]}")
print(f"\nColumns with high standard deviation:\n{desc_stats.columns[std_row > 100]}")
print(f"\nColumns with low standard deviation:\n{desc_stats.columns[std_row < 0.1]}")

In [None]:
# Pairwise scatter plots of all columns.
# seaborn renamed the `size` keyword of pairplot to `height`; the old name is
# no longer accepted by current releases, so `height` is used here.
sns.set()
sns.pairplot(data, height=2.5)
plt.show()

In [None]:
# Boxplots and outlier detection with the interquartile range (IQR) rule:
# values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR are treated as outliers.

# Limits are stored per column so that each boxplot below is annotated with
# its OWN limits (not with whatever the last loop iteration left behind).
iqr_limits = {}
for col in data.columns:
    Q1 = data[col].quantile(0.25)
    Q3 = data[col].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    iqr_limits[col] = (lower, upper)
    outliers = data[(data[col] < lower) | (data[col] > upper)]
    print(f"{col} has {len(outliers)} outliers")

# The red line marks the lower limit and the green line the upper limit of the
# normal range (IQR rule); points outside these lines are considered outliers.
for col in data.columns:
    lower, upper = iqr_limits[col]
    fig, ax = plt.subplots()
    ax.boxplot(data[col])
    ax.set_title(f"{col} Distribution")
    ax.set_ylabel(col)
    ax.axhline(y=lower, color='r', linestyle='-', label='Lower Limit')
    ax.axhline(y=upper, color='g', linestyle='-', label='Upper Limit')
    ax.text(0.75, lower, f"{lower:.2f}", va='center', ha='center', bbox=dict(facecolor='red', alpha=0.5), fontsize=12)
    ax.text(0.75, upper, f"{upper:.2f}", va='center', ha='center', bbox=dict(facecolor='green', alpha=0.5), fontsize=12)
    ax.legend()
    plt.show()

In [None]:
from itertools import combinations

import seaborn as sns
import matplotlib.pyplot as plt

# Feature pairs grouped by the direction of their linear relationship.
# Each entry is (pair label, Pearson correlation). The correlation has the
# same sign as the slope of the fitted regression line, and unlike the
# `plt` module it is an informative value to keep and print.
positive_slope_plots = []
negative_slope_plots = []

for x_col, y_col in combinations(data.columns, 2):
    # sign of the correlation == sign of the regression slope
    corr = data[y_col].corr(data[x_col])
    pair_label = f"{x_col} vs {y_col}"

    # one dedicated figure per pair: scatter plot plus regression line
    fig, ax = plt.subplots()
    sns.scatterplot(x=x_col, y=y_col, data=data, ax=ax)
    sns.regplot(x=x_col, y=y_col, data=data, scatter=False, color="black", ax=ax)

    ax.set_title(pair_label)
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)

    # a NaN correlation (e.g. constant column) falls into neither list
    if corr > 0:
        positive_slope_plots.append((pair_label, corr))
    elif corr < 0:
        negative_slope_plots.append((pair_label, corr))

    plt.show()

    # close the figure explicitly to free its memory
    plt.close(fig)

# print out the lists of feature pairs with their correlation
print("Feature pairs with positive slopes:")
for feature_pair, corr in positive_slope_plots:
    print(f"{feature_pair}: {corr:.3f}")

print("\nFeature pairs with negative slopes:")
for feature_pair, corr in negative_slope_plots:
    print(f"{feature_pair}: {corr:.3f}")

In [None]:
from itertools import combinations

# Correlation heatmap of all columns
sns.set(style='darkgrid')
corrmat = data.corr()
f, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(corrmat, annot=True, fmt=".2f", cmap='coolwarm', vmax=.8, square=True)
plt.show()

# Collect strongly correlated pairs from the upper triangle of the matrix.
# The SIGNED coefficient is compared: taking abs() would put negative pairs
# into the positive list and make the "<= -0.5" test impossible to satisfy.
pos_corr_pairs = []
neg_corr_pairs = []
for (i, col_i), (j, col_j) in combinations(enumerate(corrmat.columns), 2):
    coefficient = corrmat.iloc[i, j]
    if coefficient >= 0.5:
        pos_corr_pairs.append((col_i, col_j))
    elif coefficient <= -0.5:
        neg_corr_pairs.append((col_i, col_j))

# print highly positively correlated pairs
print("Highly positively correlated pairs:")
for pair in pos_corr_pairs:
    print(pair)

# print highly negatively correlated pairs
print("Highly negatively correlated pairs:")
for pair in neg_corr_pairs:
    print(pair)

#Feature engineering
For the following features:
1. LB
2. ASTV
3. ALTV
4. DS

In [None]:
from sklearn.preprocessing import PowerTransformer, MinMaxScaler

# Features that get rescaled and expanded
base_features = ["LB", "ASTV", "ALTV", "DS"]

# Bring the selected variables onto a common range with min-max scaling
scaler = MinMaxScaler()
data[base_features] = scaler.fit_transform(data[base_features])

# Non-linear expansions: for each feature add its square and its square root
for feature in base_features:
    data[f"{feature}2"] = data[feature] ** 2
    data[f"sqrt_{feature}"] = np.sqrt(data[feature])

# Interaction term: product of the four scaled features, multiplied left to right
interaction = data[base_features[0]]
for feature in base_features[1:]:
    interaction = interaction * data[feature]
data["LB_ASTV_ALTV_DS"] = interaction

#Mahalanobis rule
The Mahalanobis distance measures how far a point lies from a distribution. Unlike the Euclidean distance, which treats each variable independently, it takes the covariance structure of the dataset into account. For a point $x$, a distribution with mean $\mu$ and covariance matrix $\Sigma$, it is calculated as $D_M(x) = \sqrt{(x - \mu)^T \Sigma^{-1} (x - \mu)}$, i.e. the distance between the point and the mean of the distribution, normalized by the covariance matrix.

In [None]:
# Target: the encoded outlier label; features: every remaining column
y = data['outlier_encoded']
X = data.drop(columns='outlier_encoded')

# Show which columns are used as features
X.columns

In [None]:
# Mahalanobis rule: every row is measured against the mean and covariance of
# its own class (0 or 1), and the largest 1% of distances are flagged.
classes = [0, 1]

# Mean vector and covariance matrix of the features for each class
mean_vec = []
cov_matrix = []
for i in classes:
    X_i = X[y == i]
    mean_vec.append(X_i.mean(axis=0).to_numpy())
    cov_matrix.append(np.cov(X_i, rowvar=False))

# Inverse covariance matrix per class. A failed inversion stores None so the
# list stays aligned with the class index; silently skipping the entry would
# shift the other class's inverse into the wrong slot.
inv_cov_matrix = []
for i in classes:
    try:
        inv_cov_matrix.append(np.linalg.inv(cov_matrix[i]))
    except np.linalg.LinAlgError as e:
        print(f"Singular matrix error: {e}")
        inv_cov_matrix.append(None)

# Mahalanobis distance of every row, computed per class in one vectorised
# step. Rows whose class has no usable inverse keep an infinite distance.
X_values = X.to_numpy(dtype=float)
y_values = y.to_numpy()
mahalanobis_dist = np.full(len(X_values), np.inf)
for i in classes:
    if inv_cov_matrix[i] is None:
        continue
    class_mask = y_values == i
    centered = X_values[class_mask] - mean_vec[i]
    # row-wise quadratic form (x - mu)^T * inv(Sigma) * (x - mu)
    squared = np.sum((centered @ inv_cov_matrix[i]) * centered, axis=1)
    mahalanobis_dist[class_mask] = np.sqrt(squared)

# DataFrame with the Mahalanobis distances and the true label
df = pd.DataFrame({'Mahalanobis Distance': mahalanobis_dist, 'outlier_encoded': y})

# Threshold: the 99th percentile of all distances
threshold = np.quantile(df['Mahalanobis Distance'], 0.99)

# 'mr_outlier' is 1 for every distance above the threshold, otherwise 0
df['mr_outlier'] = np.where(df['Mahalanobis Distance'] > threshold, 1, 0)

# Print the 30 rows with the highest Mahalanobis distances
df_sorted = df.sort_values(by='Mahalanobis Distance', ascending=False)
print(df_sorted.head(30))

# Accuracy: share of rows where mr_outlier agrees with outlier_encoded
accuracy = (df['mr_outlier'] == df['outlier_encoded']).sum() / len(df)
print("Accuracy:", accuracy)

In [None]:
# Rows on which the Mahalanobis rule disagrees with the true label
mismatch_mask = df['mr_outlier'].ne(df['outlier_encoded'])
diff_entries = df.loc[mismatch_mask]

print("Entries where 'mr_outlier' is not equal to 'outlier_encoded':")
print(diff_entries)
print("Count of such entries:", len(diff_entries))

#Supervised models:
a. SVM

b. Logreg

c. KNN


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


def _report_predictions(model_name, y_true, y_pred, head=15):
    """Print the evaluation of one classifier and summarise its positive predictions.

    Prints accuracy, confusion matrix and classification report, then the
    number of test rows predicted as 1 and the first `head` positions of
    those rows. Positions are 0-based offsets into the test set, not
    DataFrame index labels.

    Returns a tuple (number of rows predicted as 1, list of the first `head`
    positions).
    """
    print(f"Results for {model_name}:\nAccuracy:", accuracy_score(y_true, y_pred), "\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred), "\nClassification Report:\n", classification_report(y_true, y_pred), "\n")

    positive_positions = np.flatnonzero(y_pred == 1)
    n_positive = len(positive_positions)
    print(f"Number of entities where y_pred = 1 for {model_name}:", n_positive)

    first_positions = positive_positions[:head].tolist()
    print(f"First {head} indexes of entities where y_pred = 1 for {model_name}:", first_positions)
    return n_positive, first_positions


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('outlier_encoded', axis=1), data['outlier_encoded'], test_size=0.2, random_state=42)

# Class balance of the training labels
print("outlier_encoded values:\n", y_train.value_counts(), "\n")

# Support Vector Machine (SVM) model; features are standardised in the pipeline
svm_pipeline = Pipeline([('scaler', StandardScaler()), ('svm', SVC())])
svm_pipeline.fit(X_train, y_train)
svm_pred = svm_pipeline.predict(X_test)
svm_entities_pred_1, svm_entities_pred_1_idx = _report_predictions("SVM", y_test, svm_pred)

# Logistic Regression (Logreg) model
logreg_pipeline = Pipeline([('scaler', StandardScaler()), ('logreg', LogisticRegression())])
logreg_pipeline.fit(X_train, y_train)
logreg_pred = logreg_pipeline.predict(X_test)
logreg_entities_pred_1, logreg_entities_pred_1_idx = _report_predictions("Logreg", y_test, logreg_pred)

# K-Nearest Neighbors (KNN) model
knn_pipeline = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier())])
knn_pipeline.fit(X_train, y_train)
knn_pred = knn_pipeline.predict(X_test)
knn_entities_pred_1, knn_entities_pred_1_idx = _report_predictions("KNN", y_test, knn_pred)
print('')

# Test-set positions that all three models predict as 1
svm_entities = set(np.flatnonzero(svm_pred == 1).tolist())
logreg_entities = set(np.flatnonzero(logreg_pred == 1).tolist())
knn_entities = set(np.flatnonzero(knn_pred == 1).tolist())
common_entities = svm_entities.intersection(logreg_entities, knn_entities)
print("Number of common entities where y_pred = 1 for all models:", len(common_entities))
# A set has no defined order, so sort before taking the "first 15"
print("First 15 indexes of common entities where y_pred = 1 for all models:", sorted(common_entities)[:15])

In [None]:
from sklearn.manifold import TSNE
from umap import UMAP
import matplotlib.pyplot as plt

# Predictions of the supervised models together with their display names
predictions = [(svm_pred, "SVM"), (logreg_pred, "Logreg"), (knn_pred, "KNN")]

# The 2-D embeddings depend only on X_test and the fixed random_state, not on
# the model being plotted, so they are computed once outside the loop instead
# of being recomputed identically for every model.
tsne = TSNE(n_components=2, random_state=42)
umap = UMAP(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(X_test)
umap_result = umap.fit_transform(X_test)

# One t-SNE and one UMAP scatter plot per model, coloured by its predictions
for pred, name in predictions:
    # Plot t-SNE visualization
    plt.figure(figsize=(10,10))
    plt.title(name + " t-SNE")
    plt.scatter(tsne_result[:, 0], tsne_result[:, 1], c=pred)
    plt.show()

    # Plot UMAP visualization
    plt.figure(figsize=(10,10))
    plt.title(name + " UMAP")
    plt.scatter(umap_result[:, 0], umap_result[:, 1], c=pred)
    plt.show()

#Unsupervised models:
a. One-class SVM

b. Isolation Forest

c. DBScan

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.cluster import DBSCAN

import numpy as np

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('outlier_encoded', axis=1), data['outlier_encoded'], test_size=0.2, random_state=42)

# Models. These estimators are unsupervised and ignore labels, so only the
# features are passed. IsolationForest is seeded like the other randomised
# steps of the notebook so that results are reproducible.
svm = OneClassSVM().fit(X_train)
iforest = IsolationForest(random_state=42).fit(X_train)
# DBSCAN has no predict(): it clusters the data it is given, so it is only
# fitted on the test set below (a prior fit on the training set is discarded).
dbscan = DBSCAN()

# Predictions, converted to 0/1 labels with 1 = outlier.
# OneClassSVM and IsolationForest return -1 for outliers and +1 for inliers,
# and DBSCAN labels noise points -1, so -1 is what must map to 1.
# NOTE(review): assumes outlier_encoded uses 1 for outliers, as the
# Mahalanobis 'mr_outlier' comparison in this notebook does - confirm against
# the LabelEncoder classes.
svm_preds = np.where(svm.predict(X_test) == -1, 1, 0)
iforest_preds = np.where(iforest.predict(X_test) == -1, 1, 0)
dbscan_preds = np.where(dbscan.fit_predict(X_test) == -1, 1, 0)

model_predictions = list(zip([svm_preds, iforest_preds, dbscan_preds], ['SVM', 'Isolation Forest', 'DBSCAN']))

# Number of entities where y_pred = 1 detected by models
print('Number of entities where y_pred = 1 detected by models:')
for preds, model_name in model_predictions:
    print(f"{model_name}: {int(np.sum(preds == 1))} instances")

# First 15 indexes (positions within the test set) of entities where y_pred = 1
print('\nFirst 15 indexes of entities where y_pred = 1:')
for preds, model_name in model_predictions:
    print(f"{model_name}: {np.where(preds == 1)[0][:15]}")

# Accuracy and Confusion Matrix per each model
print('\nAccuracy and Confusion Matrix per each model:')
for preds, model_name in model_predictions:
    print(f"{model_name}")
    print(f"Accuracy: {accuracy_score(y_test, preds):.2f}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, preds)}\n")

# Precision, Recall, F1score for each model
print('Precision, Recall, F1score for each model')
for preds, model_name in model_predictions:
    precision = precision_score(y_test, preds)
    recall = recall_score(y_test, preds)
    f1score = f1_score(y_test, preds)
    print(f"{model_name}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1score:.2f}\n")


In [None]:
# Test-set positions where the prediction of every one of the 3 models equals 1
flagged_positions = [
    set(np.flatnonzero(preds == 1).tolist())
    for preds in (svm_preds, iforest_preds, dbscan_preds)
]
common_entities = set.intersection(*flagged_positions)
print("Number of common entities where y_pred = 1 for all 3 models: {}".format(len(common_entities)))
print('')

# First 15 indexes of common entities where y_pred = 1 for all 3 models.
# A set has no defined order, so the positions are sorted first to make the
# "first 15" deterministic.
print("First 15 indexes of common entities where y_pred = 1 for all 3 models:")
common_entities = sorted(common_entities)
for position in common_entities[:15]:
    print(position)
print('')

In [None]:
from sklearn.manifold import TSNE
import umap #!pip install umap-learn

# Both reducers are seeded (as in the supervised visualisation) so the plots
# are reproducible between runs.

# Transform data using t-SNE
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_test)

# Transform data using UMAP. The reducer gets its own name so the imported
# `umap` module is not overwritten by an instance.
umap_model = umap.UMAP(n_components=2, random_state=42)
X_umap = umap_model.fit_transform(X_test)

# Visualize predictions: first every model on the t-SNE embedding, then every
# model on the UMAP embedding. Points whose prediction equals 1 are red, all
# others green.
for embedding, method_name in [(X_tsne, 't-SNE'), (X_umap, 'UMAP')]:
    for preds, model_name in zip([svm_preds, iforest_preds, dbscan_preds], ['SVM', 'Isolation Forest', 'DBSCAN']):
        point_colors = np.where(preds == 1, 'red', 'green').tolist()
        plt.figure(figsize=(8, 8))
        plt.scatter(embedding[:, 0], embedding[:, 1], c=point_colors)
        plt.title(f"{model_name} - {method_name}")
        plt.show()