How to load datasets from sklearn

In [None]:
from sklearn import datasets
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the bundled toy datasets. Each loader returns an object whose
# .data / .target / .feature_names attributes are used further down.
cancer = datasets.load_breast_cancer()
iris = datasets.load_iris()
diabetes = datasets.load_diabetes()
# load_wine has to be *called*: without the parentheses `wine` would be bound
# to the loader function itself instead of the dataset.
wine = datasets.load_wine()

In [None]:
# Features as a DataFrame labelled with the iris feature names;
# targets as a single-column DataFrame.
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.DataFrame(data=iris.target)

How to apply standardization (also known as z-score normalization)

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardize the features (z-score normalization).
# NOTE(review): the scaler is fitted on the whole of X, i.e. before the
# train/test split that follows, so test rows contribute to the scaling
# statistics -- confirm this is acceptable for the demo.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

How to split the data into training and test sets

In [None]:
# Seeded split; test_size=0.25 spells out the library default so the
# train/test proportion is visible in the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=20
)

How to build a classification model

In [None]:
# Fit a decision tree on the training split (fit returns the estimator
# itself), then predict labels for the held-out rows.
model = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score
from sklearn.metrics import classification_report, precision_score, roc_auc_score

How to find all metrics to evaluate a classification model

In [None]:
# Text summary of the per-class classification metrics.
cl = classification_report(y_true=y_test, y_pred=y_pred)
print(cl)

In [None]:
# Confusion matrix of true labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [None]:
# Recall, combined across classes with the "weighted" averaging mode.
recall = recall_score(y_true=y_test, y_pred=y_pred, average="weighted")
print(recall)

In [None]:
# Precision, combined across classes with the 'weighted' averaging mode.
precision = precision_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(precision)

In [None]:
# Accuracy of the predictions on the test split.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

In [None]:
# F1 score, combined across classes with the 'weighted' averaging mode.
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='weighted')
print(f1)


For regression

In [None]:
# Load the diabetes dataset used for the regression walkthrough below.
diabetes = datasets.load_diabetes()

In [None]:
# One DataFrame holding the named feature columns plus a trailing
# "target" column with the regression target.
dfd = pd.DataFrame(diabetes.data, columns=diabetes.feature_names).assign(
    target=diabetes.target
)

In [None]:
# Display the column labels of dfd (feature names plus "target").
dfd.columns

In [None]:
# Everything except "target" is a feature; "target" is what we predict.
X = dfd.drop(columns="target")
y = dfd.loc[:, "target"]

In [None]:
from sklearn.model_selection import train_test_split

# Seed the split: without random_state the rows are shuffled differently on
# every run, so the R2 / MAE / MSE values below would change each time the
# notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Fit ordinary least squares on the training split (fit returns the
# estimator itself) and predict the held-out targets.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# Coefficient of determination on the test split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(r2)

In [None]:
# Mean absolute error on the test split.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
print(mae)

In [None]:
# Mean squared error on the test split.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

In [None]:
# Root mean squared error: the square root of the MSE computed above.
rmse = np.sqrt(mse)
print(rmse)

How to apply grid search to tune the hyperparameters

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyperparameters to search. Only settings that can change the fitted model
# belong here: `n_jobs` merely controls parallelism, so searching over it
# doubled the number of fits without ever changing a score. `positive`
# (restrict coefficients to be non-negative) is a genuine modelling choice.
param_grid = {
    'fit_intercept': [True, False],
    'positive': [False, True],
}


In [None]:
# 5-fold cross-validated grid search over param_grid, scored by negated MSE.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [None]:
# Pull the winning parameter combination and the estimator built with it
# out of the fitted search object.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

In [None]:
# Score the tuned model on the held-out test split.
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the selected hyperparameters and the tuned model's test-set MSE.
print("Best Hyperparameters:", best_params)
print("Mean Squared Error:", mse)

K-Means clustering, Non - Hierarchical Clustering

In [None]:
from sklearn import datasets

# Clustering is unsupervised, so only the feature matrix is kept.
iris = datasets.load_iris()
X = iris["data"]

In [None]:
from sklearn.cluster import KMeans

# Iris contains three species, so request three clusters (matching the
# agglomerative example below) rather than relying on the library default.
# n_init and random_state are pinned so the labels and centroids are
# reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

kmeans.fit(X)

In [None]:
# Per-sample cluster assignments and the fitted cluster centres.
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

In [None]:
# Print the cluster assigned to each sample and the cluster centres.
print("Cluster Labels:", labels)
print("Centroids:", centroids)

Agglomerative (bottom-up) - Hierarchical clustering

In [None]:
from sklearn import datasets
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch

In [None]:
# Reload iris and keep only the feature matrix for clustering.
iris = datasets.load_iris()
X = iris["data"]

In [None]:
# Bottom-up clustering into three groups using Ward linkage; the labels are
# read from the fitted estimator.
clustering = AgglomerativeClustering(linkage='ward', n_clusters=3)
labels = clustering.fit(X).labels_

In [None]:
# Build the Ward linkage matrix with SciPy, then draw the merge tree.
ward_links = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_links)
plt.title("Dendrogram")
plt.xlabel("Samples")
plt.ylabel("Distance")
plt.show()

In [None]:
# Print the cluster assigned to each sample by the agglomerative model.
print("Cluster Labels:", labels)

Apriori algorithm

In [None]:
import pandas as pd
import numpy as np
import mlxtend
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Toy market-basket data: one list of purchased items per transaction.
transactions = [['bread', 'milk', 'eggs'],
                ['bread', 'coffee'],
                ['bread', 'milk', 'coffee', 'eggs'],
                ['bread', 'milk', 'coffee'],
                ['bread', 'milk'],
                ['bread', 'milk', 'coffee'],
                ['bread'],
                ['bread', 'milk', 'coffee', 'eggs']]

# Transform the transaction data into a binary encoded format
# (one boolean column per distinct item).
encoder = TransactionEncoder()
transaction_array = encoder.fit_transform(transactions)
df = pd.DataFrame(transaction_array, columns=encoder.columns_)

# Apply the Apriori algorithm to find itemsets present in >= 20% of baskets
frequent_itemsets = apriori(df, min_support=0.2, use_colnames=True)

# Generate association rules from the frequent itemsets.
# The result is stored under its own name: assigning it to `association_rules`
# would overwrite the imported function, so any later call to
# association_rules(...) in the session would fail.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.6)

print("Frequent Itemsets:")
print(frequent_itemsets)

print("\nAssociation Rules:")
print(rules)


One Hot Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

data = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Red', 'Yellow']})

# Learn the distinct colours, then encode the column as indicator columns.
encoder = OneHotEncoder()
encoded_data = encoder.fit(data[['Color']]).transform(data[['Color']])

# Densify the encoded matrix and label each column after its category.
encoded_df = pd.DataFrame(
    encoded_data.toarray(),
    columns=encoder.get_feature_names_out(['Color']),
)

# Original column and its indicator columns, side by side.
final_df = pd.concat([data, encoded_df], axis="columns")
print(final_df)


Label Encoding

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.DataFrame({'Color': ['Red', 'Blue', 'Green', 'Red', 'Yellow']})

# Map every distinct colour to an integer code and wrap the codes in a
# one-column DataFrame.
encoder = LabelEncoder()
encoded_data = pd.DataFrame({'Encoded_Color': encoder.fit_transform(data['Color'])})

# Original column next to its integer encoding.
df = pd.concat([data, encoded_data], axis="columns")
print(df)


How to calculate the p-value

In [None]:
import scipy.stats as stats

sample1 = [1, 2, 3, 4, 5]
sample2 = [2, 4, 6, 8, 10]

# Perform independent two-sample t-test
t_statistic, p_value = stats.ttest_ind(sample1, sample2)

print("T-Statistic:", t_statistic)
# Report the p-value itself (a probability between 0 and 1). Multiplying it
# by 100 under the label "P-Value" makes it look ~100x larger than it is and
# invites a wrong comparison against thresholds such as 0.05.
print("P-Value:", p_value)


How to apply bagging ensemble technique 

In [None]:
from sklearn.ensemble import BaggingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Iris features and labels, with a seeded 80/20 train/test split.
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Three heterogeneous base learners ...
classifier1 = DecisionTreeClassifier()
classifier2 = SVC(kernel='linear')
classifier3 = SVC(kernel='rbf')

# ... named and combined by a voting classifier (no `voting` argument is
# given, so the library default applies).
base_classifiers = [
    ('dt', classifier1),
    ('svc_linear', classifier2),
    ('svc_rbf', classifier3),
]
voting_classifier = VotingClassifier(estimators=base_classifiers)

# Bagging: 10 copies of the voting ensemble; fit returns the estimator itself.
bagging_classifier = BaggingClassifier(voting_classifier, n_estimators=10)
bagging_classifier = bagging_classifier.fit(X_train, y_train)

predictions = bagging_classifier.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print("Accuracy:", accuracy)


How to apply boosting ensemble technique 

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset (example: Iris dataset)
data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a *weak* base classifier (a depth-1 decision stump).
# Boosting works by combining weak learners. An unrestricted tree typically
# fits the training data perfectly, and AdaBoost stops adding estimators as
# soon as one has zero training error -- the "ensemble" would then be a
# single tree. Seeded for reproducible results.
base_classifier = DecisionTreeClassifier(max_depth=1, random_state=42)

# Create an AdaBoost ensemble classifier with the base classifier
boosting_classifier = AdaBoostClassifier(base_classifier, n_estimators=10, random_state=42)

boosting_classifier.fit(X_train, y_train)

predictions = boosting_classifier.predict(X_test)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


How to apply Stacking ensemble technique

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

data = load_iris()
X = data.data
y = data.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply the same
# transformation to the test rows (no information leaks from the test set).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create the base classifiers
base_classifiers = [
    ('dt', DecisionTreeClassifier()),
    ('lr', LogisticRegression())
]

# Create the meta-classifier
meta_classifier = LogisticRegression()

# Create the stacking ensemble classifier
stacking_classifier = StackingClassifier(estimators=base_classifiers, final_estimator=meta_classifier)

stacking_classifier.fit(X_train_scaled, y_train)

# Predict on the *scaled* test features: the model was trained on
# standardized inputs, so feeding it raw X_test would evaluate it on data
# from a different scale than it learned from.
predictions = stacking_classifier.predict(X_test_scaled)

accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


How to perform Winsorization to treat outliers

In [2]:
from feature_engine.outliers import Winsorizer
import pandas as pd

# Two small numeric columns with a few large values at the upper end.
data = pd.DataFrame({
    'col1': [10, 20, 25, 30, 100, 200, 250, 300],
    'col2': [5, 15, 20, 25, 30, 150, 250, 500]
})

# Apply winsorization to the dataset using IQR: learn the capping limits for
# both tails (fold=1.5), then apply them to the same data.
winsorizer = Winsorizer(capping_method='iqr', tail='both', fold=1.5)
winsorized_data = winsorizer.fit(data).transform(data)

print("Original data:")
print(data)
print("\nWinsorized data:")
print(winsorized_data)


Original data:
   col1  col2
0    10     5
1    20    15
2    25    20
3    30    25
4   100    30
5   200   150
6   250   250
7   300   500

Winsorized data:
   col1     col2
0    10    5.000
1    20   15.000
2    25   20.000
3    30   25.000
4   100   30.000
5   200  150.000
6   250  250.000
7   300  409.375
