<a href="https://colab.research.google.com/github/amirabbasgashtil/data-mining-course/blob/main/notebooks/Han_chapter6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Han - classification, basic concepts and methods - chapter 6

## decision tree induction
Decision tree induction is the learning of decision trees from class-labeled training tuples. A decision
tree is a flowchart-like tree structure.

In [1]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text

wine = datasets.load_wine()
X = wine.data
y = wine.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)

accuracy = classifier.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

tree_rules = export_text(classifier, feature_names=wine.feature_names)
print("Decision Tree Rules:\n", tree_rules)

Accuracy: 0.94
Decision Tree Rules:
 |--- color_intensity <= 3.82
|   |--- proline <= 1002.50
|   |   |--- ash <= 3.07
|   |   |   |--- class: 1
|   |   |--- ash >  3.07
|   |   |   |--- class: 0
|   |--- proline >  1002.50
|   |   |--- class: 0
|--- color_intensity >  3.82
|   |--- flavanoids <= 1.40
|   |   |--- class: 2
|   |--- flavanoids >  1.40
|   |   |--- proline <= 724.50
|   |   |   |--- malic_acid <= 3.92
|   |   |   |   |--- class: 1
|   |   |   |--- malic_acid >  3.92
|   |   |   |   |--- class: 0
|   |   |--- proline >  724.50
|   |   |   |--- class: 0



## Attribute selection measures
An attribute selection measure is a heuristic for selecting the splitting criterion that “best” separates
a given data partition, D, of class-labeled training tuples into individual classes.

**Information Gain** Measures how much "information" or "uncertainty" is reduced by choosing a particular attribute.

**Gain Ratio** A refinement of information gain that avoids a bias toward attributes with many distinct values

**Gini Index** Measures the “impurity” of a dataset; it quantifies how mixed the classes are within the subsets after a split.

**Chi-Square** A statistical measure to test the independence of an attribute with respect to the target variable.

In [2]:

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.feature_selection import SelectKBest, chi2

# Load the wine dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use Information Gain for feature selection
selector = SelectKBest(score_func=chi2, k=2)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Fit the classifier to the selected training data
clf.fit(X_train_selected, y_train)

# Make predictions on the selected test data
predictions = clf.predict(X_test_selected)

# Display the accuracy of the model
accuracy = clf.score(X_test_selected, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Display the decision tree rules for the selected features
selected_feature_names = [wine.feature_names[i] for i in selector.get_support(indices=True)]
tree_rules = export_text(clf, feature_names=selected_feature_names)
print("Decision Tree Rules for Selected Features:\n", tree_rules)


Accuracy: 0.86
Decision Tree Rules for Selected Features:
 |--- color_intensity <= 3.82
|   |--- proline <= 1002.50
|   |   |--- proline <= 790.00
|   |   |   |--- class: 1
|   |   |--- proline >  790.00
|   |   |   |--- color_intensity <= 3.46
|   |   |   |   |--- class: 1
|   |   |   |--- color_intensity >  3.46
|   |   |   |   |--- class: 0
|   |--- proline >  1002.50
|   |   |--- class: 0
|--- color_intensity >  3.82
|   |--- proline <= 755.00
|   |   |--- color_intensity <= 4.85
|   |   |   |--- proline <= 517.50
|   |   |   |   |--- class: 1
|   |   |   |--- proline >  517.50
|   |   |   |   |--- proline <= 645.00
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- proline >  645.00
|   |   |   |   |   |--- color_intensity <= 3.88
|   |   |   |   |   |   |--- class: 2
|   |   |   |   |   |--- color_intensity >  3.88
|   |   |   |   |   |   |--- proline <= 670.00
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- proline >  670.00
|   |   |   |   |   |   |  

## tree pruning
Pruning is a data compression technique in machine learning and search algorithms that reduces the size of decision trees by removing sections of the tree that are non-critical and redundant to classify instances.


In [3]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import GridSearchCV

# Load the Iris dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a decision tree classifier
clf = DecisionTreeClassifier()

# Define the hyperparameter grid for cost-complexity pruning
param_grid = {'ccp_alpha': [0.001, 0.002, 0.003, 0.004, 0.005]}

# Use GridSearchCV to find the best hyperparameter
grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best decision tree classifier
best_clf = grid_search.best_estimator_
print(best_clf)

# Make predictions on the test data
predictions = best_clf.predict(X_test)

# Display the accuracy of the pruned tree
accuracy = best_clf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

# Display the decision tree rules for the pruned tree
tree_rules = export_text(best_clf, feature_names=wine.feature_names)
print("Decision Tree Rules for Pruned Tree:\n", tree_rules)

DecisionTreeClassifier(ccp_alpha=0.001)
Accuracy: 0.94
Decision Tree Rules for Pruned Tree:
 |--- color_intensity <= 3.82
|   |--- proline <= 1002.50
|   |   |--- ash <= 3.07
|   |   |   |--- class: 1
|   |   |--- ash >  3.07
|   |   |   |--- class: 0
|   |--- proline >  1002.50
|   |   |--- class: 0
|--- color_intensity >  3.82
|   |--- flavanoids <= 1.40
|   |   |--- class: 2
|   |--- flavanoids >  1.40
|   |   |--- proline <= 724.50
|   |   |   |--- alcohol <= 13.14
|   |   |   |   |--- class: 1
|   |   |   |--- alcohol >  13.14
|   |   |   |   |--- class: 0
|   |   |--- proline >  724.50
|   |   |   |--- class: 0



## bayes theorem

In [4]:
%pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from ucimlrepo import fetch_ucirepo

# load spambase dataset
spambase = fetch_ucirepo(id=94)

X = spambase.data.features
y = spambase.data.targets
print(X.head())

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Naive Bayes classifier (MultinomialNB for discrete features)
clf = MultinomialNB()

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
predictions = clf.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f" \nAccuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

   word_freq_make  word_freq_address  word_freq_all  word_freq_3d  \
0            0.00               0.64           0.64           0.0   
1            0.21               0.28           0.50           0.0   
2            0.06               0.00           0.71           0.0   
3            0.00               0.00           0.00           0.0   
4            0.00               0.00           0.00           0.0   

   word_freq_our  word_freq_over  word_freq_remove  word_freq_internet  \
0           0.32            0.00              0.00                0.00   
1           0.14            0.28              0.21                0.07   
2           1.23            0.19              0.19                0.12   
3           0.63            0.00              0.31                0.63   
4           0.63            0.00              0.31                0.63   

   word_freq_order  word_freq_mail  ...  word_freq_conference  char_freq_;  \
0             0.00            0.00  ...                   0.0 

  y = column_or_1d(y, warn=True)


## Naïve Bayesian classification

In [6]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Load the 20 Newsgroups dataset
newsgroups = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(newsgroups.data, newsgroups.target, test_size=0.2, random_state=42)

# Convert text data to numerical features using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Create a Naïve Bayes classifier (MultinomialNB for discrete features)
clf = MultinomialNB()

# Train the classifier on the TF-IDF transformed training data
clf.fit(X_train_tfidf, y_train)

# Make predictions on the TF-IDF transformed test data
predictions = clf.predict(X_test_tfidf)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions, target_names=newsgroups.target_names))


Accuracy: 0.66
Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.86      0.12      0.21       151
           comp.graphics       0.70      0.61      0.66       202
 comp.os.ms-windows.misc       0.67      0.61      0.64       195
comp.sys.ibm.pc.hardware       0.52      0.78      0.62       183
   comp.sys.mac.hardware       0.89      0.64      0.74       205
          comp.windows.x       0.89      0.81      0.85       215
            misc.forsale       0.86      0.60      0.71       193
               rec.autos       0.85      0.73      0.79       196
         rec.motorcycles       0.51      0.74      0.61       168
      rec.sport.baseball       0.96      0.77      0.86       211
        rec.sport.hockey       0.88      0.88      0.88       198
               sci.crypt       0.63      0.83      0.71       201
         sci.electronics       0.85      0.55      0.67       202
                 sci.med       0.88 

## k-nearest-neighbor classifiers

In [7]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a k-NN classifier with k=3
clf = KNeighborsClassifier(n_neighbors=3)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
predictions = clf.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions, target_names=wine.target_names))


Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

     class_0       0.86      0.86      0.86        14
     class_1       0.92      0.79      0.85        14
     class_2       0.60      0.75      0.67         8

    accuracy                           0.81        36
   macro avg       0.79      0.80      0.79        36
weighted avg       0.82      0.81      0.81        36



## Case-based reasoning
Case-based reasoning (CBR) is a problem-solving technique that uses past experiences to solve new problems. It's a machine learning technique that's similar to analogical processing and can be used in a variety of applications, including medicine, law, and computer algorithms

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import files

# Load the MovieLens dataset
uploaded = files.upload()
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
print(movies.head())
print(ratings.head())

# Merge movies and ratings data
movie_ratings = pd.merge(ratings, movies, on='movieId')
print(movie_ratings.head())

# Create a user-item matrix for collaborative filtering
user_item_matrix = movie_ratings.pivot_table(index='userId', columns='title', values='rating', fill_value=0)

# Use TF-IDF to convert movie titles into numerical features
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['title'])

# Calculate cosine similarity between movie titles
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations using CBR
def get_movie_recommendations(movie_title):
    movie_indices = movies.index[movies['title'] == movie_title].tolist()
    if not movie_indices:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []
    movie_index = movie_indices[0]
    cosine_scores = list(enumerate(cosine_sim[movie_index]))
    cosine_scores = sorted(cosine_scores, key=lambda x: x[1], reverse=True)
    top_similar_movies = cosine_scores[1:6]  # 5 similar movies Exclude the input movie itself

    recommended_movies = []
    for index in top_similar_movies:
        recommended_movies.append(movies['title'].iloc[index])

    return recommended_movies

# Example usage
input_movie = "Toy Story (1995)"
recommendations = get_movie_recommendations(input_movie)

# Display the recommendations
print(f"\n\nMovies similar to '{input_movie}':")
for movie in recommendations:
    print("-", movie)

## Linear Regression

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# California Housing dataset
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.DataFrame(housing.target, columns=['target'])
print(X.head())

# median income in block group feature => MedInc
X_feature = X[['MedInc']]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_feature, y, test_size=0.2, random_state=42)

# Create a linear regression model
model = LinearRegression()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display the model coefficients and performance metrics
print("Coefficients:", model.coef_)
print("Mean Squared Error:", mse)
print("R-squared:", r2)

# Plot the regression line
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_pred, color='blue', linewidth=3)
plt.xlabel('Average Number of Rooms (RM)')
plt.ylabel('House Price')
plt.title('Linear Regression: House Price Prediction')
plt.show()

## Perceptron: turning linear regression to classification
Perceptron is Machine Learning algorithm for supervised learning of various binary classification tasks.

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load the Wine dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Consider only the first two features for simplicity and binary classification
X = X[:, :2]

# Map wine classes to binary classes (setosa vs. non-setosa)
y_binary = (y == 0).astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.2, random_state=42)

# Create a perceptron classifier
perceptron = Perceptron()

# Train the perceptron on the training data
perceptron.fit(X_train, y_train)

# Make predictions on the test data
predictions = perceptron.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

# Plot the decision boundary
plt.scatter(X[:, 0], X[:, 1], c=y_binary, cmap=plt.cm.Paired, edgecolors='k')
plt.xlabel('Sepal Length (cm)')
plt.ylabel('Sepal Width (cm)')
plt.title('Perceptron: Iris Binary Classification')
plt.show()

## Logistic regression

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import files

# Load the Titanic dataset
uploaded = files.upload()
titanic = pd.read_csv('titanic.csv')

# Drop rows with missing values and select relevant features
titanic = titanic.dropna(subset=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived'])
X = titanic[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = titanic['Survived']

# Convert categorical features to numerical using one-hot encoding
X = pd.get_dummies(X, columns=['Sex', 'Embarked'], drop_first=True)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a logistic regression model
logreg = LogisticRegression()

# Train the model on the training data
logreg.fit(X_train, y_train)

# Make predictions on the test data
predictions = logreg.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))

# Plot the confusion matrix
cm = pd.crosstab(y_test, predictions, rownames=['Actual'], colnames=['Predicted'])
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.show()

## Introducing ensemble methods
Ensemble methods are machine learning algorithms that combine multiple predictive models to improve the accuracy and stability of predictions.
like Random forrest which is combining of multiple decision tree classifiers

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Wine dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_classifier.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))


## Bagging
a machine learning technique that uses multiple models to improve the accuracy and stability of predictions

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier with 20 trees
rf_classifier = RandomForestClassifier(n_estimators=20, random_state=42)

# Train the classifier on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_classifier.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))


## Boosting
Boosting is a machine learning technique that improves the accuracy of predictive models by combining multiple weak learners into a single strong learner:

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Iris dataset
wine = datasets.load_wine()
X = wine.data
y = wine.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an AdaBoost classifier with 20 weak learners (Decision Trees)
adaboost_classifier = AdaBoostClassifier(n_estimators=20, random_state=42)

# Train the classifier on the training data
adaboost_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = adaboost_classifier.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))


## Random forests

In [None]:
# Import necessary libraries
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer Wisconsin dataset
cancer = datasets.load_breast_cancer()
X = cancer.data
y = cancer.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest classifier with 100 trees
random_forest_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier on the training data
random_forest_classifier.fit(X_train, y_train)

# Make predictions on the test data
predictions = random_forest_classifier.predict(X_test)

# Display the accuracy and classification report
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, predictions))
