# Data Mining Tasks

# Predictive Tasks


## Classification

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Load the iris dataset: feature matrix X and class labels y
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a DecisionTreeClassifier. The seed is fixed because tree construction
# breaks ties between equally good candidate splits at random, so an unseeded
# tree is not guaranteed to be the same from one run to the next.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Use the trained classifier to make predictions on the test set
y_pred = clf.predict(X_test)
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

## Regression

In [23]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset.
# NOTE: this cell rebinds X, y and the train/test variables that the
# classification cell defined for the iris data.
housing = fetch_california_housing()
X = housing.data
y = housing.target

# Split the data into training and test sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
reg = LinearRegression()
reg.fit(X_train, y_train)

# Use the trained regressor to make predictions on the test set
y_pred = reg.predict(X_test)
print("y_pred:", y_pred)

# Compute and print the root mean squared error. The square root is taken
# explicitly because the `squared=False` shortcut of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form works on both
# older and current releases.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print(f"Root Mean Squared Error (RMSE): {rmse}")


y_pred: [0.71912284 1.76401657 2.70965883 ... 4.46877017 1.18751119 2.00940251]
Root Mean Squared Error (RMSE): 0.7455813830127751


# Clustering

In [3]:

from sklearn.cluster import KMeans

# Cluster the iris measurements; `iris` is the dataset object loaded in the
# classification cell, so that cell must have been run first.
X = iris.data

# Build a 3-cluster KMeans model (seeded) and fit it in one step;
# fit() returns the estimator itself, so `kmeans` is the fitted model.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)

# Assign every sample to its nearest learned cluster centre and display the labels
labels = kmeans.predict(X)
labels



array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 2,
       2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0])

# Association Analysis

In [18]:
#!pip install mlxtend

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

# List of transactions, where each transaction is a list of purchased items
data = [
    ['milk', 'onion', 'nutmeg', 'kidney beans', 'eggs', 'yogurt'],
    ['dill', 'onion', 'nutmeg', 'kidney beans', 'eggs', 'yogurt'],
    ['milk', 'apple', 'kidney beans', 'eggs'],
    ['milk', 'unicorn', 'corn', 'kidney beans', 'yogurt'],
    ['corn', 'onion', 'onion', 'kidney beans', 'ice cream', 'eggs']
]

# Encode the transactions as a table with one column per distinct item and
# one row per transaction, which is the input format apriori expects
te = TransactionEncoder()
te.fit(data)
te_ary = te.transform(data)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Compute all frequent itemsets (support of at least 0.6, i.e. 3 of the 5 transactions)
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)

# Generate all rules with confidence >= 0.75.
# (The filter is on confidence, not lift: rules with lift == 1.0 are kept.)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.75)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(eggs),(kidney beans),0.8,1.0,0.8,1.0,1.0,0.0,inf
1,(kidney beans),(eggs),1.0,0.8,0.8,0.8,1.0,0.0,1.0
2,(onion),(eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
3,(milk),(kidney beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
4,(onion),(kidney beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
5,(yogurt),(kidney beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
6,"(onion, eggs)",(kidney beans),0.6,1.0,0.6,1.0,1.0,0.0,inf
7,"(onion, kidney beans)",(eggs),0.6,0.8,0.6,1.0,1.25,0.12,inf
8,(onion),"(eggs, kidney beans)",0.6,0.8,0.6,1.0,1.25,0.12,inf
