In [None]:
# Part 1: EDA and Prediction without Pipeline
#
# Loads a CSV, prints a quick exploratory summary, trains a decision tree on an
# 80/20 train/test split, reports accuracy and the confusion matrix, and plots
# the class distribution of the target column.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier  # Replace with your chosen classifier

# Load the dataset (replace 'your_dataset.csv' with the actual dataset file)
data = pd.read_csv('your_dataset.csv')  # Replace with your dataset

# Basic EDA: first rows, column dtypes / non-null counts, summary statistics
print(data.head())
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line after the report).
data.info()
print(data.describe())

# Separate the features (X) from the target (y)
X = data.drop('target_column', axis=1)  # Replace 'target_column' with the actual target column name
y = data['target_column']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build and train the model. The tree is seeded as well: without random_state
# the fitted tree (and therefore the metrics below) can differ between runs
# even though the split itself is fixed.
model = DecisionTreeClassifier(random_state=42)  # Replace with your chosen classifier
model.fit(X_train, y_train)

# Predict on the held-out test set
y_pred = model.predict(X_test)

# Evaluate: overall accuracy and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion)

# Visualization: bar chart of the target variable distribution, drawn on an
# explicit Axes so the figure does not depend on pyplot's implicit current axes
fig, ax = plt.subplots(figsize=(8, 6))
data['target_column'].value_counts().plot(kind='bar', color='skyblue', rot=0, ax=ax)
ax.set_title('Distribution of Target Variable')
ax.set_xlabel('Target Classes')
ax.set_ylabel('Count')
plt.show()

# End of Part 1


In [None]:
# Part 2: EDA and kNN Prediction in a Pipeline
#
# Loads a CSV, splits it 80/20, and trains a scaling + kNN pipeline so that the
# scaler is fitted on the training rows only. Reports accuracy and the
# confusion matrix on the held-out test set.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  # Replace with your chosen preprocessing steps
from sklearn.neighbors import KNeighborsClassifier  # Replace with your chosen classifier
from sklearn.pipeline import Pipeline
# Needed by the evaluation step below; without it this cell only works when
# Part 1 has already been run in the same kernel.
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset (replace 'your_dataset.csv' with the actual dataset file)
data = pd.read_csv('your_dataset.csv')  # Replace with your dataset

# Separate the features (X) from the target (y)
X = data.drop('target_column', axis=1)  # Replace 'target_column' with the actual target column name
y = data['target_column']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing sub-pipeline (e.g., scaling, handling missing values)
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Replace with your preprocessing steps
])

# Full pipeline: preprocessing followed by the kNN classifier
knn_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', KNeighborsClassifier()),  # Replace with your chosen classifier
])

# Fit the preprocessing steps and the classifier on the training data
knn_pipeline.fit(X_train, y_train)

# Predict on the held-out test set; the pipeline applies the fitted preprocessing first
y_pred = knn_pipeline.predict(X_test)

# Evaluate: overall accuracy and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion)

# TODO: Visualize the results (if applicable)
# You can add visualization code here

# End of Part 2


In [None]:
# Part 3: Combine Pipeline and GridSearch for Hyperparameter Tuning
#
# Builds the same scaling + kNN pipeline as Part 2 and tunes the classifier's
# n_neighbors and weights with 5-fold cross-validated grid search on the
# training split, then evaluates the selected pipeline on the test split.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler  # Replace with your chosen preprocessing steps
from sklearn.neighbors import KNeighborsClassifier  # Replace with your chosen classifier
from sklearn.pipeline import Pipeline
# Needed by the evaluation step below; without it this cell only works when
# Part 1 has already been run in the same kernel.
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset (replace 'your_dataset.csv' with the actual dataset file)
data = pd.read_csv('your_dataset.csv')  # Replace with your dataset

# Separate the features (X) from the target (y)
X = data.drop('target_column', axis=1)  # Replace 'target_column' with the actual target column name
y = data['target_column']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing sub-pipeline (e.g., scaling, handling missing values)
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Replace with your preprocessing steps
])

# Full pipeline: preprocessing followed by the kNN classifier
knn_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', KNeighborsClassifier()),  # Replace with your chosen classifier
])

# Hyperparameter grid; the 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'
param_grid = {
    'classifier__n_neighbors': [3, 5, 7],  # Replace with K values to test
    'classifier__weights': ['uniform', 'distance'],  # Replace with weight options to test
}

# 5-fold cross-validated search over the grid, using the training data only
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# With the default refit=True, GridSearchCV has already refitted the best
# pipeline on the whole training set, so no second fit() call is needed.
best_knn_pipeline = grid_search.best_estimator_

# Predict on the held-out test set using the optimized pipeline
y_pred = best_knn_pipeline.predict(X_test)

# Evaluate: overall accuracy and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion)

# TODO: Visualize the results (if applicable)
# You can add visualization code here

# End of Part 3


In [None]:
# Part 4: Save and Load Model (Optional)
#
# Persists the tuned pipeline to disk with joblib, loads it back, and uses the
# loaded copy to predict.
#
# Requires Part 3 to have been run in this kernel: this cell reads
# 'best_knn_pipeline' and 'X_test' from it.

import joblib

# Save the optimized model to a file
model_filename = 'knn_model.pkl'
joblib.dump(best_knn_pipeline, model_filename)

# Load the saved model from the file.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded_model = joblib.load(model_filename)

# Use the loaded model to make predictions (e.g., on new data).
# The first rows of the test set serve as a runnable stand-in here; new data
# must provide the same feature columns the pipeline was trained on.
new_data = X_test.head()  # Replace with your new data for prediction
new_predictions = loaded_model.predict(new_data)

# You can now use 'new_predictions' for further analysis

# End of Part 4 (Optional)


In [None]:
# Full AML Lab Sessional
#
# End-to-end version of Parts 2-4 in a single cell: load the data, split it,
# tune a scaling + kNN pipeline with cross-validated grid search, evaluate it
# on the test split, then save the tuned pipeline and reload it to predict.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler  # Replace with your chosen preprocessing steps
from sklearn.neighbors import KNeighborsClassifier  # Replace with your chosen classifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
import joblib

# Load the dataset (replace 'your_dataset.csv' with the actual dataset file)
data = pd.read_csv('your_dataset.csv')  # Replace with your dataset

# Separate the features (X) from the target (y)
X = data.drop('target_column', axis=1)  # Replace 'target_column' with the actual target column name
y = data['target_column']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing sub-pipeline (e.g., scaling, handling missing values)
preprocessing_pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Replace with your preprocessing steps
])

# Full pipeline: preprocessing followed by the kNN classifier
knn_pipeline = Pipeline([
    ('preprocessing', preprocessing_pipeline),
    ('classifier', KNeighborsClassifier()),  # Replace with your chosen classifier
])

# Hyperparameter grid; the 'classifier__' prefix routes each parameter to the
# pipeline step named 'classifier'
param_grid = {
    'classifier__n_neighbors': [3, 5, 7],  # Replace with K values to test
    'classifier__weights': ['uniform', 'distance'],  # Replace with weight options to test
}

# 5-fold cross-validated search over the grid, using the training data only
grid_search = GridSearchCV(knn_pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_}")

# With the default refit=True, GridSearchCV has already refitted the best
# pipeline on the whole training set, so no second fit() call is needed.
best_knn_pipeline = grid_search.best_estimator_

# Predict on the held-out test set using the optimized pipeline
y_pred = best_knn_pipeline.predict(X_test)

# Evaluate: overall accuracy and the confusion matrix
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(confusion)

# TODO: Visualize the results (if applicable)
# You can add visualization code here

# Save the optimized model to a file
model_filename = 'knn_model.pkl'
joblib.dump(best_knn_pipeline, model_filename)

# Load the saved model from the file.
# NOTE: joblib files are pickle-based; only load files you created or trust.
loaded_model = joblib.load(model_filename)

# Use the loaded model to make predictions (e.g., on new data).
# The first rows of the test set serve as a runnable stand-in here; new data
# must provide the same feature columns the pipeline was trained on.
new_data = X_test.head()  # Replace with your new data for prediction
new_predictions = loaded_model.predict(new_data)

# End of AML Lab Sessional
