In [10]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import joblib
from google.colab import drive
from sklearn.metrics import accuracy_score, classification_report

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# Path to the dataset in Google Drive
dataset_path = '/content/drive/My Drive/Scalable_ML_Model/train.csv'

# Load dataset
df = pd.read_csv(dataset_path)

# Check for missing values
print("Missing values per column:\n", df.isnull().sum())

# Handle missing values
# For numeric columns, fill with mean
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())

# For non-numeric columns, fill with mode
non_numeric_cols = df.select_dtypes(exclude=['number']).columns
for col in non_numeric_cols:
    df[col] = df[col].fillna(df[col].mode()[0])

# Verify no missing values remain
print("Missing values after preprocessing:\n", df.isnull().sum())

# Preprocessing
X = df.drop("target", axis=1)  # Features
y = df["target"]              # Target

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Save the model
model_save_path = '/content/drive/My Drive/Scalable_ML_Model/model.pkl'
joblib.dump(model, model_save_path)

print("Model trained and saved!")

Mounted at /content/drive
Missing values per column:
 feature1    0
feature2    0
feature3    0
feature4    0
target      0
dtype: int64
Missing values after preprocessing:
 feature1    0
feature2    0
feature3    0
feature4    0
target      0
dtype: int64
Model trained and saved!


In [11]:
import joblib
from sklearn.metrics import accuracy_score, classification_report

# Load the saved model
loaded_model = joblib.load('/content/drive/My Drive/Scalable_ML_Model/model.pkl')

# Make predictions on the test set
y_pred = loaded_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.15

Classification Report:
               precision    recall  f1-score   support

      setosa       0.12      0.12      0.12         8
  versicolor       0.00      0.00      0.00         5
   virginica       0.33      0.29      0.31         7

    accuracy                           0.15        20
   macro avg       0.15      0.14      0.14        20
weighted avg       0.17      0.15      0.16        20



In [21]:
from sklearn.model_selection import cross_val_score

# Cross-validation on the training set
cv_scores = cross_val_score(RandomForestClassifier(), X_train, y_train, cv=4)
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.2f}")

Cross-Validation Scores: [0.4  0.3  0.2  0.45]
Mean CV Accuracy: 0.34


In [22]:
# Check model parameters
print("Model Parameters:\n", loaded_model.get_params())

# Check feature importances (for Random Forest)
if hasattr(loaded_model, "feature_importances_"):
    feature_importances = loaded_model.feature_importances_
    print("\nFeature Importances:", feature_importances)

Model Parameters:
 {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Feature Importances: [0.23033243 0.23996877 0.24225054 0.28744826]


In [23]:
# Example input for prediction
sample_input = X_test.iloc[:5]  # First 5 rows from the test set
predictions = loaded_model.predict(sample_input)
print("Sample Predictions:", predictions)

Sample Predictions: ['setosa' 'setosa' 'versicolor' 'virginica' 'virginica']


In [24]:
from sklearn.dummy import DummyClassifier

# Train a baseline model
baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(X_train, y_train)
baseline_accuracy = baseline.score(X_test, y_test)
print(f"Baseline Accuracy: {baseline_accuracy:.2f}")

Baseline Accuracy: 0.40
