In [1]:
import pandas as pd
from sklearn.linear_model import Perceptron
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score

# Function to load dataset
def load_dataset(file_path):
    """Read the CSV located at ``file_path`` and return it as a DataFrame."""
    dataset = pd.read_csv(file_path)
    return dataset

# Function to split features and target
def split_features_and_target(df, target_col=-2):
    """Separate the feature columns from one target column.

    Both of the last two columns are left out of the feature matrix;
    ``target_col`` is the positional index of the column returned as ``y``.

    Returns:
        (X, y): the feature DataFrame and the selected target Series.
    """
    feature_columns = slice(None, -2)  # everything but the final two columns
    features = df.iloc[:, feature_columns]
    target = df.iloc[:, target_col]
    return features, target

# Function to split data into training and testing sets
def split_train_test(X, y, test_size=0.2, random_state=42):
    """Partition ``X`` and ``y`` into train and test subsets.

    Returns whatever ``train_test_split`` returns for the given ``test_size``
    and ``random_state`` (unpacked by callers as
    ``X_train, X_test, y_train, y_test``).
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_options)

# Function to perform RandomizedSearchCV on a Perceptron
def perform_random_search(X_train, y_train):
    """Tune a Perceptron with a randomized hyperparameter search.

    Ten candidate settings are drawn from the search space below and each is
    scored with 5-fold cross-validation. The fitted ``RandomizedSearchCV``
    object is returned.
    """
    # Candidate values for each Perceptron hyperparameter
    search_space = dict(
        penalty=[None, 'l2', 'l1', 'elasticnet'],
        alpha=[1e-4, 1e-3, 1e-2, 1e-1, 1],
        max_iter=[100, 500, 1000, 2000],
        tol=[1e-3, 1e-4, 1e-5],
    )

    searcher = RandomizedSearchCV(
        Perceptron(),
        param_distributions=search_space,
        n_iter=10,
        cv=5,
        random_state=42,
    )
    searcher.fit(X_train, y_train)
    return searcher

# Function to evaluate model on test data
def evaluate_model(model, X_test, y_test):
    """Return the accuracy of ``model``'s predictions for ``X_test`` against ``y_test``."""
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

# Main function
def main(file_path=None, target_col=-2):
    """Run the Perceptron experiment end to end and print its scores.

    Args:
        file_path: CSV to load. When omitted, the original machine-specific
            default location is used, so calling ``main()`` behaves as before.
            Pass an explicit path to run the experiment on another machine.
        target_col: Positional index of the target column (the notebook uses
            either -2 or -1).
    """
    if file_path is None:
        # Original hard-coded location, kept only as a backward-compatible default.
        file_path = r"C:\Users\Gurram Bhavya Reddy\OneDrive\Desktop\ML lab\10 - Java_AST.csv"

    # Load dataset
    df = load_dataset(file_path)

    # Split features and target
    X, y = split_features_and_target(df, target_col=target_col)

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Perform RandomizedSearchCV on Perceptron
    best_model = perform_random_search(X_train, y_train)

    # Print best parameters and cross-validation score
    print(f"Best parameters for Perceptron: {best_model.best_params_}")
    print(f"Best cross-validation score: {best_model.best_score_}")

    # Evaluate on unseen test data
    test_accuracy = evaluate_model(best_model, X_test, y_test)
    print(f"Accuracy on test data: {test_accuracy}")

# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()




Best parameters for Perceptron: {'tol': 0.001, 'penalty': None, 'max_iter': 1000, 'alpha': 0.0001}
Best cross-validation score: 0.19593908629441625
Accuracy on test data: 0.22672064777327935


In [None]:
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Function to load dataset
def load_dataset(file_path):
    """Parse the CSV at ``file_path`` with pandas and hand back the DataFrame."""
    frame = pd.read_csv(file_path)
    return frame

# Function to split features and target
def split_features_and_target(df, target_col=-1):
    """Return ``(X, y)`` for the given frame.

    ``X`` holds every column except the final two; ``y`` is the single column
    at positional index ``target_col``.
    """
    all_rows = slice(None)
    features = df.iloc[all_rows, slice(None, -2)]
    target = df.iloc[all_rows, target_col]
    return features, target

# Function to split data into training and testing sets
def split_train_test(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and testing portions.

    Delegates to ``train_test_split`` with the supplied ``test_size`` and
    ``random_state`` and returns its result unchanged.
    """
    options = dict(test_size=test_size, random_state=random_state)
    return train_test_split(X, y, **options)

# Function to create the MLP pipeline
def create_pipeline(random_state=42):
    """Build the scaling + MLP pipeline used by the hyperparameter search.

    Args:
        random_state: Seed passed to ``MLPClassifier``. Without it the network
            is initialised differently on every run, so results would not be
            repeatable even though the search itself is seeded.

    Returns:
        A ``Pipeline`` with steps ``'scaler'`` (StandardScaler) and ``'mlp'``
        (MLPClassifier). The step names are relied on by the ``mlp__*`` keys
        of the search space.
    """
    return Pipeline([
        ('scaler', StandardScaler()),  # Feature scaling
        # Increased max_iter for better convergence; seeded for reproducibility
        ('mlp', MLPClassifier(max_iter=2000, random_state=random_state)),
    ])

# Function to perform RandomizedSearchCV
def perform_random_search(X_train, y_train, pipeline):
    """Run a randomized hyperparameter search over ``pipeline``.

    Ten settings are sampled from the search space below (keys address the
    pipeline's ``'mlp'`` step) and evaluated with a 5-split ``StratifiedKFold``.
    The fitted ``RandomizedSearchCV`` object is returned.
    """
    # Candidate values for the MLP step of the pipeline
    search_space = {
        'mlp__hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
        'mlp__activation': ['tanh', 'relu'],
        'mlp__solver': ['adam', 'sgd'],
        'mlp__alpha': [1e-4, 1e-3, 1e-2],
        'mlp__learning_rate': ['constant', 'adaptive'],
    }

    # Stratified folds keep the class distribution similar across CV splits
    folds = StratifiedKFold(n_splits=5)

    searcher = RandomizedSearchCV(
        pipeline,
        param_distributions=search_space,
        n_iter=10,
        cv=folds,
        random_state=42,
    )
    searcher.fit(X_train, y_train)
    return searcher

# Function to evaluate model on test data
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` with ``model`` and return the accuracy against ``y_test``."""
    predicted_labels = model.predict(X_test)
    return accuracy_score(y_test, predicted_labels)

# Main function
def main(file_path=None, target_col=-1):
    """Run the MLP experiment end to end and print its scores.

    Args:
        file_path: CSV to load. When omitted, the original machine-specific
            default location is used, so calling ``main()`` behaves as before.
            Pass an explicit path to run the experiment on another machine.
        target_col: Positional index of the target column (the notebook uses
            either -2 or -1).
    """
    if file_path is None:
        # Original hard-coded location, kept only as a backward-compatible default.
        file_path = r"C:\Users\Gurram Bhavya Reddy\OneDrive\Desktop\ML lab\10 - Java_AST.csv"

    # Load dataset
    df = load_dataset(file_path)

    # Split features and target
    X, y = split_features_and_target(df, target_col=target_col)

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = split_train_test(X, y)

    # Create MLP pipeline
    pipeline = create_pipeline()

    # Perform RandomizedSearchCV on MLP
    best_model = perform_random_search(X_train, y_train, pipeline)

    # Print best parameters and cross-validation score
    print(f"Best parameters for MLP: {best_model.best_params_}")
    print(f"Best cross-validation score: {best_model.best_score_}")

    # Evaluate on unseen test data
    test_accuracy = evaluate_model(best_model, X_test, y_test)
    print(f"Accuracy on test data: {test_accuracy}")

# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()


In [None]:
pip install catboost xgboost

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB

# Function to load the dataset and remap target class labels
def load_and_prepare_data(file_path):
    """Load the CSV and return features plus an integer-encoded target.

    The last column is treated as the class label: its distinct values are
    sorted and renumbered 0, 1, 2, ... in that order. Both of the last two
    columns are excluded from the feature matrix.

    Args:
        file_path: Location of the CSV, passed straight to ``pd.read_csv``.

    Returns:
        (X, y): the feature DataFrame and an integer-typed Series holding the
        remapped class labels.
    """
    df = pd.read_csv(file_path)

    # Step 1: Sequentially remap the target class labels
    target = df.iloc[:, -1]  # Assuming the last column is the target column
    class_mapping = {
        old_label: new_label
        for new_label, old_label in enumerate(sorted(target.unique()))
    }

    # Build the encoded target as its own Series rather than assigning it back
    # through df.iloc: an in-place assignment may keep the column's original
    # dtype (e.g. object or float), so the labels would not be integer typed.
    y = target.map(class_mapping).astype(int)

    print("Mapped Class Labels:", y.unique())

    # Step 2: Features are all columns except the last two
    X = df.iloc[:, :-2]

    return X, y

# Function to split dataset into train and test sets
def split_data(X, y, test_size=0.2, random_state=42):
    """Divide ``X`` and ``y`` into train and test subsets.

    Forwards ``test_size`` and ``random_state`` to ``train_test_split`` and
    returns its result unchanged.
    """
    split_kwargs = {"test_size": test_size, "random_state": random_state}
    return train_test_split(X, y, **split_kwargs)

# Function to scale the data
def scale_data(X_train, X_test):
    """Standardize both splits with a scaler fitted on the training data only.

    Returns:
        (X_train_scaled, X_test_scaled)
    """
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(X_train)
    # The test split reuses the training statistics; it is never fitted on.
    test_scaled = standardizer.transform(X_test)
    return train_scaled, test_scaled

# Function to define classifiers
def get_classifiers():
    """Return a dict mapping display names to freshly constructed classifiers.

    Every estimator that accepts a seed is given the same one (42).
    """
    seed = 42
    named_models = [
        ('SVM', SVC(probability=True, random_state=seed)),
        ('Decision Tree', DecisionTreeClassifier(random_state=seed)),
        ('Random Forest', RandomForestClassifier(random_state=seed)),
        ('AdaBoost', AdaBoostClassifier(random_state=seed)),
        ('XGBoost', XGBClassifier(random_state=seed)),
        ('CatBoost', CatBoostClassifier(verbose=0, random_state=seed)),
        ('Naive Bayes', GaussianNB()),
    ]
    return dict(named_models)

# Function to evaluate a classifier and return the performance metrics
def evaluate_classifier(clf, clf_name, X_train_scaled, X_test_scaled, y_train, y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Args:
        clf: Estimator exposing ``fit`` and ``predict`` (and optionally
            ``predict_proba``).
        clf_name: Label used in progress output and in the result row.
        X_train_scaled, X_test_scaled: Feature matrices for fitting / scoring.
        y_train, y_test: Matching target labels.

    Returns:
        dict with keys 'Classifier', 'Accuracy', 'Precision', 'Recall',
        'F1 Score' and 'ROC AUC'. Precision, recall and F1 are weighted
        averages. 'ROC AUC' is None when the estimator has no
        ``predict_proba`` or when the score cannot be computed for this split.
    """
    print(f"Training {clf_name}...")
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    # ROC AUC needs class probabilities; it stays None when they are unavailable.
    roc_auc = None
    if hasattr(clf, 'predict_proba'):
        y_prob = clf.predict_proba(X_test_scaled)
        # Wrapping in a Series lets y_train be a plain array as well as a Series.
        is_binary = pd.Series(y_train).nunique(dropna=False) == 2
        try:
            if is_binary:
                # Binary case: score with the probability of the second class
                roc_auc = roc_auc_score(y_test, y_prob[:, 1])
            else:
                # Name the classes behind the probability columns explicitly
                # (falls back to sklearn's inference if clf has no classes_).
                roc_auc = roc_auc_score(
                    y_test,
                    y_prob,
                    multi_class='ovr',
                    labels=getattr(clf, 'classes_', None),
                )
        except ValueError as exc:
            # Raised e.g. when the test split does not contain every class;
            # report it instead of aborting the whole comparison.
            print(f"ROC AUC unavailable for {clf_name}: {exc}")

    return {
        'Classifier': clf_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'ROC AUC': roc_auc
    }

# Main function to run the classification and evaluation
def main(file_path=None):
    """Train every classifier from ``get_classifiers`` and print a metrics table.

    Args:
        file_path: CSV to load. When omitted, the original machine-specific
            default location is used, so calling ``main()`` behaves as before.
            Pass an explicit path to run the comparison on another machine.
    """
    if file_path is None:
        # Original hard-coded location, kept only as a backward-compatible default.
        file_path = r"C:\Users\Gurram Bhavya Reddy\OneDrive\Desktop\ML lab\10 - Java_AST.csv"

    # Load the dataset and prepare data
    X, y = load_and_prepare_data(file_path)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Scale the features
    X_train_scaled, X_test_scaled = scale_data(X_train, X_test)

    # Evaluate each classifier and collect one result row per model
    results = [
        evaluate_classifier(clf, clf_name, X_train_scaled, X_test_scaled, y_train, y_test)
        for clf_name, clf in get_classifiers().items()
    ]

    # Convert results to a DataFrame and display
    results_df = pd.DataFrame(results)
    print(results_df)

# Entry point: call main() only when this code runs as the top-level module,
# not when it is imported from elsewhere.
if __name__ == "__main__":
    main()