In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns 
import matplotlib.pyplot as plt

# plan
1. Data Preprocessing 
    - Encoding: Social_Responsiveness_Scale (scaling), Speech Delay/Language Disorder, Learning disorder,
      Genetic_Disorders, Depression, Global developmental delay/intellectual disability,
      Social/Behavioural Issues, Childhood Autism Rating Scale, Anxiety_disorder, Sex, Ethnicity,
      Jaundice, Family_mem_with_ASD, Who_completed_the_test, ASD_traits
    
    - Null-value handling for Social_Responsiveness_Scale, Qchat_10_Score, Depression, Social/Behavioural Issues
    
2. ML phase: train the models and make predictions
3. Select the best model
4. Explainable AI (XAI)
    

In [None]:
# Read the dataset from data_csv.csv (path is relative to the working directory)
df = pd.read_csv('data_csv.csv')
# Preview the first five rows
df.head()



In [None]:
# Build a ydata-profiling report for df and write it to ASD.html
from ydata_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_file(output_file = 'ASD.html')

In [None]:
import os
from pathlib import Path

# Image output folder, relative to the current working directory
IMAGES_PATH = Path() / "images" / "Data Preprocessing"
# Create it (and any missing parents); no error if it already exists
IMAGES_PATH.mkdir(parents=True, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure as ``IMAGES_PATH/<fig_id>.<fig_extension>``.

    Args:
        fig_id: File name without extension.
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    if tight_layout:
        plt.tight_layout()
    target = IMAGES_PATH.joinpath(f"{fig_id}.{fig_extension}")
    plt.savefig(target, format=fig_extension, dpi=resolution)


In [None]:
import matplotlib.pyplot as plt

# extra code – default font sizes, applied one rc group at a time
default_font_sizes = {
    'font': {'size': 14},
    'axes': {'labelsize': 14, 'titlesize': 14},
    'legend': {'fontsize': 14},
    'xtick': {'labelsize': 10},
    'ytick': {'labelsize': 10},
}
for rc_group, rc_settings in default_font_sizes.items():
    plt.rc(rc_group, **rc_settings)

# Histogram grid of df's columns; saved to disk before it is displayed
df.hist(figsize=(12, 8), bins=50)
save_fig("attribute_histogram_plots")  # extra code
plt.show()


In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) from DataFrame.describe()
df.describe()

In [None]:
# Number of missing values in each column
df.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Categorical columns that are converted to integer codes
categorical_columns = ['Speech Delay/Language Disorder', 'Learning disorder', 'Genetic_Disorders', 'Depression', 
                       'Global developmental delay/intellectual disability',
                       'Social/Behavioural Issues', 'Childhood Autism Rating Scale', 'Anxiety_disorder', 
                       'Sex', 'Ethnicity', 'Jaundice', 'Family_mem_with_ASD', 'Who_completed_the_test', 'ASD_traits']

# Encode each column with its own LabelEncoder, fitted on the observed values only.
# Missing values stay NaN so that the null-handling step that follows can still
# find them; fitting on the raw column would (depending on the scikit-learn
# version) either fail on mixed str/NaN data or turn NaN into one more class.
for column in categorical_columns:
    observed = df[column].notna()
    encoded = pd.Series(np.nan, index=df.index, dtype="float64")
    encoded[observed] = LabelEncoder().fit_transform(df.loc[observed, column])
    # Keep integer codes when nothing is missing; NaN forces a float column otherwise
    df[column] = encoded if encoded.isna().any() else encoded.astype("int64")


In [None]:
# Show the first row of df
df.head(1)

In [None]:
# Handling null values: count the missing entries per column first
df.isnull().sum()

In [None]:
# Drop every row that contains at least one missing value
df = df.dropna(axis=0)

In [None]:
# Re-count missing values per column after the rows were dropped
df.isnull().sum()

In [None]:
# Column names, non-null counts and dtypes of the current df
df.info()

In [None]:
# Preview the first five rows
df.head(5)

# Preprocessed Dataset Download

In [None]:
# Without a path, to_csv returns the entire dataset as one string, which floods
# the cell output. Preview only the first rows of the CSV text instead.
print(df.head().to_csv(index=False))

In [None]:
# Write df (without its index) into out.zip as a single member named preprocessed.csv
compression_opts = {'method': 'zip', 'archive_name': 'preprocessed.csv'}
df.to_csv('out.zip', compression=compression_opts, index=False)

In [None]:
from pathlib import Path

# Destination folder for the preprocessed dataset (absolute, machine-specific path)
folder_path = Path('C:/Shanila/CSE/cse445/final_project/git_project/Machine_Learning_Project/sha')
filepath = folder_path / 'preprocessed.csv'

folder_path.mkdir(parents=True, exist_ok=True)
# index=False, like the zip export: otherwise the row index is written as an
# extra unnamed column that comes back as a spurious column when the file is read.
df.to_csv(filepath, index=False)


In [None]:
# Example export into a nested folder that is created on demand.
# (The pasted ">>>" interpreter prompts were removed: they are not valid Python
# once the cell is run outside IPython's prompt stripping, e.g. as a script.)
import os
os.makedirs('folder/subfolder', exist_ok=True)
df.to_csv('folder/subfolder/out.csv')

In [None]:
import os

# Destination folder for the preprocessed dataset (absolute, machine-specific path)
folder_path = 'C:/Shanila/CSE/cse445/final_project/git_project/Machine_Learning_Project/sha'
os.makedirs(folder_path, exist_ok=True)

# index=False, like the zip export: otherwise the row index is written as an
# extra unnamed column that comes back as a spurious column when the file is read.
df.to_csv(os.path.join(folder_path, 'preprocessed.csv'), index=False)


# Entering the ML Phase

In [None]:
# Target: the 'Learning disorder' column
y = df['Learning disorder']
# Features: every other column except the case identifier
X = df.drop(columns=["Learning disorder", "CASE_NO_PATIENT'S"])


# Splitting Train and Test set in 80-20 ratio

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# SAVE TRAIN SET AND TEST SET

import pandas as pd
import os

# Both splits are stored under one project folder. The root is spelled once so
# the train and test folders cannot drift apart (the two hard-coded paths used
# different casing, "Machine_Learning_Project" vs "Machine_Learning_project").
# The casing below matches the paths the splits are later read back from.
project_folder = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha"
train_folder = project_folder + "\\train"
test_folder = project_folder + "\\test"

# Create the folders if they don't exist
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

# Save X_train and y_train side by side as one CSV file in the "train" folder
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv(os.path.join(train_folder, "train_data.csv"), index=False)

# Save X_test and y_test side by side as one CSV file in the "test" folder
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv(os.path.join(test_folder, "test_data.csv"), index=False)


In [None]:
#Importing ML Algorithms to test

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [None]:
#from sklearn.metrics.plot import confusion_matrix
#import error

In [None]:
from sklearn.metrics import mean_squared_error

def Results(clf):
    """Fit ``clf`` on the training split and print its evaluation.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    Prints the estimator, its train and test accuracy, then the confusion
    matrix and classification report for the test split, and a separator line.
    """
    print(clf)
    clf.fit(X_train, y_train)
    test_predictions = clf.predict(X_test)
    print('Train Accuracy', accuracy_score(y_train, clf.predict(X_train)))
    print('Test Accuracy', accuracy_score(y_test, test_predictions))
    # Both summaries compare the test labels with the test predictions
    for summarize in (confusion_matrix, classification_report):
        print(summarize(y_test, test_predictions))
    #also pretty confusion matrix
    print("-" * 60)
    

In [None]:
# Candidate models
knn = KNeighborsClassifier(n_neighbors = 5)
knn2 = KNeighborsClassifier(n_neighbors = 15)
dt1 = DecisionTreeClassifier() #Gini
dt2 = DecisionTreeClassifier(criterion = 'entropy') #entropy
rf = RandomForestClassifier()
nb = GaussianNB()
lr = LinearRegression()
lor = LogisticRegression()
dummy = DummyClassifier(strategy = "most_frequent")
# No dummy.fit(X, y) here: every model is fitted on the training split in the
# loop below, and fitting on the full dataset would show the baseline the test rows.

classifiers = [dummy,  dt1, dt2, rf, knn, knn2, nb, lr, lor]


for clf in classifiers:
    if isinstance(clf, LinearRegression):
        # LinearRegression is the only regressor in the list. Its score() is the
        # coefficient of determination (R^2), not an accuracy, so label it as such.
        print(clf)
        clf.fit(X_train, y_train)
        print('Train R^2', clf.score(X_train, y_train))
        print('Test R^2', clf.score(X_test, y_test))
        print("-" * 60)
    else:
        # LogisticRegression is a classifier and gets the full classification
        # summary (confusion matrix and report) like the other classifiers.
        Results(clf)
    
    

In [None]:
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error

def Results(clf, is_regression=False):
    """Fit ``clf`` on the training split and print its evaluation.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.

    Args:
        clf: Estimator providing ``fit``, ``predict`` and, for the regression
            branch, ``score``.
        is_regression: When true, report ``clf.score`` and the mean squared
            error; otherwise report accuracy and the classification report.
    """
    print(clf)
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    
    if is_regression:
        # For a regressor, score() is the coefficient of determination (R^2);
        # it was previously printed under the misleading label "Accuracy".
        train_r2 = clf.score(X_train, y_train)
        test_r2 = clf.score(X_test, y_test)
        mse = mean_squared_error(y_test, predictions)
        print(f'Train R^2: {train_r2:.2f}')
        print(f'Test R^2: {test_r2:.2f}')
        print(f'Mean Squared Error: {mse:.2f}')
    else:
        # Classification-specific evaluation metrics
        train_accuracy = accuracy_score(y_train, clf.predict(X_train))
        test_accuracy = accuracy_score(y_test, predictions)
        classification_report_str = classification_report(y_test, predictions)
        print(f'Train Accuracy: {train_accuracy:.2f}')
        print(f'Test Accuracy: {test_accuracy:.2f}')
        print('Classification Report:\n', classification_report_str)
    
    print("-" * 60)

# Candidate models
knn = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=15)
dt1 = DecisionTreeClassifier()  # Gini
dt2 = DecisionTreeClassifier(criterion='entropy')  # entropy
rf = RandomForestClassifier()
nb = GaussianNB()
lr = LinearRegression()
lor = LogisticRegression()
dummy = DummyClassifier(strategy="most_frequent")
# No dummy.fit(X, y) here: Results() fits every model on the training split,
# and fitting on the full dataset would show the baseline the test rows.

classifiers = [dummy, dt1, dt2, rf, knn, knn2, nb, lr, lor]

for clf in classifiers:
    # Only LinearRegression is a regressor. LogisticRegression predicts class
    # labels, so it must be reported with the classification metrics.
    is_regression = isinstance(clf, LinearRegression)
    Results(clf, is_regression)


In [None]:
# Save each result separately in the result folder
# (also has csv, doc and png images)


import pandas as pd
import os
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, mean_squared_error
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

# Define a function to save the results to a text (TXT) file and as a PNG image
def save_results_to_files(clf, txt_file_path, png_file_path, X_train, X_test, y_train, y_test):
    """Fit ``clf``, evaluate it on the test split and write the results to disk.

    Estimators exposing ``predict_proba`` are treated as classifiers: accuracy,
    confusion matrix and a per-class report are written to ``txt_file_path`` and
    the confusion matrix is rendered to ``png_file_path``. Any other estimator is
    treated as a regressor and only its train/test mean squared error is written.

    Args:
        clf: Estimator providing ``fit`` and ``predict``.
        txt_file_path: Destination of the text report.
        png_file_path: Destination of the confusion-matrix image (classifiers only).
        X_train, X_test, y_train, y_test: The data splits.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    is_classifier = hasattr(clf, 'predict_proba')

    # Create the output folders up front so open()/savefig do not fail on a fresh checkout
    for target in (txt_file_path, png_file_path):
        os.makedirs(os.path.dirname(target) or '.', exist_ok=True)

    if is_classifier:
        train_accuracy = accuracy_score(y_train, clf.predict(X_train))
        test_accuracy = accuracy_score(y_test, predictions)
        confusion = confusion_matrix(y_test, predictions)
        report = classification_report(y_test, predictions, output_dict=True)

        with open(txt_file_path, 'w') as file:
            file.write(f"Train Accuracy: {train_accuracy*100:.4f}%\n")
            file.write(f"Test Accuracy: {test_accuracy*100:.4f}%\n")
            file.write("Confusion Matrix:\n")
            file.write(str(confusion) + "\n")
            file.write("Classification Report:\n")

            # Write precision/recall/f1-score as percentages. The earlier version
            # looked for a '%' sign in the text report, which never contains one,
            # so nothing was ever converted; the dict form avoids parsing text.
            for label, metrics in report.items():
                if isinstance(metrics, dict):
                    file.write(
                        f"{label}: precision {metrics['precision']*100:.4f}% "
                        f"recall {metrics['recall']*100:.4f}% "
                        f"f1-score {metrics['f1-score']*100:.4f}% "
                        f"support {int(metrics['support'])}\n"
                    )
                else:
                    # Scalar entry of the report (e.g. overall accuracy)
                    file.write(f"{label}: {metrics*100:.4f}%\n")

        # Save the confusion matrix as a PNG image
        save_confusion_matrix_as_png(confusion, clf, png_file_path)
    else:
        # Regression model
        train_mse = mean_squared_error(y_train, clf.predict(X_train))
        test_mse = mean_squared_error(y_test, predictions)

        with open(txt_file_path, 'w') as file:
            file.write(f"Train Mean Squared Error: {train_mse:.4f}\n")
            file.write(f"Test Mean Squared Error: {test_mse:.4f}\n")

# Function to save the confusion matrix as a PNG image for classification models
def save_confusion_matrix_as_png(matrix, clf, png_file_path):
    """Draw ``matrix`` as an annotated heat map and save it to ``png_file_path``.

    Args:
        matrix: Confusion matrix; indexed as ``matrix[i, j]`` and queried for
            ``.max()`` and ``.shape``. Rows are labelled "Actual", columns "Predicted".
        clf: Estimator whose string form is shown in the title.
        png_file_path: Destination image path.
    """
    # Custom colour map with one colour step per count up to the largest cell
    palette = ['#B1BCE6', '#B2C8DF', '#C4D7E0', '#EFEFEF']
    cmap = LinearSegmentedColormap.from_list('Custom', palette, N=matrix.max() + 1)

    fig, ax = plt.subplots(figsize=(6, 6))
    heatmap = ax.imshow(matrix, interpolation='nearest', cmap=cmap)
    fig.colorbar(heatmap, ax=ax)

    # Write each count in the centre of its cell (x = column, y = row)
    for row in range(matrix.shape[0]):
        for col in range(matrix.shape[1]):
            ax.text(col, row, matrix[row, col], horizontalalignment='center',
                    verticalalignment='center', fontsize=12, color='black')

    ax.set_title(f'Confusion Matrix - {clf}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

    fig.savefig(png_file_path, bbox_inches='tight', dpi=100)
    plt.close(fig)

# Specify the directory for saving the result files
result_directory = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha\\result\\"

# List of classifiers to evaluate
classifiers = [dummy, dt1, dt2, rf, knn, knn2, nb, lr, lor]

# The class name alone is not unique: dt1/dt2 and knn/knn2 share a class, so the
# second model used to overwrite the first one's files. The first model of each
# class keeps the plain name; later ones get a numeric suffix (_2, _3, ...).
seen_class_names = {}

for clf in classifiers:
    base_name = clf.__class__.__name__
    seen_class_names[base_name] = seen_class_names.get(base_name, 0) + 1
    occurrence = seen_class_names[base_name]
    classifier_name = base_name if occurrence == 1 else f"{base_name}_{occurrence}"

    txt_file_path = os.path.join(result_directory, f"{classifier_name}_results.txt")
    png_file_path = os.path.join(result_directory, f"{classifier_name}_confusion_matrix.png")

    # Call the function to save the results and pass X_train, X_test, y_train, and y_test
    save_results_to_files(clf, txt_file_path, png_file_path, X_train, X_test, y_train, y_test)


In [None]:
# bar chart for test and train set

import matplotlib.pyplot as plt
import numpy as np

# Classifiers to compare (Linear Regression is left out).
# NOTE(review): the estimators are used as already fitted; this cell does not fit them.
classifiers = [dummy, dt1, dt2, rf, knn, knn2, nb, lor]

# Accuracy of every classifier on both splits
train_accuracies = [accuracy_score(y_train, clf.predict(X_train)) for clf in classifiers]
test_accuracies = [accuracy_score(y_test, clf.predict(X_test)) for clf in classifiers]

# Layout: one slot per classifier, holding a train bar and a test bar
positions = np.arange(len(classifiers))
bar_height = 0.35
train_colors = ['#7895B2'] * len(classifiers)
test_colors = ['#D2DAFF'] * len(classifiers)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(positions - bar_height/2, train_accuracies, bar_height, color=train_colors,
        edgecolor='black', linewidth=0.5, label='Train Set Accuracy')
ax.barh(positions + bar_height/2, test_accuracies, bar_height, color=test_colors,
        edgecolor='black', linewidth=0.5, label='Test Set Accuracy')

# Print the percentage just to the right of each bar
for i in range(len(classifiers)):
    ax.text(train_accuracies[i] + 0.005, i - bar_height / 2, f'{train_accuracies[i]*100:.2f}%', color='black', ha='left')
    ax.text(test_accuracies[i] + 0.005, i + bar_height / 2, f'{test_accuracies[i]*100:.2f}%', color='black', ha='left')

# One tick label per classifier, in the same order as `classifiers`
labels = ['Dummy', 'Decision Tree (Gini)', 'Decision Tree (Entropy)', 'Random Forest', 'K-Nearest Neighbors (k=5)', 'K-Nearest Neighbors (k=15)', 'Naive Bayes', 'Logistic Regression']
ax.set_yticks(positions)
ax.set_yticklabels(labels)

ax.set_xlabel('Accuracy')
ax.set_title('Classifier Train and Test Set Accuracy')

# Extend the x-axis past 1.0 to leave room for the percentage labels
ax.set_xlim(0, 1.3)
ax.legend(loc='best')
fig.tight_layout()

# Save the plot in high resolution, then display it
plot_save_path = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha\\result2\\classifier_accuracy_horizontal.png"
fig.savefig(plot_save_path, bbox_inches='tight', dpi=300)
plt.show()


In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd

# Reload the saved train and test splits from their CSV files
train_data = pd.read_csv("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha\\train\\train_data.csv")
test_data = pd.read_csv("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha\\test\\test_data.csv")

# "Learning disorder" is the target; every other column is a feature
y_train = train_data["Learning disorder"]
X_train = train_data.drop(columns=["Learning disorder"])
y_test = test_data["Learning disorder"]
X_test = test_data.drop(columns=["Learning disorder"])

# Models to evaluate, keyed by display name
classifiers = {
    "Dummy Classifier": DummyClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": GaussianNB()
}

# Metrics stored for every model, each computed from (y_test, predictions)
metric_functions = {
    "Accuracy": accuracy_score,
    "Classification Report": classification_report,
    "Confusion Matrix": confusion_matrix,
}

# Fit each model on the training split and score its test predictions
results = {}
for name, classifier in classifiers.items():
    predictions = classifier.fit(X_train, y_train).predict(X_test)
    results[name] = {
        metric: compute(y_test, predictions)
        for metric, compute in metric_functions.items()
    }

# One column per classifier, one row per metric
results_df = pd.DataFrame(results)

# Write the table without its index (the metric names are therefore not stored)
results_csv_path = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_project\\sha\\results.csv"
results_df.to_csv(results_csv_path, index=False)

# Optionally, you can also display the results DataFrame
print(results_df)


Charts

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Load the saved results table (one column per classifier)
results_df = pd.read_csv("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\results.csv")

# Column names are the classifier names; row 0 is read as the accuracies
classifiers = results_df.columns
accuracies = results_df.loc[0].values.astype(float)  # Convert the values to float

# Horizontal bar chart with one bar per classifier
fig, ax = plt.subplots(figsize=(10, 6))
bar_positions = np.arange(len(classifiers))
bar_colors = ['#7895B2', '#D2DAFF', '#B2C8DF', '#B1BCE6', '#EFEFEF', '#C4D7E0']
ax.barh(bar_positions, accuracies, color=bar_colors, edgecolor='black', linewidth=0.5)

# Print the percentage just to the right of each bar
for i, acc in enumerate(accuracies):
    ax.text(acc + 0.005, i, f'{acc * 100:.2f}%', color='black', va='center')

ax.set_yticks(bar_positions)
ax.set_yticklabels(classifiers)
ax.set_xlabel('Accuracy')
ax.set_title('Classifier Accuracies')

# Extend the x-axis past 1.0 to leave room for the percentage labels
ax.set_xlim(0, 1.3)

# Save the bar chart as a PNG, then display it
bar_chart_save_path = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result2\\bar_chart.png"
fig.savefig(bar_chart_save_path, bbox_inches='tight', dpi=300)
plt.show()


Table (TODO: update result2.xlsx, then re-run the cell below)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the results table from the Excel workbook
result2_df = pd.read_excel("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result2.xlsx")

# The figure only carries the table, so the axes themselves are hidden
fig, ax = plt.subplots(figsize=(12, 6))
ax.set_axis_off()

# First table row is the header, followed by one row per DataFrame row;
# row labels are an empty header cell plus the "Classifier" column
table_data = [result2_df.columns] + result2_df.values.tolist()
table = ax.table(
    cellText=table_data,
    loc='center',
    cellLoc='center',
    colWidths=[0.45]*len(result2_df.columns),
    rowLabels=[""] + list(result2_df["Classifier"]),
    rowLoc='center',
)

# Increase row height
table.scale(1, 2.5)  # You can adjust the scaling factor as needed

# Fixed font size instead of automatic sizing
table.auto_set_font_size(False)
table.set_fontsize(12)

# Save the table as a PNG image
table_save_path = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result2\\result2_table_wide.png"
fig.savefig(table_save_path, bbox_inches='tight', dpi=300)

# Show the table (optional)
plt.show()


# Hyperparameter Tuning

In [None]:
### Manual Hyperparameter Tuning
# 300 entropy trees, sqrt feature sampling, at least 10 samples per leaf
model = RandomForestClassifier(
    n_estimators=300,
    criterion='entropy',
    max_features='sqrt',
    min_samples_leaf=10,
    random_state=100,
)
model = model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Confusion matrix, accuracy and classification report on the test split
for summarize in (confusion_matrix, accuracy_score, classification_report):
    print(summarize(y_test, predictions))

In [None]:
### Manual Hyperparameter Tuning 2
# 500 gini trees, sqrt feature sampling, at least 10 samples per leaf
model = RandomForestClassifier(
    n_estimators=500,
    criterion='gini',
    max_features='sqrt',
    min_samples_leaf=10,
    random_state=100,
)
model = model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Confusion matrix, accuracy and classification report on the test split
for summarize in (confusion_matrix, accuracy_score, classification_report):
    print(summarize(y_test, predictions))

##### Randomized Search 

In [None]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split.
# 'auto' is intentionally not listed: it was deprecated and then removed from
# scikit-learn's RandomForestClassifier, so candidates using it fail to fit on
# current versions (for classifiers it was an alias of 'sqrt').
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 1000,10)]
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10,14]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4,6,8]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

In [None]:
# Randomized search: 100 sampled candidates from random_grid, 3-fold CV, all cores
rf = RandomForestClassifier()
rf_randomcv = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=100,
    n_jobs=-1,
)
### fit the randomized model
rf_randomcv.fit(X_train, y_train)

In [None]:
# Best parameter combination found by the randomized search
rf_randomcv.best_params_

In [None]:
# Display the fitted search object
rf_randomcv

In [None]:
# Estimator built with the best parameters from the randomized search
rf_randomcv.best_estimator_

In [None]:
# Keep a handle on the best estimator from the randomized search
best_random_grid=rf_randomcv.best_estimator_

In [None]:
from sklearn.metrics import accuracy_score
# Fit the best randomized-search estimator on the training split and score it on the test split
best_random_grid.fit(X_train, y_train)
y_pred = best_random_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score {accuracy_score(y_test, y_pred)}")
print(f"Classification report: {classification_report(y_test, y_pred)}")

#### Grid Search 

In [None]:
# Best randomized-search parameters; the grid below is built around them
rf_randomcv.best_params_

In [None]:
from sklearn.model_selection import GridSearchCV

best_params = rf_randomcv.best_params_


def _neighbours(center, offsets, minimum):
    """Return the sorted, de-duplicated values ``center + offset`` that are >= ``minimum``."""
    return sorted({center + offset for offset in offsets if center + offset >= minimum})


# Grid centred on the randomized-search optimum. Candidates below each
# parameter's valid minimum are dropped: with a best min_samples_split of 2 or a
# best n_estimators of 200, the plain offsets would produce 0/1 and 0, which
# RandomForestClassifier rejects.
param_grid = {
    'criterion': [best_params['criterion']],
    'max_depth': [best_params['max_depth']],
    'max_features': [best_params['max_features']],
    'min_samples_leaf': _neighbours(best_params['min_samples_leaf'], (0, 2, 4), 1),
    'min_samples_split': _neighbours(best_params['min_samples_split'], (-2, -1, 0, 1, 2), 2),
    'n_estimators': _neighbours(best_params['n_estimators'], (-200, -100, 0, 100, 200), 1),
}

print(param_grid)

In [None]:
%%capture
# Exhaustive search over param_grid with 10-fold cross-validation on all cores.
# The %%capture magic at the top of this cell captures the verbose fitting log.
#### Fit the grid_search to the data
rf=RandomForestClassifier()
grid_search=GridSearchCV(estimator=rf,param_grid=param_grid,cv=10,n_jobs=-1,verbose=2)
grid_search.fit(X_train,y_train)

In [None]:
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Read the saved grid-search table into its own variable so the preprocessed
# dataset held in `df` is not overwritten by an unrelated table.
# NOTE(review): nothing in the visible notebook writes grid_search_results.txt;
# presumably it is an export of grid_search.cv_results_ — confirm.
grid_results = pd.read_csv("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result3\\grid_search_results.txt", delimiter="\t", header=0)  # Adjust the delimiter if needed

# Sort by mean_test_score in descending order; the best combination is the first row
sorted_results = grid_results.sort_values(by='mean_test_score', ascending=False)
best_hyperparameters = sorted_results.iloc[0]

# Extract the best hyperparameters
criterion = best_hyperparameters['param_criterion']
max_depth = best_hyperparameters['param_max_depth']
max_features = best_hyperparameters['param_max_features']
min_samples_leaf = best_hyperparameters['param_min_samples_leaf']
min_samples_split = best_hyperparameters['param_min_samples_split']
n_estimators = best_hyperparameters['param_n_estimators']

# Create the estimator with the best hyperparameters and fit it on the training data
best_estimator = RandomForestClassifier(
    criterion=criterion,
    max_depth=max_depth,
    max_features=max_features,
    min_samples_leaf=min_samples_leaf,
    min_samples_split=min_samples_split,
    n_estimators=n_estimators,
)
best_estimator.fit(X_train, y_train)

# Evaluate on the test data
y_pred = best_estimator.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Print the results
print("Best Hyperparameters:")
print("Criterion:", criterion)
print("Max Depth:", max_depth)
print("Max Features:", max_features)
print("Min Samples Leaf:", min_samples_leaf)
print("Min Samples Split:", min_samples_split)
print("N Estimators:", n_estimators)

print("\nConfusion Matrix:")
print(conf_matrix)

print("\nAccuracy Score:", accuracy)

print("\nClassification Report:")
print(class_report)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the grid search results into their own variable so the preprocessed
# dataset held in `df` is not overwritten by an unrelated table.
grid_results = pd.read_csv("C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result3\\grid_search_results.txt", delimiter="\t", header=0)

# Filter rows with 'criterion' equal to 'gini' and 'entropy'
gini_df = grid_results[grid_results['param_criterion'] == 'gini']
entropy_df = grid_results[grid_results['param_criterion'] == 'entropy']

# Several rows can share one n_estimators value (they differ in the other
# parameters). Plotting them as a single line makes it zig-zag back and forth,
# so average the score per n_estimators first; groupby also sorts by that key.
gini_curve = gini_df.groupby('param_n_estimators')['mean_test_score'].mean()
entropy_curve = entropy_df.groupby('param_n_estimators')['mean_test_score'].mean()

# Extract the number of estimators and the (averaged) mean test scores
gini_n_estimators = gini_curve.index
gini_mean_test_score = gini_curve.values

entropy_n_estimators = entropy_curve.index
entropy_mean_test_score = entropy_curve.values

# Line graph comparing Gini and Entropy
plt.figure(figsize=(10, 6))
plt.plot(gini_n_estimators, gini_mean_test_score, label='Gini', marker='o', color='blue')
plt.plot(entropy_n_estimators, entropy_mean_test_score, label='Entropy', marker='o', color='orange')

# Set labels and title
plt.xlabel('Number of Estimators')
plt.ylabel('Mean Test Score')
plt.title('Comparison of Gini and Entropy in Grid Search')
plt.legend()

# Save the graph as an image
save_path = "C:\\Shanila\\CSE\\cse445\\final_project\\git_project\\Machine_Learning_Project\\sha\\result2\\grid_search_comparison.png"
plt.savefig(save_path, bbox_inches='tight')

# Show the graph (optional)
plt.show()


In [None]:
# Estimator built with the best parameters from the grid search
grid_search.best_estimator_

In [None]:
# Keep a handle on the best estimator from the grid search
best_grid=grid_search.best_estimator_

In [None]:
# Display the selected estimator
best_grid

In [None]:
# Fit the best grid-search estimator on the training split and score it on the test split
best_grid.fit(X_train, y_train)
y_pred = best_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(f"Accuracy Score {accuracy_score(y_test, y_pred)}")
print(f"Classification report: {classification_report(y_test, y_pred)}")

In [None]:
# Run a 100-trial Optuna study that maximises the value returned by `objective`.
# NOTE(review): neither `optuna` nor `objective` is imported or defined in the
# visible part of the notebook; this cell needs both to exist — confirm.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Report the best trial
trial = study.best_trial

print(f'Accuracy: {trial.value}')
print(f"Best hyperparameters: {trial.params}")

In [None]:
# Import necessary libraries

# Load the results file as plain text.
# NOTE(review): other cells read this same file as a tab-separated table; the
# section/"name: value" layout parsed here is an assumption — confirm.
results_file = r'C:\Shanila\CSE\cse445\final_project\git_project\Machine_Learning_Project\sha\result3\grid_search_results.txt'

with open(results_file, 'r') as file:
    results_data = file.read()

# Split the data into Gini and Entropy sections based on a delimiter
delimiter = "####################"  # Adjust this delimiter as per your file structure
sections = results_data.split(delimiter)

# Exactly two sections (Gini first, Entropy second) are required
if len(sections) != 2:
    print("Both Gini and Entropy sections are not found in the file.")
else:
    gini_section, entropy_section = sections

    def process_results(section):
        """Parse one section into ``{name: float}``.

        The first line is skipped as a header. On every other line the first
        token, minus its last character, is the name and the second token is
        the numeric value.
        """
        results = {}
        for line in section.strip().split('\n')[1:]:
            parts = line.split()
            results[parts[0][:-1]] = float(parts[1])
        return results

    gini_results = process_results(gini_section)
    entropy_results = process_results(entropy_section)

    # Pair every Gini value with the Entropy value of the same name
    comparison = {
        param_name: {
            'Gini': gini_value,
            'Entropy': entropy_results[param_name],
            'Difference': gini_value - entropy_results[param_name],
        }
        for param_name, gini_value in gini_results.items()
    }

    # Save the comparison to a text file
    output_file = r'C:\Shanila\CSE\cse445\final_project\git_project\Machine_Learning_Project\sha\result2\comparison.txt'
    with open(output_file, 'w') as file:
        file.write("Parameter  Gini  Entropy  Difference\n")
        for param_name, values in comparison.items():
            file.write(f"{param_name}: {values['Gini']} {values['Entropy']} {values['Difference']}\n")


# KNeighborsClassifier() achieves the highest train and test accuracy (99.7%)

# LIME

In [None]:
#Use Lime for XAI
import lime
import lime.lime_tabular
import matplotlib.pyplot as plt


In [None]:
# Feature names of the training split
X_train.columns

In [None]:
# LIME explainer for classification, built from the training feature matrix
explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train.values,
    mode='classification',
    feature_names=X_train.columns,
)

In [None]:
# Explain the k-NN prediction for the first test row. The row is passed as a
# 1-D numpy array (.values), the input type LIME documents for data_row,
# rather than as a pandas Series with string labels.
exp = explainer.explain_instance(X_test.iloc[0].values, knn.predict_proba)
exp.show_in_notebook(show_table = True)

In [None]:
# show_in_notebook renders HTML, not a matplotlib figure, so there was nothing
# for save_fig to save. Draw the explanation as a pyplot figure in this cell
# first; it becomes the current figure that save_fig writes.
exp.as_pyplot_figure()
save_fig("lime analysis") 

In [1]:
# save image

import lime
import lime.lime_tabular
import matplotlib.pyplot as plt


# Create an explainer from the training feature matrix.
# NOTE(review): this cell needs X_train, X_test and the fitted knn from earlier
# cells; the recorded NameError shows it was run before they existed.
explainer = lime.lime_tabular.LimeTabularExplainer(training_data=X_train.values, mode='classification', feature_names=X_train.columns)

# Explain the prediction for the first test row using every feature. The row is
# passed as a 1-D numpy array (.values), the input type LIME documents for
# data_row, rather than as a pandas Series with string labels.
explanation = explainer.explain_instance(X_test.iloc[0].values, knn.predict_proba, num_features=len(X_train.columns))

# Show the explanation with a table
explanation.show_in_notebook(show_table=True)

# Save the explanation as an HD PNG
fig = explanation.as_pyplot_figure()
save_path = r'C:\Shanila\CSE\cse445\final_project\git_project\Machine_Learning_Project\sha\result4\lime_explanation_hd.png'
fig.savefig(save_path, dpi=300, bbox_inches='tight')
plt.close(fig)  # Close the figure to avoid displaying it

# Display the first few rows of your test data
X_test.head()


NameError: name 'X_train' is not defined

In [None]:
# Preview the first five rows of the test features
X_test.head()

In [None]:
# Render the LIME explanation as a matplotlib figure
exp.as_pyplot_figure()

In [None]:
# With the inline backend a figure drawn in another cell is closed when that
# cell finishes, so save_fig alone would save an empty figure. Draw the
# explanation in this cell so it is the current figure when it is saved.
exp.as_pyplot_figure()
save_fig("lime figure analysis") 