In [1]:
'''
Q.1) Handle the missing and null values in the given data sets using the built-in functions of the Python libraries.

'''
import pandas as pd

# Load the two raw data sets.
diabetes_df = pd.read_csv('diabetes.csv')
cancer_df = pd.read_csv('cancer.csv')

# This cell treats a 0 in the columns below as a missing reading rather than a
# real measurement and imputes it with the column median.
columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in columns_to_replace:
    # Turn the 0 placeholders into NaN *before* taking the median: Series.median()
    # skips NaN, so the fill value is the median of the observed readings only.
    # Computing the median with the zeros still present would pull it towards 0.
    observed = diabetes_df[column].mask(diabetes_df[column] == 0)
    diabetes_df[column] = observed.fillna(observed.median())

# NOTE(review): 'Unnamed: 32' is presumably an empty artefact column created when
# the CSV was parsed - confirm against the raw file. errors='ignore' keeps the
# cell runnable if the column is already absent.
cancer_df_cleaned = cancer_df.drop(columns=['Unnamed: 32'], errors='ignore')

# Persist the cleaned copies without the pandas index column.
diabetes_df.to_csv('diabetes_cleaned.csv', index=False)
cancer_df_cleaned.to_csv('cancer_cleaned.csv', index=False)

print("Missing and null values handled. Cleaned datasets saved as 'diabetes_cleaned.csv' and 'cancer_cleaned.csv'.")

Missing and null values handled. Cleaned datasets saved as 'diabetes_cleaned.csv' and 'cancer_cleaned.csv'.


In [2]:
'''
Q.2) Apply the Naïve Bayes Classifier to the given data sets. 
(i) Tabulate the following information regarding each data set: 
(a) Purpose of the data sets. 
(b) Dimensions of the data sets. 
(c) Column Names and data types of the data sets. 
(ii) Use the train_test_split function of scikit-learn to split the given data sets using the following  
test_size values: {0.3, 0.4, 0.5}. 
(iv) Keep the maximum epochs value = 10. 
(v) Tabulate the outputs for each of the above combinations in an Excel sheet. 
(vi) Display the classification report and confusion matrices of each combination of the above  
parameters.

'''
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix

# Read both source CSV files for this cell; note these are the original files,
# not the '*_cleaned.csv' files written by the previous cell.
diabetes_df, cancer_df = (
    pd.read_csv(csv_path) for csv_path in ('diabetes.csv', 'cancer.csv')
)

def tabulate_info(df, purpose=None):
    """Summarise a data set as a dictionary.

    Parameters
    ----------
    df : pandas.DataFrame
        The data set to describe.
    purpose : str, optional
        Text to report under 'Purpose'. When omitted, the purpose is guessed
        from the columns: 'Diabetes Classification' if an 'Outcome' column is
        present, otherwise 'Cancer Classification'. Pass it explicitly for any
        other data set, since the fallback knows only these two labels.

    Returns
    -------
    dict
        Keys 'Purpose' (str), 'Dimensions' (the ``df.shape`` tuple) and
        'Column Names and Data Types' (column name -> dtype mapping from
        ``df.dtypes.to_dict()``).
    """
    if purpose is None:
        # Backward-compatible heuristic used when no purpose is supplied.
        purpose = 'Diabetes Classification' if 'Outcome' in df.columns else 'Cancer Classification'

    return {
        'Purpose': purpose,
        'Dimensions': df.shape,
        'Column Names and Data Types': df.dtypes.to_dict(),
    }

# Describe each data set, then wrap every summary dict in a one-row DataFrame
# so it can be written out as a table.
diabetes_info, cancer_info = tabulate_info(diabetes_df), tabulate_info(cancer_df)
diabetes_info_df, cancer_info_df = (
    pd.DataFrame([summary]) for summary in (diabetes_info, cancer_info)
)

# test_size values evaluated for every data set.
test_sizes = [0.3, 0.4, 0.5]

def process_dataset(df, target_column, dataset_name, exclude_columns, test_size_values=None):
    """Fit and evaluate a Gaussian Naive Bayes model at several test-split sizes.

    For every split size the data is split with ``train_test_split`` using a
    fixed ``random_state=42``, a fresh ``GaussianNB`` is fitted on the training
    part, and the classification report and confusion matrix for the test part
    are recorded. The collected records are written to
    ``'<dataset_name>_results.csv'`` and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the target column and the feature columns.
    target_column : str
        Name of the label column.
    dataset_name : str
        Prefix of the results file name.
    exclude_columns : list of str
        Columns to drop from the features in addition to the target.
    test_size_values : iterable of float, optional
        Values passed as ``test_size``. Defaults to the notebook-level
        ``test_sizes`` list, which is what the function used before this
        parameter existed.

    Returns
    -------
    pandas.DataFrame
        One row per split size with the columns 'Test Size',
        'Classification Report' and 'Confusion Matrix'; the last two hold the
        ``str()`` form of the report dict and of the matrix.
    """
    split_sizes = test_sizes if test_size_values is None else test_size_values

    # Features and labels do not depend on the split size, so build them once
    # instead of on every iteration.
    X = df.drop(columns=[target_column] + list(exclude_columns))
    y = df[target_column]

    results = []
    for test_size in split_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        model = GaussianNB()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True)
        conf_matrix = confusion_matrix(y_test, y_pred)

        # Stored as strings so each result fits into a single CSV cell.
        results.append({
            'Test Size': test_size,
            'Classification Report': str(report),
            'Confusion Matrix': str(conf_matrix)
        })

    results_df = pd.DataFrame(results)
    results_df.to_csv(f'{dataset_name}_results.csv', index=False)

    # Returned so the caller can display the reports and matrices; existing
    # callers that ignore the return value are unaffected.
    return results_df

# Save the per-dataset summary tables.
diabetes_info_df.to_csv(path_or_buf='diabetes_info.csv', index=False)
cancer_info_df.to_csv(path_or_buf='cancer_info.csv', index=False)

# Run the Naive Bayes evaluation for each data set. The cancer data additionally
# excludes the 'id' and 'Unnamed: 32' columns from the features.
process_dataset(
    df=diabetes_df,
    target_column='Outcome',
    dataset_name='diabetes',
    exclude_columns=[],
)
process_dataset(
    df=cancer_df,
    target_column='diagnosis',
    dataset_name='cancer',
    exclude_columns=['id', 'Unnamed: 32'],
)

# Echo the summary dictionaries.
print("Diabetes Dataset Information:", diabetes_info)
print("Cancer Dataset Information:", cancer_info)

Diabetes Dataset Information: {'Purpose': 'Diabetes Classification', 'Dimensions': (768, 9), 'Column Names and Data Types': {'Pregnancies': dtype('int64'), 'Glucose': dtype('int64'), 'BloodPressure': dtype('int64'), 'SkinThickness': dtype('int64'), 'Insulin': dtype('int64'), 'BMI': dtype('float64'), 'DiabetesPedigreeFunction': dtype('float64'), 'Age': dtype('int64'), 'Outcome': dtype('int64')}}
Cancer Dataset Information: {'Purpose': 'Cancer Classification', 'Dimensions': (569, 33), 'Column Names and Data Types': {'id': dtype('int64'), 'diagnosis': dtype('O'), 'radius_mean': dtype('float64'), 'texture_mean': dtype('float64'), 'perimeter_mean': dtype('float64'), 'area_mean': dtype('float64'), 'smoothness_mean': dtype('float64'), 'compactness_mean': dtype('float64'), 'concavity_mean': dtype('float64'), 'concave points_mean': dtype('float64'), 'symmetry_mean': dtype('float64'), 'fractal_dimension_mean': dtype('float64'), 'radius_se': dtype('float64'), 'texture_se': dtype('float64'), 'peri