<a href="https://www.kaggle.com/code/benzilla987/ps-s3-ep22-eda-modeling-submission?scriptVersionId=143323028" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

# Importing data
import os

# List every file available under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Read the competition files: labelled training set, unlabelled test set,
# and the sample submission template.
train = pd.read_csv('/kaggle/input/playground-series-s3e22/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s3e22/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s3e22/sample_submission.csv')

# Last expression of the cell: shows the first rows of the training data.
train.head()

# Initial Exploratory Analysis

In [None]:
# Dealing with null values and converting categorical columns to the appropriate datatype.
# List of categorical columns to process (note: this includes the target, 'outcome').
categorical_columns = [
    'temp_of_extremities',
    'peripheral_pulse',
    'mucous_membrane',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distention',
    'nasogastric_tube',
    'nasogastric_reflux',
    'rectal_exam_feces',
    'abdomen',
    'abdomo_appearance',
    'surgery',
    'surgical_lesion',
    'cp_data',
    'outcome',
    'age',
]

# Loop through the selected categorical columns
for column in categorical_columns:
    # Most frequent value of the column; used as the fill value for missing entries.
    mode_value = train[column].mode()[0]

    # Assign the result back to the DataFrame. Calling fillna(inplace=True) on
    # train[column] operates on a temporary selection and is not guaranteed to
    # modify train (it is a no-op under pandas copy-on-write).
    train[column] = train[column].fillna(mode_value)

    # Convert the data type to categorical
    train[column] = train[column].astype('category')

## Dealing with outliers in numerical data

## Rectal Temp
# Absolute Z-scores for 'rectal_temp'. nan_policy='omit' keeps a single missing
# value from turning every score into NaN (which would silently flag nothing).
z_scores = np.abs(stats.zscore(train['rectal_temp'], nan_policy='omit'))

# Threshold for identifying outliers (3 standard deviations from the mean)
threshold = 3

# Rows whose 'rectal_temp' lies more than `threshold` standard deviations away
outliers_mask = z_scores > threshold

# Keep a copy of the outlier rows BEFORE imputing, so the original extreme
# values remain available for inspection.
outliers_df = train[outliers_mask].copy()

# Impute outliers with the mean value of 'rectal_temp'
# (the mean is computed over the full column, outliers included).
mean_rectal_temp = train['rectal_temp'].mean()
train.loc[outliers_mask, 'rectal_temp'] = mean_rectal_temp

## Pulse
# Calculate the 99th percentile value
percentile_99 = np.percentile(train['pulse'], 99)

# Cap values above the 99th percentile at the 99th percentile value.
# The clipped column is assigned back to the DataFrame; assigning through
# train['pulse'][mask] is chained indexing and may write to a temporary copy.
train['pulse'] = train['pulse'].clip(upper=percentile_99)

## Respiratory Rate
# Calculate the 99th percentile value
percentile_99 = np.percentile(train['respiratory_rate'], 99)

# Cap values above the 99th percentile at the 99th percentile value.
# Assigning the clipped column back avoids chained-indexing assignment,
# which may write to a temporary copy instead of `train`.
train['respiratory_rate'] = train['respiratory_rate'].clip(upper=percentile_99)

## Packed Cell Volume
# Absolute Z-scores for 'packed_cell_volume'. nan_policy='omit' keeps a single
# missing value from turning every score into NaN.
z_scores = np.abs(stats.zscore(train['packed_cell_volume'], nan_policy='omit'))

# Threshold for identifying outliers (3 standard deviations from the mean)
threshold = 3

# Rows whose 'packed_cell_volume' lies more than `threshold` standard deviations away
outliers_mask = z_scores > threshold

# Impute outliers with the mean value of 'packed_cell_volume'
# (the mean is computed over the full column, outliers included).
mean_packed_cell_volume = train['packed_cell_volume'].mean()
train.loc[outliers_mask, 'packed_cell_volume'] = mean_packed_cell_volume

## Total Protein
# This distribution is broken up into clusters, so I'm using k-means clustering to examine the distribution of the two clusters separately.

# Defining a function to impute outliers in a cluster with the cluster's mean
def impute_outliers_with_mean(cluster_data, lower=None, upper=None):
    """Return a copy of ``cluster_data`` with out-of-range values set to its mean.

    Args:
        cluster_data: pandas Series of values belonging to one cluster.
        lower: Values strictly below this bound are replaced. When omitted,
            the module-level ``percentiles[0]`` is used, as before.
        upper: Values strictly above this bound are replaced. When omitted,
            the module-level ``percentiles[4]`` is used, as before.

    Returns:
        A new Series; the input is left unmodified.
    """
    # Fall back to the global computed by the calling loop so existing
    # one-argument calls keep working.
    if lower is None:
        lower = percentiles[0]
    if upper is None:
        upper = percentiles[4]

    # The mean is taken over the original values, outliers included.
    cluster_mean = cluster_data.mean()

    # Work on a copy: the argument is typically a slice of a DataFrame, and
    # writing into it in place triggers chained-assignment problems.
    imputed = cluster_data.copy()
    imputed[(imputed < lower) | (imputed > upper)] = cluster_mean
    return imputed

# Performing k-means clustering to identify clusters
# random_state makes the clustering reproducible between runs; n_init is set
# explicitly because its default differs between scikit-learn versions.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)  # Adjust the number of clusters as needed
train['cluster'] = kmeans.fit_predict(train[['total_protein']])

# Iterating through each cluster
for cluster_id in train['cluster'].unique():
    # Boolean row selector for the current cluster
    in_cluster = train['cluster'] == cluster_id

    # 'total_protein' values of the current cluster (single .loc selection
    # instead of chained row-then-column indexing)
    cluster_data = train.loc[in_cluster, 'total_protein']

    # 25th, 50th, 75th, 95th and 99th percentiles of the cluster.
    # impute_outliers_with_mean reads this module-level name.
    # NOTE(review): the lower bound used below is percentiles[0], i.e. the 25th
    # percentile, so roughly the bottom quarter of each cluster is replaced by
    # the mean -- confirm this is intended rather than a low-tail percentile.
    percentiles = np.percentile(cluster_data, [25, 50, 75, 95, 99])

    # Values treated as potential outliers within the cluster (kept for inspection)
    potential_outliers = cluster_data[(cluster_data < percentiles[0]) | (cluster_data > percentiles[4])]

    # Imputing potential outliers with the cluster's mean. A copy is passed so
    # the helper never writes into a slice of `train`.
    train.loc[in_cluster, 'total_protein'] = impute_outliers_with_mean(cluster_data.copy())

## Abdominal Protein
# Calculate the 99th percentile value
percentile_99 = np.percentile(train['abdomo_protein'], 99)

# Cap values above the 99th percentile at the 99th percentile value.
# Assigning the clipped column back avoids chained-indexing assignment,
# which may write to a temporary copy instead of `train`.
train['abdomo_protein'] = train['abdomo_protein'].clip(upper=percentile_99)

## Lesion 1
# Calculate the 99th percentile value
percentile_99 = np.percentile(train['lesion_1'], 99)

# Cap values above the 99th percentile at the 99th percentile value.
# Replacing the whole column avoids chained-indexing assignment, and also
# avoids writing a float percentile into individual cells of the column.
train['lesion_1'] = train['lesion_1'].clip(upper=percentile_99)

## Lesion 2
# Calculate the 99th percentile value
percentile_99 = np.percentile(train['lesion_2'], 99)

# Cap values above the 99th percentile at the 99th percentile value
# (assigned back to the DataFrame rather than through chained indexing).
train['lesion_2'] = train['lesion_2'].clip(upper=percentile_99)
# This replaces all values with zero. Upon closer examination, there are only 8 non-zero values in this column.
# This suggests that this variable is not valuable in predictions.
# The same issue is present with lesion_3, so both variables are dropped.
train.drop(columns=['lesion_2', 'lesion_3'], inplace=True)
    
# Examining the shape of the data
print("Data Shape:\n")
print(train.shape, "\n")

# Summary statistics of numeric columns
print("Data Description:\n")
print(train.describe(), "\n")

# Data types and missing values
print("Data Types and Missing Values:\n")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None").
train.info()
print()

# Count the number of unique values in each column
print("Number of unique values in each column:\n")
print(train.nunique(), "\n")

# Count plot for every column stored with the 'category' dtype.
# Note: `categorical_columns` is rebound here to the category-typed columns of `train`.
print("Distribution of categorical variables:\n")
categorical_columns = train.select_dtypes(include=['category']).columns
for column in categorical_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(data=train, x=train[column], ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.xticks(rotation=45)
    plt.show()

# Histogram with a KDE overlay for every int64/float64 column.
numeric_columns = train.select_dtypes(include=['int64', 'float64']).columns
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(data=train, x=train[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')
    plt.show()

# Annotated heatmap of the pairwise correlations between the numeric columns.
correlation_matrix = train[numeric_columns].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Remove the k-means helper column from the training data, and remove from the
# test data the two lesion columns that were already dropped from train.
train.drop(columns='cluster', inplace=True)
test.drop(columns=['lesion_2', 'lesion_3'], inplace=True)

# Submission 1 - Simple Decision Tree

In [None]:
# ## Defining the features and target variable for training data
# X_train = train.drop("outcome", axis=1)
# y_train = train["outcome"]

# # Define the features for test data
# X_test = test

# # Handle categorical columns with one-hot encoding
# categorical_columns = X_train.select_dtypes(include=['category']).columns

# # Initialize the OneHotEncoder with handle_unknown='ignore'
# encoder = OneHotEncoder(drop='first', sparse=False, handle_unknown='ignore')
# X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train[categorical_columns]))
# X_test_encoded = pd.DataFrame(encoder.transform(X_test[categorical_columns]))

# # Impute missing values
# imputer = SimpleImputer(strategy='mean')
# X_train_imputed = pd.DataFrame(imputer.fit_transform(X_train.select_dtypes(exclude=['category'])))
# X_test_imputed = pd.DataFrame(imputer.transform(X_test.select_dtypes(exclude=['category'])))

# # Concatenate the encoded categorical columns and imputed numerical columns
# X_train_processed = pd.concat([X_train_encoded, X_train_imputed], axis=1)
# X_test_processed = pd.concat([X_test_encoded, X_test_imputed], axis=1)

# # Create and train a Decision Tree classifier
# clf = DecisionTreeClassifier(random_state=42)
# clf.fit(X_train_processed, y_train)

# # Make predictions on the test data
# y_pred = clf.predict(X_test_processed)

# # Create a DataFrame for the test predictions
# submission_1 = pd.DataFrame({'id': test['id'], 'outcome': y_pred})

# # Export the DataFrame to a CSV file
# submission_1.to_csv('submission_1.csv', index=False)



# Submission 2

In [None]:
## I went back and did some pre-processing. Then I commented out my submission 1 code.
## Gonna try to use random forest this time. With better pre-processing, I'm expecting
## improved results. **fingers crossed!**

# Separate the target from the predictors. The target must never reach the
# feature matrix: previously `train` (still containing 'outcome') was encoded
# as-is, which one-hot encoded the label into the training features.
# 'id' is a row identifier and is excluded from the features as well.
y_train = train['outcome']
X_train = train.drop(columns=['outcome', 'id'])
X_test = test.drop(columns=['id'])

# Categorical predictors only ('outcome' is in categorical_columns but is not a feature).
feature_categoricals = [col for col in categorical_columns if col in X_train.columns]

# Fill missing test categories with the training mode, mirroring what was done
# for the training data, so both sets are encoded consistently.
for col in feature_categoricals:
    X_test[col] = X_test[col].fillna(train[col].mode()[0])

# Concatenate the training and testing features so one-hot encoding yields
# the same dummy columns for both.
combined_data = pd.concat([X_train, X_test], axis=0, ignore_index=True)

# Perform one-hot encoding on the combined data
combined_data_encoded = pd.get_dummies(combined_data, columns=feature_categoricals, drop_first=True)

# Split the combined data back into training and testing datasets
# (training rows come first in the concatenation).
X_train_encoded = combined_data_encoded.iloc[:len(X_train)]
X_test_encoded = combined_data_encoded.iloc[len(X_train):]

# Creating a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Training the classifier on the training data
rf_classifier.fit(X_train_encoded, y_train)

# Making predictions on the test data
y_pred = rf_classifier.predict(X_test_encoded)

# Creating a DataFrame with 'id' and 'outcome' columns
submission_2 = pd.DataFrame({'id': test['id'], 'outcome': y_pred})

# Generating submission
submission_2.to_csv('submission_2.csv', index=False)