In [None]:
# First install PyMySQL
!pip install pymysql


In [None]:

import pymysql
import pandas as pd

# Start from a known state so the cleanup in `finally` never acts on a stale
# connection object left in the kernel by an earlier run of this cell.
conn = None

try:
    # Establish connection
    conn = pymysql.connect(
        host='localhost',
        user='root',         # Your MySQL username
        password='',         # Your MySQL password
        database='',         # Name of a database that is already loaded in your MySQL server
        port=3306
    )

    # Load data into DataFrame
    query = "SELECT * FROM table_name"  # Replace with your table
    df = pd.read_sql(query, conn)

    # Display results
    print("Connection successful! Here's your data:")
    display(df.head())

except pymysql.Error as e:
    # Raised by the driver itself, e.g. when the connection cannot be opened.
    print(f"MySQL Error: {e}")

except pd.errors.DatabaseError as e:
    # pandas re-raises failures that occur while executing the query as its
    # own exception type, which is not a pymysql.Error.
    print(f"Query Error: {e}")

finally:
    # Only close a connection that this run actually opened and that is still
    # open; closing an already-closed PyMySQL connection raises an error.
    if conn is not None and conn.open:
        conn.close()
        print("Connection closed")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle

# The dataset is the `df` DataFrame produced by the MySQL loading cell above;
# run that cell first.

# Drop irrelevant columns. errors="ignore" keeps this cell re-runnable: `df` is
# rebound here, so on a second run the columns are already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=["MyUnknownColumn", "Patient_ID"], errors="ignore")

# ----- EDA (Exploratory Data Analysis) -----
# Each figure is built through an explicit (fig, ax) pair so that every
# plotting call states which axes it draws on.

# Number of samples per tumor type
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x="Tumor_Type", palette="coolwarm", ax=ax)
ax.set_title("Tumor Type Distribution")
ax.set_xlabel("Tumor Type")
ax.set_ylabel("Count")
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the category labels
fig.tight_layout()
plt.show()

# Histogram of patient age with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["Age"], bins=20, kde=True, color="teal", ax=ax)
ax.set_title("Age Distribution")
ax.set_xlabel("Age")
fig.tight_layout()
plt.show()

# Tumor size against survival rate, coloured by tumor type
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(
    data=df,
    x="Tumor_Size",
    y="Survival_Rate",
    hue="Tumor_Type",
    palette="Set2",
    ax=ax,
)
ax.set_title("Tumor Size vs Survival Rate")
fig.tight_layout()
plt.show()

# Pairwise correlations between the numeric columns only
fig, ax = plt.subplots(figsize=(10, 8))
corr = df.select_dtypes(include=['number']).corr()
sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()

# ----- Label Encoding -----
# Replace every object-dtype column with integer codes, keeping the fitted
# encoder for each column in `label_encoders` (keyed by column name).
object_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}
for column in object_columns:
    le = label_encoders[column] = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Target is the tumor type; every remaining column is used as a feature
y = df["Tumor_Type"]
X = df.drop(columns=["Tumor_Type"])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train Classifier (fixed seed so repeated runs build the same forest)
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Predictions on the held-out rows
y_pred = clf.predict(X_test)

# Evaluation
# sep="" stops print() from inserting a space right after the "\n", which
# would shift the first line of the matrix/report out of line with the rest.
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred), sep="")
print("Classification Report:\n", classification_report(y_test, y_pred), sep="")

# Save model
with open("random_forest_classifier.pkl", "wb") as f:
    pickle.dump(clf, f)

print("✅ Model saved as 'random_forest_classifier.pkl'")

# Save the fitted label encoders next to the model. The classifier was trained
# on the label-encoded DataFrame, so anyone loading the model needs the same
# encoders to encode new inputs and to map predicted codes back to the
# original values of any column that was encoded.
with open("label_encoders.pkl", "wb") as f:
    pickle.dump(label_encoders, f)

print("✅ Label encoders saved as 'label_encoders.pkl'")
