## Week 1

In [1]:
# import all the libraries we will be using

import pandas as pd # helps manipulate the dataset
import numpy as np # numerical operations
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # data visualization

# Global look for every chart below: seaborn's white-grid theme, then a
# default figure size of 8x5 inches (set after the theme so it is not reset).
sns.set(style="whitegrid")
plt.rcParams.update({'figure.figsize': (8, 5)})

In [3]:
# Read the CSV that was uploaded through the file panel on the left.
df = pd.read_csv(filepath_or_buffer='/content/pdb_data_no_dups.csv')

# Report the (rows, columns) shape, then preview the first five records.
print("Shape:", df.shape)
df.head(5)

Shape: (99529, 14)


Unnamed: 0,structureId,classification,experimentalTechnique,macromoleculeType,residueCount,resolution,structureMolecularWeight,crystallizationMethod,crystallizationTempK,densityMatthews,densityPercentSol,pdbxDetails,phValue,publicationYear
0,100D,DNA-RNA HYBRID,X-RAY DIFFRACTION,DNA/RNA Hybrid,20.0,1.9,6360.3,"VAPOR DIFFUSION, HANGING DROP",,1.78,30.89,"pH 7.00, VAPOR DIFFUSION, HANGING DROP",7.0,1994.0
1,101D,DNA,X-RAY DIFFRACTION,DNA,24.0,2.25,7939.35,,,2.0,38.45,,,1995.0
2,101M,OXYGEN TRANSPORT,X-RAY DIFFRACTION,Protein,154.0,2.07,18112.8,,,3.09,60.2,"3.0 M AMMONIUM SULFATE, 20 MM TRIS, 1MM EDTA, ...",9.0,1999.0
3,102D,DNA,X-RAY DIFFRACTION,DNA,24.0,2.2,7637.17,"VAPOR DIFFUSION, SITTING DROP",277.0,2.28,46.06,"pH 7.00, VAPOR DIFFUSION, SITTING DROP, temper...",7.0,1995.0
4,102L,HYDROLASE(O-GLYCOSYL),X-RAY DIFFRACTION,Protein,165.0,1.74,18926.61,,,2.75,55.28,,,1993.0


Upload the dataset using the file panel on the left, then inspect its columns.

In [None]:
# Column dtypes and non-null counts. DataFrame.info() writes its report
# straight to stdout and returns None, so it is called directly instead of
# being wrapped in print() (which would add a stray "None" line).
print("df.info: ")
df.info()

# Statistical summaries for all numeric features (transposed: one row per column).
print("\ndf.describe: ")
print(df.describe().T)

# Missing-value count per column, largest first.
print("\ndf.isnull: ")
print(df.isnull().sum().sort_values(ascending=False))


Check each column's type and its missing-value count. NaNs are gaps in the data that can break models later, so they need to be handled before training.

In [None]:
# Treat these placeholder strings as missing values (exact cell matches only).
placeholder_tokens = ['?', 'None', ' ', '']
df.replace(dict.fromkeys(placeholder_tokens, np.nan), inplace=True)

# Split the columns by dtype: numeric ones get the column median,
# text ones get the literal label 'Unknown'.
num_cols = df.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df.select_dtypes(include='object').columns

column_medians = df[num_cols].median()
df[num_cols] = df[num_cols].fillna(column_medians)
df[cat_cols] = df[cat_cols].fillna('Unknown')

# Total number of cells that are still missing.
df.isnull().sum().sum()

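Replace the NaN values in each numeric column with that column's median (the median is less affected by skewed data than the mean), and fill missing categorical values with 'Unknown'.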

In [None]:
# One histogram per feature, described as:
# (column, number of bins, overlay a KDE curve?, plot title, x-axis label)
histogram_specs = [
    ('residueCount', 40, True, 'Residue Count Distribution', 'Residue Count'),
    ('resolution', 30, True, 'Resolution (√Ö) Distribution', 'Resolution (√Ö)'),
    ('publicationYear', 30, False, 'Publication Year Trends', 'Year'),
]

for column, n_bins, with_kde, title, x_label in histogram_specs:
    sns.histplot(df[column], bins=n_bins, kde=with_kde)
    plt.title(title)
    plt.xlabel(x_label)
    plt.show()

NameError: name 'df' is not defined

In [None]:
# Numeric features considered for the correlation matrix; any that are not
# present in df are skipped.
candidate_features = (
    'residueCount', 'resolution', 'structureMolecularWeight',
    'densityMatthews', 'densityPercentSol', 'phValue', 'publicationYear',
)
num_cols = []
for feature in candidate_features:
    if feature in df.columns:
        num_cols.append(feature)

corr = df[num_cols].corr()

# Annotated heatmap with the colour scale centred on zero.
plt.figure(figsize=(9, 7))
sns.heatmap(corr, cmap='coolwarm', center=0, annot=True)
plt.title('Correlation Heatmap (Numeric Features)')
plt.show()

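This heatmap shows a strong correlation between residueCount and structureMolecularWeight --> bigger proteins weigh more.

It also shows that resolution and residueCount have only a weak (or slightly negative) correlation. Keep in mind that a larger resolution value (in Å) means a lower-quality structure, so the idea that large proteins often have lower-quality structures would show up as a positive correlation; check the sign in the heatmap before drawing that conclusion.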

## Week 2

In [None]:
# Step 1: re-check which columns are numeric and which are categorical.
print("\nChecking data types again to confirm which features are categorical and which are numeric:\n")
print(df.dtypes)

# Text (object) columns versus int64/float64 columns, as plain Python lists.
categorical_cols = list(df.select_dtypes(include='object').columns)
numeric_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)

print("\nCategorical columns:")
print(categorical_cols)
print("\nNumeric columns:")
print(numeric_cols)
print("Remaining NaNs:", df.isnull().sum().sum())

# For each text column: how many distinct values it has, and the first five of them.
for col in categorical_cols:
    column_values = df[col]
    print(f"\nüîπ {col}:")
    print(f"Number of unique categories: {column_values.nunique()}")
    print("Sample values:", column_values.dropna().unique()[:5])


Which columns should we encode? (Discussion)

In [None]:
# Only columns used as model inputs are one-hot encoded - not IDs or metadata.
encode_cols = [
    'experimentalTechnique',
    'macromoleculeType',
    'crystallizationMethod',
]

print("\nWe will encode the following categorical columns as input features:", encode_cols, sep="\n")

### Label Encoding!


Label encoding for target variable (classification)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map every distinct classification string to an integer code.
label_encoder = LabelEncoder()
target_as_text = df['classification'].astype(str)
df['classification_encoded'] = label_encoder.fit_transform(target_as_text)

print("\n‚úÖ 'classification' column encoded successfully!")

# Show how the first five classes (in the encoder's own order) map to codes.
first_classes = label_encoder.classes_[:5]
label_mapping = dict(zip(first_classes, label_encoder.transform(first_classes)))
print("Sample mapping (first few classes):", label_mapping)

# Original label next to its code for the first rows.
print("\nBefore and after encoding (sample):")
print(df.loc[:, ['classification', 'classification_encoded']].head())

### One-Hot Encoding

In [None]:
# Expand each column in encode_cols into one indicator column per category.
df_encoded = pd.get_dummies(df, columns=encode_cols)

print("\n‚úÖ One-hot encoding complete!")
print("Shape before encoding:", df.shape)
print("Shape after encoding:", df_encoded.shape)

# Collect the columns whose name contains one of the encoded base names.
encoded_example_cols = []
for column_name in df_encoded.columns:
    for base_name in encode_cols:
        if base_name in column_name:
            encoded_example_cols.append(column_name)
            break

print("\nExample of new encoded columns:")
print(encoded_example_cols[:10])

# Preview of the encoded frame.
print("\nShowing first few rows after encoding:")
df_encoded.head()

### Normalization and Scaling

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Demo features to rescale, limited to those that exist in df_encoded.
scale_features_demo = [
    feature
    for feature in ('crystallizationTempK', 'phValue')
    if feature in df_encoded.columns
]

# Snapshot the unscaled values so they can be shown next to the scaled ones.
before_scaling_demo = df_encoded[scale_features_demo].copy()

# Fit the scaler and overwrite the columns in df_encoded itself, so every
# later cell that reads df_encoded sees the scaled values.
scaler_demo = MinMaxScaler()
scaled_values = scaler_demo.fit_transform(df_encoded[scale_features_demo])
df_encoded[scale_features_demo] = scaled_values

print("\n‚úÖ Demo numeric feature normalization complete!")
print("Scaled columns:", scale_features_demo)

# Side-by-side table: all "(before)" columns first, then all "(after)" columns.
before_columns = {}
for col in scale_features_demo:
    before_columns[f"{col} (before)"] = before_scaling_demo[col].head()
comparison_demo = pd.DataFrame(before_columns)
for col in scale_features_demo:
    comparison_demo[f"{col} (after)"] = df_encoded[col].head().values

print("\nüìä Before vs After Scaling (first 5 rows):")
display(comparison_demo)

# Sanity check: smallest and largest value of each scaled column.
print("\nMin/Max after scaling:")
print(df_encoded[scale_features_demo].agg(['min', 'max']))


NameError: name 'df_encoded' is not defined

# Week 3

## Test/Train Split

In [None]:
# Feature matrix: everything except the raw target, its encoded form, and the
# structure identifier. The encoded target becomes y.
non_feature_cols = ['classification', 'classification_encoded', 'structureId']
X = df_encoded.drop(columns=non_feature_cols)
y = df_encoded['classification_encoded']

# Integer-encode every text column still left in X, with a fresh LabelEncoder
# per column. NOTE(review): presumably only the free-text pdbxDetails column
# reaches this loop - verify, since arbitrary integer codes for free text
# carry little meaning for a model.
text_cols = X.select_dtypes(include=['object']).columns
for col in text_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Print the first rows of the encoded frame, then its full column index.
for preview in (df_encoded.head(), df_encoded.columns):
    print(preview)

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the min/max of every numeric column from the training rows only,
# then apply that same mapping to both the training and the test rows.
numeric_cols = X_train.select_dtypes(include='number').columns
scaler = MinMaxScaler()
scaler.fit(X_train[numeric_cols])
X_train[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])


## Logistic Regression

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: work on fixed-size random subsets so training stays fast.
# Labels are looked up by the sampled rows' index so they stay aligned.
X_train_sample = X_train.sample(n=5000, random_state=42)
y_train_sample = y_train.loc[X_train_sample.index]

X_test_sample = X_test.sample(n=2000, random_state=42)
y_test_sample = y_test.loc[X_test_sample.index]

# Step 2: logistic regression fitted with stochastic gradient descent.
sgd_settings = dict(
    loss='log_loss',   # logistic-regression objective
    max_iter=50,       # few passes over the data, for speed
    tol=1e-3,          # stop early once improvement is small
    random_state=42,
)
log_model = SGDClassifier(**sgd_settings)
log_model.fit(X_train_sample, y_train_sample)

# Step 3: score on the sampled test rows.
log_preds = log_model.predict(X_test_sample)
log_accuracy = accuracy_score(y_test_sample, log_preds)
print("Accuracy:", log_accuracy)
print(classification_report(y_test_sample, log_preds, zero_division=0))


## Decision Tree

In [None]:
# Fit an unconstrained decision tree on the full training split and score it
# on the held-out test split.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
tree_preds = tree_model.predict(X_test)

print("\nüîπ Decision Tree Results:")
print("Accuracy:", accuracy_score(y_test, tree_preds))
# zero_division=0 matches the logistic-regression report above: classes that
# are never predicted get a score of 0 without an UndefinedMetricWarning per class.
print(classification_report(y_test, tree_preds, zero_division=0))

In [None]:
# Confusion matrix of the decision tree on the test split, drawn as a heatmap
# (rows: actual class, columns: predicted class), without a colour bar.
tree_confusion = confusion_matrix(y_test, tree_preds)

plt.figure(figsize=(5, 4))
sns.heatmap(tree_confusion, cmap='Blues', cbar=False)
plt.title("Decision Tree Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

## Random Forest

In [None]:
# Random forest trained on a 20% subset of the training split.
# The subset is drawn first; the unused remainder of the split is discarded.
X_train_medium, _, y_train_medium, _ = train_test_split(
    X_train, y_train, train_size=0.20, random_state=42
)

forest_settings = dict(
    n_estimators=30,
    max_depth=12,
    min_samples_split=5,
    max_samples=0.3,
    n_jobs=1,
    random_state=42,
)
rf_model = RandomForestClassifier(**forest_settings)
rf_model.fit(X_train_medium, y_train_medium)

# Score on the full test split.
rf_preds = rf_model.predict(X_test)

print("\nüîπ Random Forest Results (20% data):")
print("Accuracy:", accuracy_score(y_test, rf_preds))

In [None]:
# Importance score of every feature according to the fitted random forest,
# labelled with the feature-matrix column names.
importances = pd.Series(data=rf_model.feature_importances_, index=X.columns)

# Keep the ten largest scores, highest first.
top_features = importances.sort_values(ascending=False).iloc[:10]
print("\nTop 10 Most Important Features:")
print(top_features)

# Horizontal bar chart: one bar per feature, length = importance.
plt.figure(figsize=(8, 4))
sns.barplot(y=top_features.index, x=top_features.values)
plt.title("Top 10 Important Features (Random Forest)")
plt.show()

# Week 4

## Hyperparameter tuning

In [None]:
# Candidate values for each random-forest hyperparameter in the search.
param_grid = dict(
    n_estimators=[50, 100, 200],        # number of trees
    max_depth=[10, 20, 30, None],       # maximum tree depth (None = unlimited)
    min_samples_split=[2, 5, 10],       # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],         # minimum samples required at a leaf
)

print("üîç Parameter Grid:")
for param, values in param_grid.items():
    print(f"  {param}: {values}")

# The grid size is the product of the number of candidates per parameter.
candidates_per_param = [len(candidates) for candidates in param_grid.values()]
total_combinations = np.prod(candidates_per_param)
print(f"\nüìä Total combinations to test: {total_combinations}")
print(f"With 5-fold CV, total fits: {total_combinations * 5}")

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

print("\nüöÄ Starting GridSearchCV...\n")

# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation; verbose=2 prints progress, n_jobs=-1 uses every CPU core.
base_forest = RandomForestClassifier(random_state=42, n_jobs=-1)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='accuracy',
    cv=5,
    verbose=2,
    n_jobs=-1,
)

# Runs one fit per (parameter combination, fold) on the training split.
grid_search.fit(X_train, y_train)

print("\n‚úÖ GridSearchCV complete!")

In [None]:
# Display the best hyperparameters found
print("\nüèÜ Best Hyperparameters Found:")
print("="*50)
for param, value in grid_search.best_params_.items():
    print(f"  {param}: {value}")

print(f"\nüìà Best Cross-Validation Score: {grid_search.best_score_:.4f}")

# Best estimator found by the search (exposed by GridSearchCV after fitting).
best_rf = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test set.
tuned_preds = best_rf.predict(X_test)
tuned_accuracy = accuracy_score(y_test, tuned_preds)

# Baseline = the untuned random forest from the "Random Forest" section,
# scored on the same test set via its stored predictions (rf_preds).
# This value was never computed before, so the comparison raised a NameError.
baseline_accuracy = accuracy_score(y_test, rf_preds)

improvement = tuned_accuracy - baseline_accuracy
# Guard the relative figure against a zero baseline.
relative_improvement = (
    improvement / baseline_accuracy * 100 if baseline_accuracy else float('nan')
)

print(f"\nüéØ Test Set Performance:")
print(f"  Baseline Accuracy: {baseline_accuracy:.4f}")
print(f"  Tuned Accuracy:    {tuned_accuracy:.4f}")
print(f"  Improvement:       {improvement:.4f} ({relative_improvement:.2f}%)")

In [None]:
# Convert GridSearchCV results to a DataFrame for analysis
results_df = pd.DataFrame(grid_search.cv_results_)

# Ten best parameter combinations by mean cross-validated score, best first.
print("\nüìä Top 10 Parameter Combinations:\n")
top_results = results_df[['params', 'mean_test_score', 'std_test_score']].sort_values(
    'mean_test_score', ascending=False
).head(10)

# Number the rows by their position after sorting. The DataFrame index still
# holds each combination's original position in the grid, so it cannot be
# used as the rank.
for rank, (_, row) in enumerate(top_results.iterrows(), start=1):
    print(f"Rank {rank}: Score = {row['mean_test_score']:.4f} (+/- {row['std_test_score']:.4f})")
    print(f"  Parameters: {row['params']}\n")

## Input Function

In [None]:
def predict_protein_classification(residue_count, resolution, molecular_weight,
                                   crystallization_temp, density_matthews,
                                   density_percent_sol, ph_value, publication_year,
                                   experimental_technique='X-RAY DIFFRACTION',
                                   macromolecule_type='Protein',
                                   crystallization_method='VAPOR DIFFUSION, HANGING DROP'):
    """
    Predicts the classification of a protein structure based on its metadata.

    The raw inputs are put through the same preprocessing the notebook applied
    to the training data before they reach the model: one-hot columns for the
    three categorical inputs, the demo MinMax scaling (``scaler_demo``), and
    the train-fitted MinMax scaling (``scaler``). Predicted class codes are
    translated back to classification names with ``label_encoder``.

    Relies on these notebook-level objects having been created by the cells
    above, run once and in order: ``X_train``, ``best_rf``, ``scaler_demo``,
    ``scaler`` and ``label_encoder``.

    Parameters:
    -----------
    residue_count : int
        Number of residues in the protein
    resolution : float
        Resolution of the structure in Angstroms
    molecular_weight : float
        Molecular weight of the structure
    crystallization_temp : float
        Crystallization temperature in Kelvin
    density_matthews : float
        Matthews coefficient
    density_percent_sol : float
        Percent solvent content
    ph_value : float
        pH value during crystallization
    publication_year : int
        Year of publication
    experimental_technique : str, optional
        Experimental technique used (default: 'X-RAY DIFFRACTION')
    macromolecule_type : str, optional
        Type of macromolecule (default: 'Protein')
    crystallization_method : str, optional
        Method used for crystallization (default: 'VAPOR DIFFUSION, HANGING DROP')

    Returns:
    --------
    dict : Dictionary with the keys
        'predicted_classification' (str): name of the predicted class,
        'confidence' (float): highest predicted class probability,
        'top_3_predictions' (list of (str, float)): the three most probable
            class names with their probabilities, most probable first,
        'input_summary' (dict): echo of a few of the inputs.

    Warns:
    ------
    UserWarning
        If a categorical value has no matching one-hot column in the training
        data; that input is then left as "no category set".
    """
    import warnings

    # Raw numeric inputs, keyed by the training column names.
    input_data = {
        'residueCount': residue_count,
        'resolution': resolution,
        'structureMolecularWeight': molecular_weight,
        'crystallizationTempK': crystallization_temp,
        'densityMatthews': density_matthews,
        'densityPercentSol': density_percent_sol,
        'phValue': ph_value,
        'publicationYear': publication_year
    }

    # Single-row frame with exactly the training columns, in training order.
    # Columns the caller did not supply (one-hot indicators and any other
    # training feature) start out as False.
    input_df = pd.DataFrame([input_data]).reindex(columns=X_train.columns, fill_value=False)

    # Switch on the one-hot column that matches each categorical input.
    requested_dummies = {
        'experimental_technique': f'experimentalTechnique_{experimental_technique}',
        'macromolecule_type': f'macromoleculeType_{macromolecule_type}',
        'crystallization_method': f'crystallizationMethod_{crystallization_method}',
    }
    for argument_name, dummy_col in requested_dummies.items():
        if dummy_col in input_df.columns:
            input_df[dummy_col] = True
        else:
            warnings.warn(
                f"{argument_name}: no training column named {dummy_col!r}; "
                "this category is ignored for the prediction."
            )

    # Uniform float dtype so the scaled values can be written back cleanly.
    input_df = input_df.astype(float)

    # Replay the scaling applied to the training data, in the same order:
    # first the demo scaler that rewrote df_encoded, then the scaler fitted on
    # X_train. Each scaler is given exactly the columns it was fitted on.
    demo_cols = list(scaler_demo.feature_names_in_)
    input_df[demo_cols] = scaler_demo.transform(input_df[demo_cols])
    scaled_cols = list(scaler.feature_names_in_)
    input_df[scaled_cols] = scaler.transform(input_df[scaled_cols])

    # Make prediction (integer class codes and per-class probabilities).
    prediction = best_rf.predict(input_df)[0]
    prediction_proba = best_rf.predict_proba(input_df)[0]

    # Three most probable classes, most probable first.
    classes = best_rf.classes_
    top_3_indices = np.argsort(prediction_proba)[-3:][::-1]

    # Translate integer codes back into classification names.
    predicted_name = str(label_encoder.inverse_transform([prediction])[0])
    top_names = label_encoder.inverse_transform(classes[top_3_indices])
    top_predictions = [
        (str(name), prediction_proba[idx])
        for name, idx in zip(top_names, top_3_indices)
    ]

    return {
        'predicted_classification': predicted_name,
        'confidence': prediction_proba.max(),
        'top_3_predictions': top_predictions,
        'input_summary': {
            'residue_count': residue_count,
            'resolution': resolution,
            'molecular_weight': molecular_weight,
            'macromolecule_type': macromolecule_type
        }
    }

print("‚úÖ Prediction function created successfully!")

In [None]:
def display_prediction(result, example_name):
    """
    Print a prediction result as a formatted report.

    Parameters:
    -----------
    result : dict
        Dictionary with the keys 'input_summary' (dict of label -> value),
        'predicted_classification', 'confidence' (float between 0 and 1) and
        'top_3_predictions' (iterable of (class label, probability) pairs).
        Class labels may be strings or integer codes.
    example_name : str
        Heading printed at the top of the report.

    Returns:
    --------
    None. The report is written to standard output.
    """
    divider = "=" * 70

    print("\n" + divider)
    print(f"üî¨ {example_name}")
    print(divider)

    # Input summary: keys such as 'residue_count' are shown as 'Residue Count'.
    print("\nüì• Input Features:")
    for key, value in result['input_summary'].items():
        print(f"  {key.replace('_', ' ').title()}: {value}")

    # Main prediction
    print(f"\nüéØ Predicted Classification: {result['predicted_classification']}")
    print(f"üìä Confidence: {result['confidence']:.2%}")

    # Top 3 predictions, each with a bar proportional to its probability.
    print("\nüèÜ Top 3 Predictions:")
    for i, (classification, prob) in enumerate(result['top_3_predictions'], 1):
        bar = '‚ñà' * int(prob * 50)
        # str(): the 's' format code rejects integers, and class labels can be
        # integer codes rather than names.
        print(f"  {i}. {str(classification):30s} {prob:6.2%} {bar}")

    print(divider)

print("‚úÖ Display function created successfully!")

In [None]:
# Example 1: a small DNA structure, described as keyword arguments.
example_1_inputs = dict(
    residue_count=24,
    resolution=2.0,
    molecular_weight=8000,
    crystallization_temp=277,
    density_matthews=2.0,
    density_percent_sol=40,
    ph_value=7.0,
    publication_year=2020,
    experimental_technique='X-RAY DIFFRACTION',
    macromolecule_type='DNA',
    crystallization_method='VAPOR DIFFUSION, HANGING DROP',
)
result1 = predict_protein_classification(**example_1_inputs)

display_prediction(result1, "Example 1: Small DNA Structure")