# SVM Model with scikit-learn


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from svm_scratch import LinearSVM_Dual
from sklearn.model_selection import GridSearchCV

In [14]:

# Load the pre-encoded training split and the hold-out split.
# NOTE(review): the hold-out file is named "X_test_encoded.csv" but is used as
# the validation set throughout this notebook — confirm that is intended.
df_train = pd.read_csv('../X_train_encoded.csv')
df_val = pd.read_csv('../X_test_encoded.csv')

# Separate the 'Depression' target column from the feature columns.
X, y = df_train.drop(columns='Depression'), df_train['Depression']
X_val, y_val = df_val.drop(columns='Depression'), df_val['Depression']

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# Two-step pipeline: min-max scaling followed by a linear-kernel SVM.
# Keeping the scaler inside the pipeline means it is re-fitted on each
# cross-validation training fold by GridSearchCV.
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('svm', SVC(kernel='linear', random_state=42)),
])

# Candidate values for the SVM regularisation parameter C
# (the "svm__" prefix routes the parameter to the 'svm' pipeline step).
param_grid = {'svm__C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]}

# 5-fold cross-validated search over C, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

# Score the best pipeline found by the search on the held-out validation set.
best_model = grid_search.best_estimator_
val_predictions = best_model.predict(X_val)

# Report validation accuracy and the per-class breakdown.
print("\nValidation Set Results:")
print("Accuracy on validation set:", accuracy_score(y_val, val_predictions))
print("\nClassification Report on Validation Set:")
print(classification_report(y_val, val_predictions, digits=4))

Best parameters: {'svm__C': 1}
Best cross-validation accuracy: 0.8466706741290597

Validation Set Results:
Accuracy on validation set: 0.8469534050179212

Classification Report on Validation Set:
              precision    recall  f1-score   support

           0     0.8336    0.7882    0.8102      2313
           1     0.8556    0.8886    0.8718      3267

    accuracy                         0.8470      5580
   macro avg     0.8446    0.8384    0.8410      5580
weighted avg     0.8465    0.8470    0.8463      5580



In [16]:
import pandas as pd
import os
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

# After getting predictions and running classification_report
def save_metrics_to_csv(model_name, y_true, y_pred, filepath='../model_metrics.csv'):
    """
    Compute classification metrics and upsert them into a CSV file.

    The CSV holds one row per model, keyed by the 'model' column. If the
    file does not exist it is created; if a row for ``model_name`` already
    exists its values are overwritten in place; otherwise a new row is
    appended at the end.

    Parameters:
    -----------
    model_name : str
        Name of the model (value stored in the 'model' column)
    y_true : array-like
        True labels (metrics are computed for labels 0 and 1)
    y_pred : array-like
        Predicted labels
    filepath : str
        Path to CSV file

    Returns:
    --------
    pandas.DataFrame
        The complete metrics table as it was written to ``filepath``.
    """
    # Build the row in a fixed column order: accuracy first, then
    # precision/recall/F1 for class 0, class 1 and the macro average.
    metrics_dict = {
        'model': model_name,
        'accuracy': accuracy_score(y_true, y_pred),
    }
    metric_variants = (
        ('class0', {'pos_label': 0}),
        ('class1', {'pos_label': 1}),
        ('avg', {'average': 'macro'}),
    )
    for suffix, score_kwargs in metric_variants:
        metrics_dict[f'precision_{suffix}'] = precision_score(y_true, y_pred, **score_kwargs)
        metrics_dict[f'recall_{suffix}'] = recall_score(y_true, y_pred, **score_kwargs)
        metrics_dict[f'f1_{suffix}'] = f1_score(y_true, y_pred, **score_kwargs)

    if os.path.exists(filepath):
        metrics_df = pd.read_csv(filepath)
        row_mask = metrics_df['model'] == model_name

        if row_mask.any():
            # Update the existing row one column at a time with scalar values.
            # Assigning a whole pd.Series (indexed by column name) to a boolean
            # row selection is aligned against the row labels instead, which
            # blanks the row out with NaN rather than updating it.
            for column, value in metrics_dict.items():
                metrics_df.loc[row_mask, column] = value
        else:
            # First time this model is recorded: append a new row.
            metrics_df = pd.concat(
                [metrics_df, pd.DataFrame([metrics_dict])],
                ignore_index=True,
            )
    else:
        # No file yet: start a fresh single-row table.
        metrics_df = pd.DataFrame([metrics_dict])

    # Persist without the positional index so the file keeps one column per metric.
    metrics_df.to_csv(filepath, index=False)
    print(f"Metrics saved to {filepath}")

    return metrics_df

# Record the validation-set metrics of the tuned scikit-learn linear SVM
# under the row key "SVM_Linear". The call is the cell's last expression,
# so the returned metrics table is also displayed as the cell output.
save_metrics_to_csv("SVM_Linear", y_val, val_predictions)

Metrics saved to ../model_metrics.csv


Unnamed: 0,model,accuracy,precision_class0,recall_class0,f1_class0,precision_class1,recall_class1,f1_class1,precision_avg,recall_avg,f1_avg
0,SVM_Linear,0.846953,0.833562,0.788154,0.810222,0.855585,0.888583,0.871772,0.844573,0.838368,0.840997


## Custom SVM Implementation (from scratch)

### Train and Evaluate Custom SVM

In [17]:
# Hyperparameters for the from-scratch linear SVM
C_value = 1.0  # C should be taken from the best GridSearchCV result
tol_value = 1e-3  # Tolerance; 1e-4 or 1e-3 are the usual choices
max_iter_value = 1000  # Maximum number of iterations

# Train on the first 3000 rows only. The subset gets its own names so the
# full X / y stay intact: overwriting them would make a later re-run of the
# grid-search cell silently train on 3000 rows instead of the whole set.
n_train_samples = 3000
X_train_subset = X.head(n_train_samples)  # first 3000 samples
y_train_subset = y.head(n_train_samples)  # first 3000 labels
X_train_array = X_train_subset.to_numpy()
y_train_array = y_train_subset.to_numpy()

# Train and evaluate the model
# NOTE(review): unlike the scikit-learn pipeline, no MinMaxScaler is applied
# to the features here — confirm this is intentional.
svm_scratch = LinearSVM_Dual(C=C_value, tol=tol_value, max_iter=max_iter_value)
svm_scratch.fit(X_train_array, y_train_array)  # fits on the 3000-row subset, not the full training data

# Evaluate on the validation set
# NOTE(review): fit() receives NumPy arrays while predict() receives a
# DataFrame — confirm LinearSVM_Dual.predict is meant to accept both.
y_pred_scratch = svm_scratch.predict(X_val)
print('Accuracy (scratch):', accuracy_score(y_val, y_pred_scratch))
print('\nClassification Report:')
print(classification_report(y_val, y_pred_scratch, digits=4))

Accuracy (scratch): 0.843010752688172

Classification Report:
              precision    recall  f1-score   support

           0     0.8195    0.7968    0.8080      2313
           1     0.8589    0.8757    0.8672      3267

    accuracy                         0.8430      5580
   macro avg     0.8392    0.8363    0.8376      5580
weighted avg     0.8426    0.8430    0.8427      5580



In [19]:
# Record the validation-set metrics of the from-scratch SVM under the row key
# "SVM_Linear_scratch"; the returned metrics table is displayed as the cell output.
save_metrics_to_csv("SVM_Linear_scratch",y_val, y_pred_scratch)

Metrics saved to ../model_metrics.csv


Unnamed: 0,model,accuracy,precision_class0,recall_class0,f1_class0,precision_class1,recall_class1,f1_class1,precision_avg,recall_avg,f1_avg
0,SVM_Linear,0.846953,0.833562,0.788154,0.810222,0.855585,0.888583,0.871772,0.844573,0.838368,0.840997
1,logictics_Regression_lib,0.846057,0.830455,0.789883,0.809661,0.856213,0.885828,0.870769,0.843334,0.837856,0.840215
2,decision_tree_lib,0.824731,0.81061,0.753134,0.780816,0.833576,0.875421,0.853986,0.822093,0.814278,0.817401
3,LogisticRegression_Scratch,0.841219,0.832634,0.772157,0.801256,0.846579,0.890113,0.867801,0.839607,0.831135,0.834528
4,SVM_Linear_scratch,0.843011,0.819475,0.796801,0.807979,0.858901,0.875727,0.867232,0.839188,0.836264,0.837606
