# Assignment 4 (Forest Cover)

- Name: **Arnab Sen**
- Roll: **510519006**
- Date: **Sept 5, 2022**

In [None]:
from google.colab import drive
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
import numpy as np

In [None]:
# Mount Google Drive into the Colab runtime so files stored there can be read.
drive.mount('/content/drive')
# Folder on the mounted drive from which the dataset CSV is loaded.
BASE_PATH = '/content/drive/MyDrive/Colab_Notebooks/ML_DRIVE/Assign_4/dataset'

In [None]:
# Read the CSV from the dataset folder, then report its dimensions and column labels.
csv_path = f"{BASE_PATH}/train_and_test2.csv"
dataset = pd.read_csv(csv_path)
print("Dataset shape:", dataset.shape)
print("Dataset columns:", dataset.columns)

In [None]:
# Number of missing values in each column (`isna` is the canonical name of `isnull`).
dataset.isna().sum()

In [None]:
# Remove 'Passengerid' and the columns named 'zero', 'zero.1', ..., 'zero.18'.
columns_to_drop = ['Passengerid', 'zero'] + [f'zero.{i}' for i in range(1, 19)]
dataset = dataset.drop(columns=columns_to_drop)
dataset.info()

In [None]:
# One-hot encode the listed columns: each is replaced by one indicator column per value.
encoded_cols = ["Pclass", "Embarked"]
dataset = pd.get_dummies(data=dataset, columns=encoded_cols)
dataset.info()

In [None]:
scaled_cols = ['Age','Fare']

# StandardScaler standardises every column independently, so a single fit over
# both columns produces the same values as fitting one scaler per column.
# NOTE(review): the scaler is fit on the full dataset, before the train/test
# split, so test rows contribute to the mean/std — consider fitting on the
# training split only.
dataset[scaled_cols] = StandardScaler().fit_transform(dataset[scaled_cols])

dataset.head()

In [None]:
# Separate the label column from the feature matrix.
target_col = '2urvived'
X = dataset.drop(columns=[target_col])
y = dataset[target_col]

In [None]:
# Fix the seed so the split, and every accuracy or "optimal" hyper-parameter
# derived from it further down, is reproducible across kernel restarts.
# Stratifying on y keeps the class ratio the same in both partitions.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=RANDOM_STATE, stratify=y
)

print(X_train.shape, y_train.shape)

In [None]:
def svm_model_helper(X_train, y_train, X_test, y_test, kernel, degree):
    """Train one SVC and summarise how it performs on the evaluation split.

    Parameters
    ----------
    X_train, y_train : training features and labels passed to ``SVC.fit``.
    X_test, y_test : features and labels used for scoring.
    kernel : kernel name forwarded to ``SVC``.
    degree : polynomial degree forwarded to ``SVC``.

    Returns
    -------
    list
        ``[kernel, degree, accuracy, f1]`` where ``accuracy`` comes from
        ``SVC.score`` and ``f1`` is the macro-averaged F1 score of the
        predictions on ``X_test``.
    """
    classifier = SVC(kernel=kernel, degree=degree)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    return [
        kernel,
        degree,
        classifier.score(X_test, y_test),
        f1_score(y_test, predicted, average='macro'),
    ]

In [None]:
# (kernel, degree) pairs to compare, evaluated in this order.
kernel_configs = [
    ('linear', 1),
    ('poly', 2),
    ('poly', 3),
    ('poly', 5),
    ('rbf', 1),
    ('sigmoid', 1),
]
linear, poly2, poly3, poly5, rbf, sigmoid = [
    svm_model_helper(X_train, y_train, X_test, y_test, kernel, degree)
    for kernel, degree in kernel_configs
]

# One row per configuration, in the same order as kernel_configs.
pd.DataFrame(
    data=[linear, poly2, poly3, poly5, rbf, sigmoid],
    columns=['kernel', 'degree', 'accuracy', 'f1_score'],
)

In [None]:
# Keep only the two features used for the 2-D decision-region plots.
plot_features = ['Age', 'Fare']
X_train_new = X_train[plot_features]
X_test_new = X_test[plot_features]

In [None]:
# Training points for the 2-D plots: column 0 is 'Age', column 1 is 'Fare'.
xr, yr = X_train_new.values, y_train.values
x_min, x_max = xr[:, 0].min() - 0.1, xr[:, 0].max() + 0.1
y_min, y_max = xr[:, 1].min() - 0.1, xr[:, 1].max() + 0.1
# Mesh resolution; it has to be defined before the grid below is built.
h = 0.02
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))


def plotter(kernel, degree = 0):
    """Fit an SVC on the (Age, Fare) training points and draw its decision regions.

    The classifier is trained on the module-level arrays ``xr`` / ``yr`` and
    evaluated on every node of the ``xx`` / ``yy`` mesh; the predicted classes
    are drawn as filled contours with the training points scattered on top.

    Parameters
    ----------
    kernel : kernel name forwarded to ``SVC``.
    degree : polynomial degree forwarded to ``SVC`` (default 0).

    Returns
    -------
    None
        The figure is shown as a side effect.
    """
    model = SVC(kernel=kernel, degree=degree)
    # Fit once, on the same ndarray representation used for the mesh predictions.
    model.fit(xr, yr)

    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(mesh_points).reshape(xx.shape)

    plt.figure()
    plt.contourf(xx, yy, Z, cmap=plt.cm.coolwarm, alpha=0.8)
    plt.scatter(xr[:, 0], xr[:, 1], c=yr, cmap=plt.cm.coolwarm)
    plt.xlabel('Age')
    plt.ylabel('Fare')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    # Label the figure so it can be identified without reading the calling cell.
    plt.title(f'SVC decision regions (kernel={kernel}, degree={degree})')
    plt.show()

In [None]:
# Decision regions of the linear kernel on (Age, Fare).
plotter(kernel='linear')

In [None]:
# Decision regions of the degree-2 polynomial kernel.
plotter(kernel='poly', degree=2)

In [None]:
# Decision regions of the degree-3 polynomial kernel.
plotter(kernel='poly', degree=3)

In [None]:
# Decision regions of the sigmoid kernel.
plotter(kernel='sigmoid')

In [None]:
# Decision regions of the RBF kernel.
plotter(kernel='rbf')

In [None]:
def findOptimalC(X_train, y_train, X_test, y_test, start, end, step, gamma=0.5):
    """Sweep the SVC parameter ``C`` over ``[start, end]`` and score every candidate.

    An RBF-kernel ``SVC`` is trained on ``(X_train, y_train)`` for each
    candidate and scored with ``SVC.score`` on ``(X_test, y_test)``.

    Parameters
    ----------
    X_train, y_train : data each model is fitted on.
    X_test, y_test : data each model is scored on. If this is also the final
        hold-out set, the chosen ``C`` has been tuned on it and the reported
        accuracy is optimistic.
    start, end : inclusive bounds of the sweep.
    step : distance between consecutive candidates; must be positive.
    gamma : RBF ``gamma`` used for every model (default 0.5).

    Returns
    -------
    list
        ``[accuracies, C_values]`` — two parallel lists, one entry per
        candidate, in increasing order of ``C``. Both are empty when
        ``end < start``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the sweep would never terminate).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    span = end - start
    # Derive each candidate from its index rather than by repeatedly adding
    # `step`: accumulated floating-point error can push the last candidate
    # marginally past `end` and silently drop it. The small tolerance keeps the
    # endpoint when the division lands just below a whole number, and `min`
    # clamps a last candidate that overshoots `end` by rounding.
    n_values = int(span / step + 1e-9) + 1 if span >= 0 else 0
    C_values = [min(start + i * step, end) for i in range(n_values)]

    accuracies = []
    for c in C_values:
        model = SVC(kernel='rbf', C=c, gamma=gamma)
        model.fit(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))

    return [accuracies, C_values]

In [None]:
# Coarse sweep: C from 0.001 up to 100 in steps of 0.5.
result = findOptimalC(X_train, y_train, X_test, y_test, start=0.001, end=100, step=0.5)
print(result)

In [None]:
# Narrower sweep: C from 0.001 up to 10 in steps of 0.1.
result = findOptimalC(X_train, y_train, X_test, y_test, start=0.001, end=10, step=0.1)
print(result)

In [None]:
# Fine sweep: C from 0.001 up to 1 in steps of 0.005.
result = findOptimalC(X_train, y_train, X_test, y_test, start=0.001, end=1, step=0.005)
print(result)

In [None]:
# Choose the C with the highest accuracy in the latest sweep
# (np.argmax returns the first index when several values tie).
accuracies, c_values = result
ind = np.argmax(accuracies)
optimal_c = c_values[ind]
print("Optimal C:",optimal_c)
print("Accuracy:", accuracies[ind])

In [None]:
def findOptimalG(X_train, y_train, X_test, y_test, C, start, end, step):
    """Sweep the RBF ``gamma`` over ``[start, end]`` at a fixed ``C`` and score every candidate.

    An RBF-kernel ``SVC`` is trained on ``(X_train, y_train)`` for each
    candidate and scored with ``SVC.score`` on ``(X_test, y_test)``.

    Parameters
    ----------
    X_train, y_train : data each model is fitted on.
    X_test, y_test : data each model is scored on. If this is also the final
        hold-out set, the chosen ``gamma`` has been tuned on it and the
        reported accuracy is optimistic.
    C : value of the SVC ``C`` parameter used for every model.
    start, end : inclusive bounds of the sweep.
    step : distance between consecutive candidates; must be positive.

    Returns
    -------
    list
        ``[accuracies, G_values]`` — two parallel lists, one entry per
        candidate, in increasing order of ``gamma``. Both are empty when
        ``end < start``.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the sweep would never terminate).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    span = end - start
    # Derive each candidate from its index rather than by repeatedly adding
    # `step`: accumulated floating-point error can push the last candidate
    # marginally past `end` and silently drop it. The small tolerance keeps the
    # endpoint when the division lands just below a whole number, and `min`
    # clamps a last candidate that overshoots `end` by rounding.
    n_values = int(span / step + 1e-9) + 1 if span >= 0 else 0
    G_values = [min(start + i * step, end) for i in range(n_values)]

    accuracies = []
    for g in G_values:
        model = SVC(kernel='rbf', C=C, gamma=g)
        model.fit(X_train, y_train)
        accuracies.append(model.score(X_test, y_test))

    return [accuracies, G_values]

In [None]:
# Coarse sweep at the selected C: gamma from 0.01 up to 10 in steps of 0.05.
result = findOptimalG(X_train, y_train, X_test, y_test, C=optimal_c, start=0.01, end=10, step=0.05)
print(result)

In [None]:
# Fine sweep at the selected C: gamma from 0.01 up to 1 in steps of 0.005.
result = findOptimalG(X_train, y_train, X_test, y_test, C=optimal_c, start=0.01, end=1, step=0.005)
print(result)

In [None]:
# Choose the gamma with the highest accuracy in the latest sweep
# (np.argmax returns the first index when several values tie).
accuracies, g_values = result
ind = np.argmax(accuracies)
optimal_g = g_values[ind]
print("Optimal G:",optimal_g)
print("Accuracy:", accuracies[ind])

In [None]:
# C and gamma are searched over the same grid: 0.01, 0.06, ..., 0.96 (stop value excluded).
grid_start, grid_stop, grid_step = 0.01, 1, 0.05
C_values = np.arange(grid_start, grid_stop, grid_step)
G_values = np.arange(grid_start, grid_stop, grid_step)

In [None]:
# Exhaustive search over every (C, gamma) pair for the RBF kernel, fitted on
# the training split with GridSearchCV's default cross-validation settings.
param_grid = dict(C=C_values, gamma=G_values, kernel=['rbf'])
grid = GridSearchCV(estimator=SVC(), param_grid=param_grid)
grid.fit(X_train, y_train)

In [None]:
# Show the estimator selected by the grid search.
best_model = grid.best_estimator_
print(best_model)

In [None]:
# Score the fitted grid search on the held-out test split.
acc = grid.score(X=X_test, y=y_test)
print("Accuracy:", acc)