In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [None]:
# Load the training CSV; '?' entries are parsed as missing values (NaN).
train_raw = pd.read_csv('input/CSE7302c_train-1539668060821.csv', na_values='?')
# Preview the first rows.
train_raw.head()

In [None]:
# Load the test CSV with the same '?' -> NaN convention as the training file.
test_raw = pd.read_csv('input/CSE7302c_test-1539668060821.csv', na_values='?')
# Preview the first rows.
test_raw.head()

In [None]:
# Number of distinct values in each training column.
train_raw.nunique()

In [None]:
# Number of distinct values in each test column.
test_raw.nunique()

In [None]:
## A few columns have 5 or fewer unique values; these are good candidates for being treated as categorical.

In [None]:
# Summarise the training frame: column dtypes and non-null counts.
train_raw.info()

In [None]:
# Summarise the test frame: column dtypes and non-null counts.
test_raw.info()

In [None]:
## Dataset dimensions (as stated in the original notes):
#   train: 68636 observations, 30 attributes
#   test:  29414 observations, 30 attributes

# 'id' is reported to be unique for every row (68636 train / 29414 test values),
# so it is a nominal label rather than a feature and is removed from both frames.
train_data = train_raw.drop(columns='id')
test_data = test_raw.drop(columns='id')
train_data.head()

In [None]:
# Preview the test frame after removing 'id'.
test_data.head()

In [None]:
# Split columns by cardinality in the training data: at most 5 distinct values
# -> categorical, more than 5 -> numerical.
unique_counts = train_data.nunique()
is_low_cardinality = unique_counts <= 5
cat_cols = train_data.columns[is_low_cardinality]
print(cat_cols)
num_cols = train_data.columns[~is_low_cardinality]
num_cols

In [None]:
## type casting
# Cast the low-cardinality columns to pandas' 'category' dtype in both frames.
for frame in (train_data, test_data):
    frame[cat_cols] = frame[cat_cols].astype('category')

In [None]:
# Confirm the training dtypes after casting the low-cardinality columns to 'category'.
train_data.dtypes

In [None]:
# Confirm the test dtypes after casting the low-cardinality columns to 'category'.
test_data.dtypes

In [None]:
## NA Values
# Count missing values per column in the training data.
train_data.isna().sum()

In [None]:
# Count missing values per column in the test data.
test_data.isna().sum()

In [None]:
## So we have NA values in test but not in train. Good to know.

In [None]:
## Ref: https://scikit-learn.org/stable/modules/impute.html#impute
## categorical data
# Learn the most frequent value of each categorical column from the TRAINING
# data and apply it to the test data. Fitting the imputer on the test set
# itself would leak test-set statistics into the preprocessing.
cat_imputer = SimpleImputer(strategy='most_frequent')
cat_imputer.fit(train_data[cat_cols])
# NOTE(review): cat_cols still contains the target 'class' at this point, so
# missing test labels are filled with the training mode as well -- confirm this
# is intended before trusting the reported test accuracy.
test_data[cat_cols] = pd.DataFrame(
    cat_imputer.transform(test_data[cat_cols]),
    columns=list(cat_cols),
    index=test_data.index,  # keep row alignment explicit when assigning back
)

In [None]:
# Re-count missing values in the test data after imputing the categorical columns.
test_data.isna().sum()

In [None]:
## Ref: https://scikit-learn.org/stable/modules/impute.html#knnimpute
## impute numerical data
# Fit the KNN imputer on the TRAINING data so that missing test values are
# filled from training neighbours; fitting on the test set would let test-set
# information shape its own preprocessing.
num_imputer = KNNImputer()
num_imputer.fit(train_data[num_cols])
test_data[num_cols] = pd.DataFrame(
    num_imputer.transform(test_data[num_cols]),
    columns=num_cols,
    index=test_data.index,  # keep row alignment explicit when assigning back
)

In [None]:
# Re-count missing values in the test data after imputing the numerical columns.
test_data.isna().sum()

In [None]:
# Check the test dtypes after imputation.
# NOTE(review): the imputed values were assigned back from freshly built
# DataFrames, so the 'category' dtype may not have survived -- verify here.
test_data.dtypes

In [None]:
## Drop 'class' from cat_cols since it is the target variable.
## Also drop 'jet4b.tag' for now since its values are reported to be imbalanced.
# A membership filter (instead of Index.drop) removes both names as the comment
# above promises, and keeps this cell safe to re-run once cat_cols is a list.
excluded_cols = {'class', 'jet4b.tag'}
cat_cols = [col for col in cat_cols if col not in excluded_cols]
cat_cols

In [None]:
# 'class' is the target and 'jet4b.tag' is excluded as a feature. The SAME
# columns must be removed from both splits; otherwise the model is trained on
# one feature set and asked to predict on another.
non_feature_cols = ['class', 'jet4b.tag']
x_train, y_train = train_data.drop(non_feature_cols, axis=1), train_data['class']
x_test, y_test = test_data.drop(non_feature_cols, axis=1), test_data['class']
print(x_train.shape)
print(x_test.shape)

In [None]:
# One-hot encode the training features (first level of each category dropped).
dummy_x_train = pd.get_dummies(x_train, drop_first=True)

# Encode the test features with the TRAINING categorical dtypes so that both
# frames produce dummy columns for exactly the same levels, whatever dtype the
# test columns currently have. Test values that are not a training level become
# NaN and therefore encode as all-zero dummies.
shared_cat_cols = x_train.select_dtypes(include='category').columns.intersection(x_test.columns)
x_test_typed = x_test.astype({col: x_train[col].dtype for col in shared_cat_cols})
dummy_x_test = pd.get_dummies(x_test_typed, drop_first=True)

# Keep only the columns present in both frames, in the same order, so the
# estimator is fitted and evaluated on an identical feature layout.
dummy_x_train, dummy_x_test = dummy_x_train.align(dummy_x_test, join='inner', axis=1)

In [None]:
# Shape and preview of the one-hot encoded training features.
print(dummy_x_train.shape)
dummy_x_train.head()

In [None]:
# Shape and preview of the one-hot encoded test features.
print(dummy_x_test.shape)
dummy_x_test.head()

In [None]:
# Support vector classifier created with scikit-learn's default hyperparameters.
model = SVC()

In [None]:
# Fit the classifier on the encoded training features and the 'class' labels.
model.fit(dummy_x_train, y_train)

In [None]:
# Score the fitted model on the data it was trained on.
y_train_pred = model.predict(dummy_x_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'train accuracy: {train_accuracy}')

In [None]:
# Re-read the test labels as a categorical series, predict on the encoded test
# features and report the accuracy.
y_test = test_data['class'].astype('category')
y_test_pred = model.predict(dummy_x_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'test accuracy: {test_accuracy}')

In [None]:
# Pair each test 'id' with its predicted class and write the result to CSV.
prediction = pd.DataFrame(y_test_pred, columns=['predictions'])
# Named test_ids rather than `id` so the Python builtin id() is not shadowed.
test_ids = test_raw[['id']]
# Keep the combined frame in `output`; to_csv() returns None when given a path,
# so chaining it onto the assignment would leave `output` empty.
output = pd.concat([test_ids, prediction], axis=1)
output.to_csv('output/prediction.csv', index=False)