In [3]:
!wget -q https://github.com/YukiJudaiYubel/372Final/raw/main/Test.csv
!wget -q https://github.com/YukiJudaiYubel/372Final/raw/v2.0/Train.csv

In [63]:
import pandas as pd

# Load both splits; each CSV carries the feature columns plus the 'year' target.
xy_train = pd.read_csv('Train.csv', encoding='cp1252')
xy_test = pd.read_csv('Test.csv', encoding='cp1252')

# Features: every column except the target.
x_train = xy_train.drop('year', axis=1)
x_test = xy_test.drop('year', axis=1)

# Targets: kept as single-column DataFrames (double-bracket selection).
y_train = xy_train[['year']]
y_test = xy_test[['year']]

# Remember the identifiers of the test rows.
testing_ids = x_test['ID']


In [21]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import cohen_kappa_score, make_scorer

# Model-selection metric: Cohen's kappa with quadratic weights.
kappa_scorer = make_scorer(cohen_kappa_score, weights="quadratic")

# preprocessing
np.random.seed(0)

# Numeric columns only get imputation. The 'most_frequent' strategy set here
# is a placeholder: the grid below overrides it with 'mean'.
numeric_features = ['nrgy', 'dnce']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_features = ['top genre']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# NOTE(review): the final step is named 'regressor' but holds a classifier.
# The name is kept because the param_grid keys below depend on it.
# random_state replaces the deprecated `seed` alias of the sklearn wrapper.
regr = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', XGBClassifier(
                          objective='multi:softmax', random_state=1))])

# select the features
X_train = x_train[[*numeric_features, *categorical_features]]
X_test = x_test[[*numeric_features, *categorical_features]]


# apply the grid search
# NOTE(review): XGBoost documents max_depth=0 as "no limit" and states that
# the exact tree method requires a non-zero value — confirm 0 is intended.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean'],
    'regressor__n_estimators': [150, 300],
    'regressor__max_depth':[0, 30]
}

grid_search = GridSearchCV(
    regr, param_grid, cv=6, verbose=3, n_jobs=2,
    scoring=kappa_scorer)
# y_train is a single-column DataFrame; flatten it to 1-D so the estimator
# does not emit a column-vector conversion warning on every fit.
grid_search.fit(X_train, np.ravel(y_train))

print('best score {}'.format(grid_search.best_score_))


Fitting 6 folds for each of 4 candidates, totalling 24 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  24 out of  24 | elapsed:   18.3s finished
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


best score 0.25007196417320227


In [5]:
# testing part
from sklearn.metrics import accuracy_score

In [6]:
def evaluate(model, your_x, your_y, offsets=(0, 1, -1)):
  """Print and return the best accuracy of ``model`` over shifted predictions.

  The model's predictions are compared against ``your_y`` once per entry in
  ``offsets`` (each offset is added to every prediction), and the highest
  accuracy is reported.

  Note: the best offset is picked using the true labels, so with more than
  one offset the reported number is an optimistic estimate rather than a
  plain accuracy. Pass ``offsets=(0,)`` for the unshifted accuracy.

  Args:
    model: fitted estimator exposing ``predict``.
    your_x: features passed to ``model.predict``.
    your_y: true labels to score against.
    offsets: shifts added to the predictions before scoring; the default
      reproduces the original behaviour (unshifted, +1, -1).

  Returns:
    The maximum accuracy over ``offsets`` (the same value that is printed).
  """
  y_p = model.predict(your_x)
  acc = max(accuracy_score(your_y, y_p + shift) for shift in offsets)
  print('acc', acc)
  # Return the score so callers that assign the result do not end up with None.
  return acc

In [9]:
# Score the fitted search on the training data (evaluate prints the result).
train_acc = evaluate(model=grid_search, your_x=x_train, your_y=y_train)



acc 0.9265734265734266


In [68]:
# Split the held-out data into validation and test halves. The inputs are
# re-derived from xy_test rather than from the current x_test/y_test, so
# re-running this cell no longer halves the test set again on every run.
# random_state pins the partition so the reported scores are reproducible.
x_valid, x_test, y_valid, y_test = train_test_split(
    xy_test.drop(columns=['year']), xy_test[['year']],
    test_size=.50, random_state=0)
valid_acc = evaluate(grid_search, x_valid, y_valid)

acc 0.5




In [69]:
# Score the fitted search on the test half (evaluate prints the result).
test_acc = evaluate(model=grid_search, your_x=x_test, your_y=y_test)

acc 0.0


