In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss


In [2]:
# Importing and Cleaning (artist name and track name)
import re

trainClean = pd.read_csv('../data/trainClean.csv').fillna(0)

def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)   
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)   
    text = re.sub(r'www.[^ ]+', '', text)  
    text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)  
    text = re.sub(r'[^a-zA-Z]', ' ', text)   
    text = [token for token in text.split() if len(token) > 2]
    text = ' '.join(text)
    return text

trainClean['artist_name'] = trainClean['artist_name'].apply(clean_text).str.strip().str.replace(' ', '_')
trainClean['track_name'] = trainClean['track_name'].apply(clean_text).str.strip().str.replace(' ', '_')

# Encoder
ord_enc = OrdinalEncoder()
trainClean['artist_name_code'] = ord_enc.fit_transform(trainClean[['artist_name']])

In [3]:
# Assignment

X = trainClean.drop(['artist_name', 'track_name', 'class'], axis=1)
y = trainClean['class']
cat_cols = ['artist_name', 'track_name']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20)

In [4]:
# Model testing
from xgboost.sklearn import XGBClassifier

model = XGBClassifier()
model.fit(X_train,y_train)
predictions = model.predict(X_test)

XGB_score = accuracy_score(y_test,predictions)
print(XGB_score)

0.5525


In [5]:
n_folds = 10
subbed = []
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=10)


for fold, (train_idx, test_idx) in enumerate(kf.split(X_train, y_train)):
    print('=============== Fold No:',fold+1,'===============')
    X_tr, X_tst = X_train.iloc[train_idx], X_train.iloc[test_idx]
    y_tr, y_tst = y_train.iloc[train_idx], y_train.iloc[test_idx]
    
    model = CatBoostClassifier(n_estimators=10000, random_state=10, eval_metric= 'MultiClass', cat_features=cat_cols)
    model.fit(X_tr, y_tr,eval_set=[(X_tst, y_tst)], early_stopping_rounds=30, verbose=500)
    print(log_loss(y_tst, model.predict_proba(X_tst)))
    subbed.append(log_loss(y_tst, model.predict_proba(X_tst)))
    pred = model.predict_proba(X_test)
print(np.mean(subbed))



ValueError: 'artist_name' is not in list