<a href="https://colab.research.google.com/github/anniebbii/bork/blob/master/mlpc.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# take care of missing values?
# identify outliers? (missed comma, extra 0 etc.)
# standardize inputs?

# data before cleaning:
# y is names ['Bob' 'Atsuto' 'Jörg' ' ooh'] only one ' ooh'
# x5 is true/false ['False' 'True' '?' 'F' nan], two '?', one 'F', one nan
# x6 is letters ['F' 'E' 'A' 'D' 'B' 'Fx' 'C' '-0.46960' nan] only one '-0.4...', one nan

# import data as pandas dataframe
#df = pd.read_csv("TrainOnMe.csv")
url = 'https://raw.githubusercontent.com/anniebbii/mlpc/main/TrainOnMe.csv'
df = pd.read_csv(url)

# remove rows with weird entries since they're few and we got enough data
df = df.drop(df[df.y == ' ooh'].index)
df = df.drop(df[df.x5 == '?'].index)
df = df.drop(df[df.x5 == 'F'].index)
df = df.drop(df[df.x6 == '-0.46960'].index)
#df = df.dropna(axis=0, how='any')  # drop rows containing any null values

# encode categorical variables
df = df.replace('True', 1)
df = df.replace('False', 0)
df = df.replace('F', -1)
df = df.replace('Fx', 0)
df = df.replace('E', 1)
df = df.replace('D', 2)
df = df.replace('C', 3)
df = df.replace('B', 4)
df = df.replace('A', 5)
# encode labels
df = df.replace('Atsuto', 0)
df = df.replace('Bob', 1)
df = df.replace('Jörg', 2)

# change column types
df = df.astype({"x1": float, "x2": float})

#print(df.head())
#print(df.info())
#print(df.describe())

# separate features and labels
# separate train/validation and test data
X = df.iloc[:, 2:]
y = df.iloc[:, 1]

X_ttest = df.iloc[-100:, 2:]
y_ttest = df.iloc[-100:, 1]

# order in a dmatrix for xgboost
dm = xgb.DMatrix(data=X, label=y)

# split into train and validation data for cross validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=123)

# create an xgboost model
xg_model = xgb.XGBClassifier(objective ='multi:softprob', colsample_bytree = 0.6000000000000001, learning_rate = 0.7919793520294113, max_depth = 3, alpha = 3, n_estimators = 22)
#alpha=3, colsample_bytree=0.6000000000000001, learning_rate=0.7919793520294113, max_depth=3, n_estimators=22
# fit model to data
xg_model.fit(X_train, y_train)

# predict on validation dataset using trained model
preds = xg_model.predict(X_val)

# calculate accuracy
accuracy = accuracy_score(y_val, preds)
print("model accuracy", accuracy)

params = {"objective": "multi:softprob", 'colsample_bytree': 0.5, 'learning_rate': 0.8, 'max_depth': 5, 'alpha': 4, "num_class": 3}

cv_results = xgb.cv(dtrain=dm, params=params, nfold=10,
                    num_boost_round=3000, early_stopping_rounds=10, metrics="merror", as_pandas=True, seed=123)

#print(cv_results.tail(1))

# accuracy is 87%
# train the model on whole training set
xg_model.fit(X, y)


# clean test data
#for key in test:
#    print(test[key].unique())
# seems pretty clean

#print(test.info()) # it's freeking purrfect
#X_test = test.iloc[:, 1:]
#print(X_test.head())


from scipy import stats
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.metrics import accuracy_score

x = X

clf_xgb = XGBClassifier(objective = 'multi:softprob')
param_dist = {'learning_rate': [.25],
              'max_depth': [5],
              'alpha': [3],
              'n_estimators': [20, 25, 30],
              'colsample_bytree': [.5, .55, .6]}
clf = GridSearchCV(clf_xgb, param_grid = param_dist, scoring = 'accuracy', error_score = 0, verbose = 0, n_jobs = -1)

numFolds = 10
folds = KFold(n_splits = numFolds, shuffle = True)

estimators = []
results = np.zeros(len(X))
score = 0.0
for train_index, test_index in folds.split(X):                                                  # for each k
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]                               # create training set T_k
    y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()     # and validation set V_k
    clf.fit(X_train, y_train)                                                                   # do grid search CV

    estimators.append(clf.best_estimator_)
    results[test_index] = clf.predict(X_test)
    score += accuracy_score(y_test, results[test_index])
score /= numFolds

print("score:", score)


model accuracy 0.865
score: 0.8765858585858586


In [None]:
for i in estimators:
  print(i)

In [13]:
scores = []
for model in estimators:
    numFolds = 10
    folds = KFold(n_splits = numFolds, shuffle = True)
    results = np.zeros(len(X))
    score = []
    for train_index, test_index in folds.split(X):                                                  # for each k
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]                               # create training set T_k
        y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()     # and validation set V_k
        model.fit(X_train, y_train)                                                                   # do grid search CV
        results[test_index] = model.predict(X_test)
        score.append(accuracy_score(y_test, results[test_index]))
    scores.append([np.mean(score), np.std(score)])

for i in scores:
  print(i)


[0.8716161616161615, 0.03458710348934814]
[0.8765858585858586, 0.04240920368370538]
[0.8715959595959596, 0.04495811516480105]
[0.8695959595959597, 0.03451567030764494]
[0.8756363636363635, 0.032753163471270656]
[0.8766767676767676, 0.033125783290671784]
[0.8725656565656565, 0.034032346257293976]
[0.8796969696969696, 0.039282541276992974]
[0.8776868686868686, 0.02766741897701537]
[0.8666060606060606, 0.04245766360332607]


In [39]:
clf_xgb = XGBClassifier(objective = 'multi:softprob')
param_dist = {'learning_rate': [.8],
              'max_depth': [5],
              'alpha': [5],
              'n_estimators': [25],
              'colsample_bytree': [0.4, 0.5, 0.6]}

clf = GridSearchCV(clf_xgb, param_grid = param_dist, scoring = 'accuracy',  # create grid-search with 5-fold CV
                   error_score = 0, verbose = 0, n_jobs = -1)

numFolds = 10
folds = KFold(n_splits = numFolds, shuffle = True)

scores = []
for train_index, test_index in folds.split(X):                                                  # for each k
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]                               # create training set T_k
    y_train, y_test = y.iloc[train_index].values.ravel(), y.iloc[test_index].values.ravel()     # and validation set V_k
    clf.fit(X_train, y_train)                                                                   # do grid search CV

    estimator = clf.best_estimator_                                                             # then pick best model
    preds = estimator.predict(X_test)                                                                 # and test its accuracy
    score = accuracy_score(y_test, preds)                                                       # on validation data
    scores.append(score)

np.mean(scores)

0.8736565656565656

In [40]:
np.std(scores)

0.040306873060518396

[{'cv': None,
  'error_score': 0,
  'estimator': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
                colsample_bynode=1, colsample_bytree=1, gamma=0,
                learning_rate=0.1, max_delta_step=0, max_depth=3,
                min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
                nthread=None, objective='multi:softprob', random_state=0,
                reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
                silent=None, subsample=1, verbosity=1),
  'estimator__base_score': 0.5,
  'estimator__booster': 'gbtree',
  'estimator__colsample_bylevel': 1,
  'estimator__colsample_bynode': 1,
  'estimator__colsample_bytree': 1,
  'estimator__gamma': 0,
  'estimator__learning_rate': 0.1,
  'estimator__max_delta_step': 0,
  'estimator__max_depth': 3,
  'estimator__min_child_weight': 1,
  'estimator__missing': None,
  'estimator__n_estimators': 100,
  'estimator__n_jobs': 1,
  'estimator__nthread': None,
  'estimator__o