In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats as stats

In [2]:
import warnings

# Notebook-wide matplotlib defaults: wide figures, monospace text and a
# thick black axes frame.
plt.rcParams.update({
    'figure.figsize': (14, 5),
    "font.family": "monospace",
    'axes.edgecolor': 'black',
    'figure.frameon': True,
    'axes.linewidth': 1.5,
})

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

In [3]:
# Read the training data and discard the "id" column before any analysis.
df = pd.read_csv("train.csv")
df = df.drop(columns=["id"])
df.head()

Unnamed: 0,bone_length,rotting_flesh,hair_length,has_soul,color,type
0,0.354512,0.350839,0.465761,0.781142,clear,Ghoul
1,0.57556,0.425868,0.531401,0.439899,green,Goblin
2,0.467875,0.35433,0.811616,0.791225,black,Ghoul
3,0.776652,0.508723,0.636766,0.884464,black,Ghoul
4,0.566117,0.875862,0.418594,0.636438,green,Ghost


In [4]:
# Report the number of rows and columns of the training frame.
print(f"The dataset has {df.shape[0]} rows and {df.shape[1]} columns.")

The dataset has 371 rows and 6 columns.


In [5]:
# Column names, non-null counts and dtypes of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371 entries, 0 to 370
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   bone_length    371 non-null    float64
 1   rotting_flesh  371 non-null    float64
 2   hair_length    371 non-null    float64
 3   has_soul       371 non-null    float64
 4   color          371 non-null    object 
 5   type           371 non-null    object 
dtypes: float64(4), object(2)
memory usage: 17.5+ KB


In [6]:
# Store "color" as a pandas categorical instead of a plain object column.
df["color"] = df["color"].astype("category", copy=False)

In [7]:
# Frequency of each color level, most common first.
df.color.value_counts()

white    137
clear    120
green     42
black     41
blue      19
blood     12
Name: color, dtype: int64

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# sns.pairplot(df, hue='type');

In [None]:
# Empirical CDF of each of the first four (numeric) columns, one panel per
# feature, with one curve per monster type.
fig, axes = plt.subplots(1, 4)
for ax, feature in zip(axes, df.columns[:4]):
    sns.ecdfplot(x=feature, data=df, hue="type", ax=ax)

# Figure-level decoration only needs to be applied once, after all panels exist.
fig.suptitle("ECDF Plots")
sns.despine(fig=fig)

# 1. $\textbf{ANOVA on Various Features.}$

The ANOVA test has important assumptions that must be satisfied in order for the associated p-value to be valid.

1. The samples are independent.
2. Each sample is from a normally distributed population.
3. The population standard deviations of the groups are all equal. This property is known as homoscedasticity.

#### 1.2 $\textbf{Check Normality on Each Monster For All Features.}$

$H_0: X \sim \mathcal{N}(\mu,\,\sigma^{2})$

In [None]:
# Normality test (scipy.stats.normaltest) for every (monster type, feature)
# pair. Nested loops are needed: zipping the two sequences would pair each
# monster with a single feature and silently skip most combinations.
for monster in df.type.unique():
    for feat in df.columns[:4]:

        # Select a 1-D Series so that normaltest returns a scalar p-value
        # rather than a one-element array.
        sample = df.loc[df["type"] == monster, feat]

        print(f"Normality of {monster} on {feat} Feature.")

        _, p_val = stats.normaltest(sample)
        if p_val < .05:
            print("P:Value:", p_val, "- Reject Null Hypotesis")
        else:
            print("P:Value:", p_val, "- Cannot Reject Null Hypotesis")

        print("\n")

#### 1.3 Check Equal Variances on Each Monster For All Features.
We can check for homoscedasticity using Bartlett's test instead of Levene's test because the samples show no significant deviations from normality.

$H_0:$ $\sigma_1$ = $\sigma_2$ = $\sigma_3$

In [None]:
# Bartlett's test for equal variances across the three monster types,
# run separately on each of the first four columns.
for feature in df.columns[:4]:

    print(f"Bartlett's Test on {feature} Feature.")

    # One sample per monster type, in the order Ghoul, Goblin, Ghost.
    samples = [df.loc[df["type"] == monster, feature]
               for monster in ("Ghoul", "Goblin", "Ghost")]

    p_val = stats.bartlett(*samples)[1]
    verdict = "- Reject Null Hypotesis" if p_val < .05 else "- Cannot Reject Null Hypotesis"
    print("P:Value:", p_val, verdict)

    # Sample variances (ddof=1) printed next to the test for comparison.
    print("Variance:", [np.round(np.var(sample, ddof=1), 4) for sample in samples])
    print("\n")

Note: The variance of Ghoul on Rotting Flesh is about 1.5 times smaller than that of the other monsters.

#### 1.4 ANOVA Test.
$H_0:$ $\mu_1$ = $\mu_2$ = $\mu_3$

In [None]:
# One-way ANOVA per feature: do the three monster types share the same mean?
for feature in df.columns[:4]:

    # The header previously contained a stray "f" in front of the column name.
    print(f"ANOVA on {feature} Feature.")

    # 1-D Series per group so that f_oneway returns a scalar p-value
    # rather than a one-element array.
    ghoul = df.loc[df["type"] == "Ghoul", feature]
    goblin = df.loc[df["type"] == "Goblin", feature]
    ghost = df.loc[df["type"] == "Ghost", feature]

    _, p_val = stats.f_oneway(ghoul, goblin, ghost)
    if p_val < .05:
        print(p_val, ": Reject Null Hypotesis")
    else:
        print(p_val, ": Cannot Reject Null Hypotesis")

    print("\n")

# Classification Task

In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures, LabelEncoder, KBinsDiscretizer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.calibration import CalibratedClassifierCV

In [10]:
# Target is the monster type; every other column is a predictor.
y = df["type"]
X = df.drop(columns="type")

In [11]:
# Encoder for the target labels; also used later to map predictions back
# to the original labels via inverse_transform.
lb = LabelEncoder()

In [12]:
# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    train_size=0.85,
    stratify=y,
    random_state=0,
)

In [13]:
# Fit the label mapping on the training labels only, then reuse it for the
# hold-out labels.
# NOTE: this cell overwrites y_train/y_test in place, so re-running it would
# re-fit the encoder on the already encoded values; run it once per split.
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)

In [14]:
# Predictor columns that are not categorical, in their original order.
num_cols = []
for col in X.columns:
    if df[col].dtype == "category":
        continue
    num_cols.append(col)
num_cols

['bone_length', 'rotting_flesh', 'hair_length', 'has_soul']

In [16]:
# Numeric branch: polynomial feature expansion followed by standardisation.
num_pipe = make_pipeline(PolynomialFeatures(), StandardScaler())

# Full preprocessor: numeric branch on num_cols, dense one-hot encoding of "color".
col_t = make_column_transformer(
    (num_pipe, num_cols),
    (OneHotEncoder(sparse=False), ["color"]),
)

In [41]:
def preprocessing(*args):
    """Build the preprocessing column transformer.

    The columns listed in the notebook-level ``num_cols`` go through
    ``PolynomialFeatures`` and ``StandardScaler`` followed by any extra
    transformers given in ``args``; the ``color`` column is one-hot encoded.

    Parameters
    ----------
    *args : transformer instances
        Optional extra steps appended, in order, to the numeric pipeline.

    Returns
    -------
    ColumnTransformer
        The unfitted transformer, ready to be placed in front of a classifier.
    """
    # Unpack the extra steps: passing the tuple itself would register it as a
    # single (invalid) pipeline step, even when no extra step is given.
    num_pipe = make_pipeline(PolynomialFeatures(),
                             StandardScaler(),
                             *args)

    col_t = make_column_transformer(
        (num_pipe, num_cols),
        (OneHotEncoder(sparse=False), ["color"]))

    return col_t

In [21]:
def model_fit(pipe, param_grid):
    """Grid-search ``pipe`` over ``param_grid`` and print the best result.

    Runs a 5-fold, accuracy-scored ``GridSearchCV`` on the notebook-level
    ``X_train`` / ``y_train`` using all available cores. Errors raised while
    fitting a candidate are propagated (``error_score="raise"``).

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator) to tune.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV``.

    Returns
    -------
    None
        The best parameters and best score are printed, not returned.
    """
    search = GridSearchCV(
        pipe,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        error_score="raise",
    )
    search.fit(X_train, y_train)

    best_score = np.abs(search.best_score_)
    print("Best Params:", search.best_params_)
    print("Best Score:", best_score)

In [19]:
# Regularisation grid: 15 evenly spaced values of C between 0.001 and 1,
# tried with both multi-class strategies.
C = np.linspace(.001, 1, 15)
param_grid = {
    "logisticregression__C": C,
    "logisticregression__multi_class": ["multinomial", "ovr"],
}

# Logistic regression without an intercept, placed behind the current preprocessor.
clf = LogisticRegression(fit_intercept=False, n_jobs=-1, random_state=0)
f_pipe = make_pipeline(col_t, clf)

## Classes are .33 each one so no need to adjust weights.

# The search is left disabled; the results of a previous run are recorded below.
#model_fit(pipe = f_pipe, param_grid=param_grid)

# Best Params: {'logisticregression__C': 0.6432142857142857, 
#               'logisticregression__multi_class': 'multinomial'}
    
# Best Score: 0.7682539682539683

In [25]:
# Numeric branch: polynomial expansion -> PCA (n_components=.9) ->
# standardisation -> binning with dense one-hot output.
num_pipe = make_pipeline(
    PolynomialFeatures(),
    PCA(n_components=.9),
    StandardScaler(),
    KBinsDiscretizer(encode="onehot-dense"),
)

col_t = make_column_transformer(
    (num_pipe, num_cols),
    (OneHotEncoder(sparse=False), ["color"]),
)

# Reuses `clf` and `C` from the previous logistic-regression cell.
f_pipe = make_pipeline(col_t, clf)

# Same logistic-regression grid as before, plus the number of bins (4 to 9).
param_grid = {
    "logisticregression__C": C,
    "logisticregression__multi_class": ["multinomial", "ovr"],
    "columntransformer__pipeline__kbinsdiscretizer__n_bins": np.arange(4, 10),
}

# The search is left disabled; the results of a previous run are recorded below.
#model_fit(pipe=f_pipe, param_grid=param_grid)

# Best Params: {'columntransformer__pipeline__kbinsdiscretizer__n_bins': 9, 
#               'logisticregression__C': 0.9286428571428571, 
#               'logisticregression__multi_class': 'ovr'}
    
# Best Score: 0.7396825396825396

In [None]:
# Cross-validate linear discriminant analysis behind the current preprocessor.
clf = LinearDiscriminantAnalysis()

# Rebuild the pipeline: without this, `f_pipe` still wraps the classifier of
# the previous cell and the LDA model above is never evaluated.
f_pipe = make_pipeline(col_t, clf)

results = cross_val_score(f_pipe, 
                          X_train, 
                          y_train, 
                          scoring='accuracy', 
                          cv=5, 
                          n_jobs=-1,
                          error_score="raise")

# Previously recorded value: 0.7365079365079364. It was obtained before the
# pipeline was rebuilt with LDA, so re-run this cell to refresh it.
print("Mean Accuracy Score:", np.mean(results))

In [27]:
# Numeric branch for the forest: polynomial expansion -> PCA (n_components=.9)
# -> standardisation; "color" is one-hot encoded.
num_pipe = make_pipeline(
    PolynomialFeatures(),
    PCA(n_components=.9),
    StandardScaler(),
)

col_t = make_column_transformer(
    (num_pipe, num_cols),
    (OneHotEncoder(sparse=False), ["color"]),
)

clf = RandomForestClassifier(random_state=0, n_jobs=-1, warm_start=True)
f_pipe = make_pipeline(col_t, clf)

# Tree depth, forest size, bootstrap sample fraction and leaf size to search over.
param_grid = {
    "randomforestclassifier__max_depth": np.arange(3, 7),
    "randomforestclassifier__n_estimators": np.arange(100, 400, 100),
    "randomforestclassifier__max_samples": np.linspace(.5, 1, 3),
    "randomforestclassifier__min_samples_leaf": np.arange(1, 5),
}

model_fit(pipe=f_pipe, param_grid=param_grid)

# Results recorded from an earlier run (the saved output of this cell may differ):
# Best Params: {'randomforestclassifier__max_depth': 4, 
#               'randomforestclassifier__max_samples': 0.75, 
#               'randomforestclassifier__min_samples_leaf': 2, 
#               'randomforestclassifier__n_estimators': 200}
    
# Best Score: 0.7619047619047619

Best Params: {'randomforestclassifier__max_depth': 5, 'randomforestclassifier__max_samples': 0.5, 'randomforestclassifier__min_samples_leaf': 4, 'randomforestclassifier__n_estimators': 100}
Best Score: 0.7333333333333334


In [None]:
# Random forest with hand-picked hyper-parameters, wrapped in probability
# calibration.
rf = RandomForestClassifier(random_state=0, 
                  n_jobs=-1, 
                  max_depth=4, 
                  max_samples=.75, 
                  min_samples_leaf=2, 
                  n_estimators=200, 
                  max_features=None)

clf = CalibratedClassifierCV(rf, n_jobs=-1)

# Rebuild the pipeline: without this, `f_pipe` still wraps the classifier of
# an earlier cell and the calibrated forest above is never evaluated.
f_pipe = make_pipeline(col_t, clf)

results = cross_val_score(f_pipe, 
                          X_train, 
                          y_train, 
                          scoring='accuracy', 
                          cv=5, 
                          n_jobs=-1,
                          error_score="raise")

# Previously recorded value: 0.7746031746031745. It was obtained before the
# pipeline was rebuilt with the calibrated forest, so re-run this cell to
# refresh it.
print("Mean Accuracy Score:", np.mean(results))

In [None]:
# Final model: a stack of k-NN and a calibrated random forest, combined by a
# logistic-regression meta-learner.
logreg = LogisticRegression(C=.64,
                            fit_intercept=False, 
                            n_jobs=-1, 
                            random_state=0)

knn = KNeighborsClassifier(n_jobs=-1, n_neighbors=4)

rf = RandomForestClassifier(random_state=0, 
                  n_jobs=-1, 
                  max_depth=4, 
                  max_samples=.75, 
                  min_samples_leaf=2, 
                  n_estimators=200, 
                  max_features=None)

ccf = CalibratedClassifierCV(rf, n_jobs=-1)

estimators = [("knn", knn), ("ccf", ccf)]

clf = StackingClassifier(estimators, final_estimator=logreg, n_jobs=-1)

# Rebuild the pipeline around the stacking classifier: without this, `f_pipe`
# still wraps the classifier of an earlier cell and the stack is never fitted
# or used for the predictions below.
f_pipe = make_pipeline(col_t, clf)

f_pipe.fit(X_train, y_train)

In [None]:
# Read the test set and move its "id" column into `id_`, leaving only the
# predictor columns in `test`.
test = pd.read_csv("test.csv")
id_ = test.pop("id")

In [None]:
# Predict the encoded classes for the test set and map them back to the
# original labels.
y_pred = lb.inverse_transform(f_pipe.predict(test))

In [None]:
# submission = pd.DataFrame({"type":y_pred}, index=id_)
# submission.to_csv('submission.csv')