In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, mean_absolute_error, mean_squared_error, make_scorer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

from itertools import chain
from seaborn import heatmap

warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
class SmallCategoriesEraser(BaseEstimator):
    def __init__(self):
        return
        
    def fit( self, X, y = None  ):
        self.biggest_breeds = list(X.Breed1.value_counts(normalize=True)[X.Breed1.value_counts(normalize=True)>0.1].index)
        return self
        
    def _classify_breed(self, breed):
        if breed in self.biggest_breeds:
            return breed
        return -1
        
    def transform(self, X , y = None ):
        X_copy = X.copy()
        X_copy.Breed1 = X_copy.Breed1.apply(self._classify_breed)
        return X_copy

In [8]:
def get_score(pipeline, types, score = cohen_kappa_score):
    pipeline.fit(X_train[X_train.Type.isin(types)], y_train[X_train.Type.isin(types)])
    y_pred = pipeline.predict(X_test[X_test.Type.isin(types)])
    return round(score(y_pred, y_test[X_test.Type.isin(types)]),3)

In [9]:
def try_models(*steps):
    models = [
        KNeighborsClassifier(),
        DecisionTreeClassifier(random_state=1),
        RandomForestClassifier(random_state=1),
        AdaBoostClassifier(random_state=1),
        GradientBoostingClassifier(random_state=1)
        ]
    for model in models:
        print("<<<<----------------------------------------------------------------------------->>>>")
        print(type(model))
        pipeline = make_pipeline(*steps + (model,))
        print("Model quadratic weighted kappa score for dogs:", get_score(pipeline, [1]))
        print("Model quadratic weighted kappa score for cats:", get_score(pipeline, [2]))

In [10]:
def get_pet_id(path):
    file_name = ntpath.basename(path)
    end = file_name.rfind('.json')
    return file_name[:end]

In [11]:
import os.path
def get_sentiment_df(folder):
    df = pd.DataFrame(columns = ["PetID", "SentimentMagnitude", "SentimentScore"])
    i=0
    pattern = os.path.join(folder, '*.json')
    for file_name in glob(pattern):
        with open(file_name, encoding="utf8", errors="ignore") as json_file:
            data = json.load(json_file)
            sentiment = data["documentSentiment"]
            df.loc[i] = [get_pet_id(file_name), sentiment["magnitude"], sentiment["score"]]
            i+=1
    return df

In [12]:
train = pd.read_csv("train/train.csv")
test = pd.read_csv("test/test.csv")

In [13]:
try:
    train_sentiment_df = pd.read_csv("train_sentiment.csv")
    test_sentiment_df = pd.read_csv("test_sentiment.csv")
except IOError:
    train_sentiment_df = get_sentiment_df('train_sentiment')
    test_sentiment_df = get_sentiment_df('train_sentiment')
    train_sentiment_df.to_csv("train_sentiment.csv", index=False)
    test_sentiment_df.to_csv("test_sentiment.csv", index=False)

In [14]:
train = pd.merge(train, train_sentiment_df, how="left")
test = pd.merge(test, train_sentiment_df, how="left")

In [15]:
X = train.drop(columns="AdoptionSpeed")
y = train.AdoptionSpeed

categorical_features = list(X.select_dtypes("object").columns)
numerical_features = [col for col in X if col not in categorical_features]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [16]:
numerical_features_with_categoric_meaning = ["Breed1", "Breed2", "Gender", "Color1", "Color2", "Color3", "Vaccinated", "Dewormed", "Sterilized","State"] 

In [17]:
column_transformer = make_column_transformer((SimpleImputer(), [col for col in numerical_features if col not in numerical_features_with_categoric_meaning]),
                                              (OneHotEncoder(handle_unknown="ignore"), numerical_features_with_categoric_meaning))         

In [18]:
steps = [SmallCategoriesEraser(), column_transformer]

In [19]:
try_models(*steps)

<<<<----------------------------------------------------------------------------->>>>
<class 'sklearn.neighbors._classification.KNeighborsClassifier'>
Model quadratic weighted kappa score for dogs: 0.156
Model quadratic weighted kappa score for cats: 0.084
<<<<----------------------------------------------------------------------------->>>>
<class 'sklearn.tree._classes.DecisionTreeClassifier'>
Model quadratic weighted kappa score for dogs: 0.14
Model quadratic weighted kappa score for cats: 0.096
<<<<----------------------------------------------------------------------------->>>>
<class 'sklearn.ensemble._forest.RandomForestClassifier'>
Model quadratic weighted kappa score for dogs: 0.237
Model quadratic weighted kappa score for cats: 0.152
<<<<----------------------------------------------------------------------------->>>>
<class 'sklearn.ensemble._weight_boosting.AdaBoostClassifier'>
Model quadratic weighted kappa score for dogs: 0.163
Model quadratic weighted kappa score for cats

We see that the performance for each case is pretty different, among all models. That gives us the intuition that we should not apply the same model for both. Random Forest and GradientBoosting give so far the best results for both.

In [18]:
pipeline = make_pipeline(SmallCategoriesEraser(), column_transformer, RandomForestClassifier())

In [19]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 40, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5]
min_samples_leaf = [1, 2]
bootstrap = [True, False]
param_grid = {'randomforestclassifier__n_estimators': n_estimators,
               'randomforestclassifier__max_features': max_features,
               'randomforestclassifier__max_depth': max_depth,
               'randomforestclassifier__min_samples_split': min_samples_split,
               'randomforestclassifier__min_samples_leaf': min_samples_leaf,
               'randomforestclassifier__bootstrap': bootstrap}

In [20]:
kappa_scorer = make_scorer(cohen_kappa_score)

In [21]:
rf_random = RandomizedSearchCV(estimator = pipeline, param_distributions = param_grid, n_iter = 5, cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring=kappa_scorer)

In [22]:
search1 = rf_random.fit(X_train[X_train.Type==1], y_train[X_train.Type==1])
search2 = rf_random.fit(X_train[X_train.Type==2], y_train[X_train.Type==2])

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.4min finished


Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  1.2min finished


The best params are equal between the cases:

In [23]:
search1.best_params_, search2.best_params_

({'randomforestclassifier__n_estimators': 1200,
  'randomforestclassifier__min_samples_split': 5,
  'randomforestclassifier__min_samples_leaf': 2,
  'randomforestclassifier__max_features': 'auto',
  'randomforestclassifier__max_depth': None,
  'randomforestclassifier__bootstrap': True},
 {'randomforestclassifier__n_estimators': 1200,
  'randomforestclassifier__min_samples_split': 5,
  'randomforestclassifier__min_samples_leaf': 2,
  'randomforestclassifier__max_features': 'auto',
  'randomforestclassifier__max_depth': None,
  'randomforestclassifier__bootstrap': True})

In [26]:
get_score(search1.best_estimator_, [1,2])

0.196