In [34]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, cohen_kappa_score, mean_absolute_error, mean_squared_error, make_scorer, accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.preprocessing import normalize
from catboost import CatBoostClassifier, Pool, cv, CatBoostRegressor
from glob import glob

from itertools import chain
from seaborn import heatmap
import ntpath, json

warnings.simplefilter(action='ignore', category=FutureWarning)

In [35]:
class MyPreProcess(BaseEstimator, TransformerMixin):
    """Add breed- and name-derived feature columns to a pet DataFrame.

    ``fit`` records which ``Breed1`` / ``Breed2`` values each make up more
    than ``min_share`` of the fitted rows. ``transform`` returns a copy of its
    input with four extra columns:

    * ``BigBreed1`` / ``BigBreed2`` -- the breed value when it is one of the
      recorded frequent values, otherwise ``-1``;
    * ``IsMixed`` -- ``0`` when ``Breed2`` equals 0, otherwise ``1``;
    * ``NameLen`` -- string length of ``Name`` (missing names stay missing).

    Parameters
    ----------
    min_share : float, default=0.1
        A breed value is kept as "big" only when its normalized frequency in
        the fitted data is strictly greater than this value.
    """

    def __init__(self, min_share=0.1):
        # Stored under the same name as the argument so that BaseEstimator's
        # get_params / set_params / clone can see it.
        self.min_share = min_share

    def _frequent_values(self, column):
        """Return the values of ``column`` whose share exceeds ``min_share``."""
        shares = column.value_counts(normalize=True)
        return list(shares[shares > self.min_share].index)

    def fit(self, X, y=None):
        """Learn the frequent ``Breed1`` / ``Breed2`` values from ``X``.

        Parameters
        ----------
        X : DataFrame with ``Breed1`` and ``Breed2`` columns.
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self, so that ``fit(...).transform(...)``, ``fit_transform`` and
        pipelines work.
        """
        self.biggest_breeds_1 = self._frequent_values(X.Breed1)
        self.biggest_breeds_2 = self._frequent_values(X.Breed2)
        return self

    def _classify_breed_1(self, breed):
        """Return ``breed`` if it is a frequent ``Breed1`` value, else ``-1``."""
        if breed in self.biggest_breeds_1:
            return breed
        return -1

    def _classify_breed_2(self, breed):
        """Return ``breed`` if it is a frequent ``Breed2`` value, else ``-1``."""
        if breed in self.biggest_breeds_2:
            return breed
        return -1

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the engineered columns appended.

        Requires a prior call to ``fit``. ``X`` must provide the ``Breed1``,
        ``Breed2`` and ``Name`` columns; the input frame is not modified.
        """
        X_copy = X.copy()
        X_copy["BigBreed1"] = X_copy.Breed1.apply(self._classify_breed_1)
        X_copy["BigBreed2"] = X_copy.Breed2.apply(self._classify_breed_2)
        # Any Breed2 value other than 0 is flagged as mixed.
        X_copy["IsMixed"] = X_copy.Breed2.map(lambda x: 0 if x == 0 else 1)
        X_copy["NameLen"] = X_copy.Name.str.len()
        return X_copy

In [36]:
def get_pet_id(path):
    """Return the pet id encoded in a sentiment file path.

    The id is the file's base name with everything from the last ``.json``
    onwards removed. ``ntpath.basename`` is used so that both ``/`` and ``\\``
    are treated as directory separators.

    Parameters
    ----------
    path : str
        Path (or bare file name) of a sentiment JSON file.

    Returns
    -------
    str
        The base name up to the last ``.json``; the whole base name when it
        does not contain ``.json``.
    """
    file_name = ntpath.basename(path)
    end = file_name.rfind('.json')
    # rfind returns -1 when '.json' is absent; slicing with -1 would silently
    # drop the last character of the name, so return it untouched instead.
    if end == -1:
        return file_name
    return file_name[:end]

In [37]:
import os.path
def get_sentiment_df(folder):
    """Collect document-level sentiment from every ``*.json`` file in ``folder``.

    Each file is parsed as JSON and its ``documentSentiment`` entry supplies
    the ``magnitude`` and ``score`` values; the pet id is derived from the
    file name via ``get_pet_id``.

    Parameters
    ----------
    folder : str
        Directory that holds the sentiment JSON files.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``PetID``, ``SentimentMagnitude``
        and ``SentimentScore``. The frame is empty (but still has these
        columns) when no file matches.

    Raises
    ------
    KeyError
        If a file lacks ``documentSentiment`` or its ``magnitude`` / ``score``.
    """
    columns = ["PetID", "SentimentMagnitude", "SentimentScore"]
    pattern = os.path.join(folder, '*.json')
    records = []
    # glob returns paths in arbitrary order; sorting keeps the output stable
    # from one run to the next.
    for file_name in sorted(glob(pattern)):
        with open(file_name, encoding="utf8", errors="ignore") as json_file:
            sentiment = json.load(json_file)["documentSentiment"]
        records.append((get_pet_id(file_name), sentiment["magnitude"], sentiment["score"]))
    # Build the frame once instead of enlarging it row by row with df.loc[i],
    # which re-allocates on every insertion.
    return pd.DataFrame(records, columns=columns)

In [38]:
# Load the train and test tables from their CSV files.
train, test = (
    pd.read_csv(csv_path) for csv_path in ("train/train.csv", "test/test.csv")
)

In [39]:
# Prefer the cached sentiment CSVs; if either one cannot be read, rebuild both
# frames from the raw JSON folders and write the caches for the next run.
try:
    train_sentiment_df = pd.read_csv("train_sentiment.csv")
    test_sentiment_df = pd.read_csv("test_sentiment.csv")
except OSError:  # IOError is an alias of OSError on Python 3
    train_sentiment_df, test_sentiment_df = (
        get_sentiment_df(folder) for folder in ("train_sentiment", "test_sentiment")
    )
    for sentiment_frame, cache_path in (
        (train_sentiment_df, "train_sentiment.csv"),
        (test_sentiment_df, "test_sentiment.csv"),
    ):
        sentiment_frame.to_csv(cache_path, index=False)

In [40]:
# Left-join the sentiment features onto each table. No `on=` is given, so
# pandas joins on the columns the two frames have in common; rows without a
# sentiment match are kept with missing values.
train = train.merge(train_sentiment_df, how="left")
test = test.merge(test_sentiment_df, how="left")

In [41]:
# Separate the target column from the predictor columns.
y = train["AdoptionSpeed"]
X = train.drop(columns=["AdoptionSpeed"])

In [67]:
# Seeded hold-out split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [69]:
# Names of the object-dtype columns; they are dropped before modelling.
cat_features = X_train.select_dtypes(include="object").columns

In [70]:
# Learn the frequent breeds on the training split only, then add the derived
# columns to both splits and drop the object-dtype columns.
mpp = MyPreProcess()
mpp.fit(X_train)
X_train_copy, X_test_copy = (
    mpp.transform(split).drop(columns=cat_features) for split in (X_train, X_test)
)

In [71]:
# Impute missing sentiment values with SimpleImputer's default strategy,
# fitted on the training split only, then round to one decimal place.
sentiment_cols = ["SentimentMagnitude", "SentimentScore"]
si = SimpleImputer()
si.fit(X_train_copy[sentiment_cols])
X_train_copy[sentiment_cols] = si.transform(X_train_copy[sentiment_cols]).round(1)
X_test_copy[sentiment_cols] = si.transform(X_test_copy[sentiment_cols]).round(1)

In [72]:
# Replace every remaining missing value with 0 in both splits.
X_train_copy = X_train_copy.fillna(0)
X_test_copy = X_test_copy.fillna(0)

In [73]:
# Seed the estimator (same seed as the train/test split) so that repeated
# runs of the notebook fit the same model.
gbc = GradientBoostingClassifier(random_state=1)

In [74]:
# Fit on the training split; the result is bound to `_` so the cell prints nothing.
_ = gbc.fit(X=X_train_copy, y=y_train)

In [75]:
# Predicted class labels for the held-out split.
predictions = gbc.predict(X=X_test_copy)

In [87]:
# Columns to export: the two sentiment columns plus the four added by MyPreProcess.
new_columns = [
    'SentimentMagnitude',
    'SentimentScore',
    'BigBreed1',
    'BigBreed2',
    'IsMixed',
    'NameLen',
]

In [102]:
# Stack both splits, sort by index, and keep only the engineered columns.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
new_features_df = pd.concat([X_train_copy, X_test_copy]).sort_index()[new_columns]

In [104]:
# Re-attach the pet identifier; pandas aligns it to the rows by index label.
new_features_df = new_features_df.assign(PetID=train["PetID"])

In [109]:
# Write the engineered features to disk without the index column.
new_features_df.to_csv(path_or_buf="new_features.csv", index=False)