In [1]:
import gc
import glob
import time
import os
import json
import matplotlib.pyplot as plt
import pprint
from sklearn.metrics import cohen_kappa_score
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from joblib import Parallel, delayed
import tqdm
from sklearn.metrics import make_scorer
from PIL import Image

from sklearn.linear_model import Ridge 
from sklearn.linear_model import Lasso 

%matplotlib inline


In [2]:
# Show which files are present in the test input directory.
os.listdir('../input/test/')

['sample_submission.csv', 'test.csv']

In [3]:
# Read the training table, the test table and the sample submission from ../input.
train = pd.read_csv('../input/train/train.csv')
test = pd.read_csv('../input/test/test.csv')
sample_submission = pd.read_csv('../input/test/sample_submission.csv')

In [4]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed
0,2,Nibble,3,299,0,1,1,7,0,1,...,1,1,100,41326,8480853f516546f6cf33aa88cd76c379,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2
1,2,No Name Yet,1,265,0,1,1,2,0,2,...,1,1,0,41401,3082c7125d8fb66f7dd4bff4192c8b14,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0
2,1,Brisco,1,307,0,1,2,7,0,2,...,1,1,0,41326,fa90fa5b1ee11c86938398b60abc32cb,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3
3,1,Miko,4,307,0,2,1,2,0,2,...,1,1,150,41401,9238e4f44c71a75282e62f7136c6b240,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2
4,1,Hunter,1,307,0,1,1,0,0,2,...,1,1,0,41326,95481e953f8aed9ec3d16fc4509537e8,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2


In [5]:
# Preview the first rows of the test table.
test.head()

Unnamed: 0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PetID,PhotoAmt
0,1,Puppy,2,307,0,1,1,0,0,2,...,2,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,"Puppy is calm for a young dog, but he becomes ...",378fcc4fc,3.0
1,2,London,24,266,0,1,2,7,0,2,...,1,1,1,0,41326,4475f31553f0170229455e3c5645644f,0,Urgently seeking adoption. Please contact for ...,73c10e136,1.0
2,2,Snowball,20,266,0,2,7,0,0,2,...,1,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,Snowball... doesn't look so good (she is healt...,72000c4c5,1.0
3,2,Malibu,5,266,252,2,1,6,7,2,...,1,1,1,100,41326,4475f31553f0170229455e3c5645644f,0,"Malibu: Female, Local Mix, 4-5 months, vaccina...",e147a4b9f,1.0
4,1,Lala Girl,6,307,0,2,1,2,7,2,...,1,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,LALA! That's my name. I'm a 6 month old girl d...,43fbba852,1.0


In [6]:
# Initial formatting: split the target from the predictors.
y_train = train['AdoptionSpeed']
X_train = train.drop(columns = 'AdoptionSpeed')
# Work on a copy so later cleaning of X_test never mutates the raw `test` frame,
# which is read again when the submission is built.
X_test = test.copy()

In [7]:
# Replace missing descriptions with a single space so the text vectorizer only sees strings.
# The result is assigned back rather than calling fillna(..., inplace=True) on a column
# selection: that chained form acts on a temporary object and leaves the frame unchanged
# when pandas Copy-on-Write is active (it already emits a warning in recent pandas 2.x).
X_train['Description'] = X_train['Description'].fillna(' ')
X_test['Description'] = X_test['Description'].fillna(' ')

In [8]:
# Keep handles on the description text of both splits for the n-gram step.
X_train_desc, X_test_desc = X_train['Description'], X_test['Description']

### Filter unnecessary columns

In [9]:
# Remove the name, identifier and free-text columns from both splits.
X_train, X_test = (
    frame.drop(columns=['Name', 'RescuerID', 'PetID', 'Description'])
    for frame in (X_train, X_test)
)

### Table data to one hot

In [10]:
# Give every string-typed column one categorical dtype shared by train and test, so that
# pd.get_dummies later emits the same dummy columns, in the same order, for both frames.
all_data = pd.concat((X_train, X_test))
# `object` replaces the removed `np.object` alias, and pd.CategoricalDtype replaces the
# removed `astype('category', categories=...)` keyword form.
for column in all_data.select_dtypes(include=[object]).columns:
    # Categories may not contain NaN, so missing values are excluded before unique().
    shared_dtype = pd.CategoricalDtype(categories=all_data[column].dropna().unique())
    X_train[column] = X_train[column].astype(shared_dtype)
    X_test[column] = X_test[column].astype(shared_dtype)

In [11]:
# One-hot encode any categorical/string columns of the training frame; numeric columns pass through.
X_train_encoded = pd.get_dummies(X_train)

In [12]:
# Same encoding for the test frame.
# NOTE(review): column agreement with X_train_encoded relies on both frames sharing dtypes — confirm.
X_test_encoded = pd.get_dummies(X_test)

### Generate word n-grams (unigrams to trigrams) from the descriptions

In [13]:
# Learn the 1- to 3-gram vocabulary from the training descriptions only, then encode
# both splits with that same vocabulary.
vectorizer = CountVectorizer(ngram_range=(1, 3)).fit(X_train_desc)
X_train_description, X_test_description = (
    vectorizer.transform(texts) for texts in (X_train_desc, X_test_desc)
)

In [14]:
# Display the shape and sparsity of the training n-gram count matrix.
X_train_description

<14993x727540 sparse matrix of type '<class 'numpy.int64'>'
	with 2366509 stored elements in Compressed Sparse Row format>

In [15]:
# Append the encoded tabular columns to the right of the n-gram count matrices.
X_train_description_ = hstack((X_train_description, X_train_encoded))
X_test_description_ = hstack((X_test_description, X_test_encoded))
# Column labels in the same order as the stacked matrix: n-gram names first, tabular names last.
# Newer scikit-learn exposes get_feature_names_out() and has removed get_feature_names();
# older releases only have the latter, so use whichever exists.
get_vocabulary_names = (
    getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
)
Selected_columns = [*get_vocabulary_names(), *X_train_encoded.columns]

In [16]:
# Show the last 19 labels of the combined column list.
# NOTE(review): 19 presumably equals the number of encoded tabular columns — confirm.
Selected_columns[len(Selected_columns)-19:]

['Type',
 'Age',
 'Breed1',
 'Breed2',
 'Gender',
 'Color1',
 'Color2',
 'Color3',
 'MaturitySize',
 'FurLength',
 'Vaccinated',
 'Dewormed',
 'Sterilized',
 'Health',
 'Quantity',
 'Fee',
 'State',
 'VideoAmt',
 'PhotoAmt']

### Helper Functions

In [17]:
#https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/quadratic_weighted_kappa.py
def confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    rater_a, rater_b: equal-length sequences (lists or NumPy arrays) of integer ratings.
    min_rating, max_rating: inclusive bounds of the rating scale; when omitted they are
        taken from the smallest / largest rating found in either sequence.

    Returns a square list of lists in which entry [i][j] counts the items that rater_a
    rated ``min_rating + i`` and rater_b rated ``min_rating + j``.
    """
    assert(len(rater_a) == len(rater_b))
    # Take the bounds per rater and then combine them. `rater_a + rater_b` only
    # concatenates for lists; for NumPy arrays it adds element-wise and yields wrong bounds.
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat


def histogram(ratings, min_rating=None, max_rating=None):
    """
    Tally how often each rating value occurs.

    ratings: sequence of integer ratings.
    min_rating, max_rating: inclusive bounds of the rating scale; each defaults to the
        smallest / largest value present in ``ratings``.

    Returns a list whose entry ``k`` is the number of ratings equal to ``min_rating + k``.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating
    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        counts[rating - lowest] += 1
    return counts

def quadratic_weighted_kappa(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Compute the quadratic weighted kappa between two sets of ratings.

    The statistic measures agreement between two raters who assign discrete
    numeric ratings. It is 1 for complete agreement, 0 when the agreement is
    what chance alone would produce, and negative when agreement is worse
    than chance (down to -1 for complete disagreement).

    rater_a, rater_b: equal-length sequences of ratings; both are converted to
        integer NumPy arrays before use.
    min_rating, max_rating: smallest and largest possible rating. When omitted
        they are taken from the data, which assumes the data covers the
        complete range of possible ratings.

    Returns the kappa value as a float.
    """
    ratings_a = np.array(rater_a, dtype=int)
    ratings_b = np.array(rater_b, dtype=int)
    assert len(ratings_a) == len(ratings_b)

    lowest = min(min(ratings_a), min(ratings_b)) if min_rating is None else min_rating
    highest = max(max(ratings_a), max(ratings_b)) if max_rating is None else max_rating

    observed = confusion_matrix(ratings_a, ratings_b, lowest, highest)
    scale_size = len(observed)
    total_items = float(len(ratings_a))

    counts_a = histogram(ratings_a, lowest, highest)
    counts_b = histogram(ratings_b, lowest, highest)

    observed_disagreement = 0.0
    chance_disagreement = 0.0

    for row, count_a in enumerate(counts_a):
        for col, count_b in enumerate(counts_b):
            # Count expected in this cell if the two raters were independent.
            expected = count_a * count_b / total_items
            # Squared rating distance, normalised by the widest possible distance.
            weight = pow(row - col, 2.0) / pow(scale_size - 1, 2.0)
            observed_disagreement += weight * observed[row][col] / total_items
            chance_disagreement += weight * expected / total_items

    return 1.0 - observed_disagreement / chance_disagreement

### Lasso regression for feature selection

In [18]:
# Fit an L1-regularised linear model on the description n-gram counts only (the tabular
# columns are not part of X_train_description). The cells below keep the n-grams whose
# coefficient is non-zero.
lass = Lasso(alpha = 0.001,random_state=42) # alpha=0.001 showed the best performance in a grid search
lass.fit(X_train_description,y_train)

Lasso(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=42,
   selection='cyclic', tol=0.0001, warm_start=False)

In [71]:
# Collect the n-grams that the Lasso fit kept (non-zero coefficient), with their coefficients.
# Newer scikit-learn exposes get_feature_names_out() and has removed get_feature_names();
# older releases only have the latter, so use whichever exists.
vocabulary_names = (
    getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
)()
nonzero_pairs = [
    (coefficient, name)
    for coefficient, name in zip(lass.coef_, vocabulary_names)
    if coefficient != 0
]
coeff_001 = [coefficient for coefficient, _ in nonzero_pairs]
features_001 = [name for _, name in nonzero_pairs]

In [76]:
# Report how many n-grams received a non-zero Lasso coefficient.
print("Number of features: ", len(features_001))

Number of features:  933


In [73]:
# Order the (coefficient, feature) pairs by ascending signed coefficient, then split them:
# list1 holds the sorted coefficients and list2 the matching feature names.
ranked_pairs = sorted(zip(coeff_001, features_001))
list1, list2 = map(list, zip(*ranked_pairs))

In [192]:
# Keep 300 features: half from the start of the coefficient-sorted list (most negative)
# and half from its end (most positive).
take_number_features = 300
half_count = take_number_features // 2
features = list2[:half_count] + list2[len(list2) - half_count:]

In [193]:
# Build dense train/test frames holding only the selected n-gram columns.
# Newer scikit-learn exposes get_feature_names_out() and has removed get_feature_names();
# older releases only have the latter, so use whichever exists.
vocabulary_names = (
    getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
)()
# A set makes the membership test O(1) per vocabulary entry instead of scanning a list.
wanted_features = set(features)
selected_positions = [
    position
    for position, (coefficient, name) in enumerate(zip(lass.coef_, vocabulary_names))
    if coefficient != 0 and name in wanted_features
]
coeff_001 = [lass.coef_[position] for position in selected_positions]
features_001 = [vocabulary_names[position] for position in selected_positions]
# Slice all selected columns from the sparse matrices in one step and densify once,
# rather than inserting columns into a DataFrame one at a time (which fragments it).
df_after_lasso_train = pd.DataFrame(
    X_train_description[:, selected_positions].toarray(), columns=features_001
)
df_after_lasso_test = pd.DataFrame(
    X_test_description[:, selected_positions].toarray(), columns=features_001
)

In [194]:
# Place the selected n-gram columns to the right of the encoded tabular columns.
df_after_lasso_encoded_train = pd.concat(
    [X_train_encoded, df_after_lasso_train],
    axis='columns',
)
df_after_lasso_encoded_test = pd.concat(
    [X_test_encoded, df_after_lasso_test],
    axis='columns',
)

### Ridge regression on all data to score

In [195]:
# Grid-search the Ridge regularisation strength `alpha` with 3-fold cross-validation on the
# tabular columns plus the Lasso-selected n-gram columns.
# NOTE(review): no `scoring` is passed, so the search ranks candidates by the estimator's
# default score rather than by quadratic_weighted_kappa — confirm this is intended.
regr_ridge = GridSearchCV(Ridge(random_state=42), cv=3,param_grid={"alpha": [ 10,1,0.1,5,0.01,100,0.001]})
regr_ridge.fit(df_after_lasso_encoded_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'alpha': [10, 1, 0.1, 5, 0.01, 100, 0.001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [196]:
# Mean cross-validated score of the best alpha.
regr_ridge.best_score_

0.16265371883855637

In [197]:
# The alpha value selected by the grid search.
regr_ridge.best_params_

{'alpha': 10}

In [182]:
# Fit the selected Ridge model on the full training frame.
# NOTE(review): the grid-search repr reports refit=True, so best_estimator_ should already be
# fitted on this same data and this call looks redundant — confirm before removing.
# NOTE(review): this cell's execution count (182) is lower than the grid-search cells'
# (195-197), so the saved outputs may come from an earlier kernel state — re-run top to bottom.
regr_ridge.best_estimator_.fit(df_after_lasso_encoded_train,y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=42, solver='auto', tol=0.001)

In [183]:
# Predict AdoptionSpeed for the test rows with the selected Ridge model. Ridge is a
# regressor, so these are continuous values rather than class labels.
test_predictions  = regr_ridge.best_estimator_.predict(df_after_lasso_encoded_test)

### Submission

In [184]:
# Turn the continuous Ridge outputs into integer AdoptionSpeed labels.
# Round to the nearest integer rather than casting directly: astype(int) truncates toward
# zero, so e.g. 2.9 would be submitted as 2. Then clip to the label range seen in training
# so an extrapolated prediction cannot leave the valid scale.
rounded_predictions = np.clip(
    np.rint(test_predictions), y_train.min(), y_train.max()
).astype(np.int32)
submission = pd.DataFrame({'PetID': test['PetID'].values, 'AdoptionSpeed': rounded_predictions})
submission.to_csv('submission.csv', index=False)
# Kept as the last expression so the preview is actually rendered by the notebook.
submission.head()