In [1]:
!pip install catboost==0.20.2

Collecting catboost==0.20.2
  Downloading catboost-0.20.2-cp37-none-manylinux1_x86_64.whl (63.9 MB)
[K     |████████████████████████████████| 63.9 MB 7.7 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.20.2


In [2]:
# Importing libraries
import pandas as pd
import numpy as np
import requests
from io import StringIO 
import datetime as dt
from catboost import CatBoostRegressor
import datetime as dt
import re
from fastai.tabular import *
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Loading datasets
# The three CSV inputs are read from the current working directory.
sample_sub, train, cap_cat = (
    pd.read_csv(csv_name)
    for csv_name in ('Sample_sub.csv', 'train.csv', 'CaptureSite_category.csv')
)

# Snapshot of the sample submission taken before later cells add columns to it.
ss = sample_sub.copy()

In [4]:
# function to add date features from the fast ai library
def add_datepart(df, fldname, drop=True):
    """Expand the date column ``fldname`` of ``df`` into calendar feature columns, in place.

    The column is converted to datetime first if it is not one already. New
    columns are named ``<prefix><Feature>``, where ``<prefix>`` is ``fldname``
    with a trailing "date"/"Date" removed, and ``<Feature>`` is one of Year,
    Month, Week, Day, Dayofweek, Dayofyear, Is_month_end, Is_month_start,
    Is_quarter_end, Is_quarter_start, Is_year_end, Is_year_start or Elapsed
    (whole seconds since 1970-01-01).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    fldname : str
        Name of the date column in ``df``.
    drop : bool, default True
        If True, remove ``fldname`` from ``df`` after the features are added.

    Returns
    -------
    None
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also copes with pandas extension dtypes, which
    # np.issubdtype cannot interpret.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)

    # `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0;
    # `isocalendar().week` is its replacement where available.
    if hasattr(fld.dt, 'isocalendar'):
        week = fld.dt.isocalendar().week
        # isocalendar() yields nullable UInt32 values; convert back to the plain
        # numpy dtypes `.dt.week` used to give (int64, or float with NaN for NaT).
        week = week.astype(np.float64) if week.isna().any() else week.astype(np.int64)
    else:
        week = fld.dt.week

    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
              'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start',
              'Is_year_end', 'Is_year_start'):
        df[targ_pre + n] = week if n == 'Week' else getattr(fld.dt, n.lower())

    # Normalise to nanosecond resolution first so the division by 10**9 always
    # yields seconds, whatever resolution the column was stored with.
    df[targ_pre + 'Elapsed'] = fld.astype('datetime64[ns]').astype(np.int64) // 10**9
    if drop:
        df.drop(fldname, axis=1, inplace=True)

In [5]:
# Data wrangling to create training and testing datasets
def _week_of_year(dates):
    """Return the ISO week number for each timestamp in the datetime Series ``dates``."""
    # `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `Series.dt.isocalendar().week` is its replacement where available.
    if hasattr(dates.dt, "isocalendar"):
        week = dates.dt.isocalendar().week
        # isocalendar() yields nullable UInt32 values; convert back to the plain
        # numpy dtypes weekofyear produced (int64, or float with NaN for NaT).
        return week.astype(np.float64) if week.isna().any() else week.astype(np.int64)
    return dates.dt.weekofyear


# Submission IDs are split at their last underscore: the trailing piece is the
# integer year_woy key, everything before it is the capture site name.
sample_sub["year_woy"]=(sample_sub.ID.apply(lambda x: x.split("_")[-1])).astype(int)
sample_sub["CaptureSite"]=sample_sub.ID.apply(lambda x: ("_").join(x.split("_")[0:-1]))

# Create time features from Date_TimeCaught.
# year_woy = calendar year * 100 + ISO week number.
# NOTE(review): the ISO week of a date near a year boundary can belong to the
# neighbouring year (e.g. 2018-12-31 is ISO week 1, giving key 201801) — confirm
# this matches how the competition IDs were built.
train["Date_TimeCaught"]=pd.to_datetime(train["Date_TimeCaught"])
train["year"]=train.Date_TimeCaught.dt.year
train["week_of_year"]=_week_of_year(train.Date_TimeCaught)
train["year_woy"]=train.year*100+train.week_of_year

# Stack the (year_woy, CaptureSite) pairs of the training data and the submission
# file, then take the earliest year_woy seen for each capture site.
keys=pd.concat([train[["year_woy","CaptureSite"]],sample_sub[["year_woy","CaptureSite"]]])
CaptureSite_min_year_woy=keys.groupby("CaptureSite").year_woy.min().rename("year_woy").reset_index()

# Build a calendar with one row per distinct year_woy, from the first training
# date up to 2019/10/31 (the first date of each week key is the one kept).
range_year_woy=pd.DataFrame()
range_year_woy["Date_TimeCaught"]=pd.date_range(start=train.Date_TimeCaught.min(),end='2019/10/31')
range_year_woy["year"]=range_year_woy.Date_TimeCaught.dt.year
range_year_woy["week_of_year"]=_week_of_year(range_year_woy.Date_TimeCaught)
range_year_woy["year_woy"]=range_year_woy.year*100+range_year_woy.week_of_year
range_year_woy=range_year_woy.drop_duplicates(["year_woy"])

# For every site, keep the calendar weeks from that site's first week onwards.
final_data=[]
for site , year_woy in zip(CaptureSite_min_year_woy.CaptureSite.values,CaptureSite_min_year_woy.year_woy.values) :
    # assign() returns a new frame, so nothing is written into a slice of range_year_woy.
    one_site_df=range_year_woy[range_year_woy.year_woy>=year_woy].assign(CaptureSite=site)
    final_data.append(one_site_df)
final_data=pd.concat(final_data)

# Extracting the target variable from the dataset: number of training rows per
# (week, site); weeks without any capture get 0.
Target=train.groupby(["year_woy","CaptureSite"]).CaptureSite.count().rename("Capture_Number").reset_index()
final_data=final_data.merge(Target,on=["year_woy","CaptureSite"],how="left")
# Reassign instead of a chained in-place fillna, which is not guaranteed to
# write back into final_data.
final_data["Capture_Number"]=final_data["Capture_Number"].fillna(0)

# Separating the training set and testing set: years before 2019 vs. 2019.
# .copy() gives independent frames that can safely receive new columns.
train=final_data[final_data.year<2019].copy()
test=final_data[final_data.year==2019].copy()

In [6]:
# Combining test and train to create features efficiently
# Keep the training target aside; it is re-attached to the training rows later.
target = train.Capture_Number

# Flag each row's origin so the combined frame can be split again afterwards.
# assign() returns new frames instead of writing a column into filtered slices.
train = train.assign(separator=0)
test = test.assign(separator=1)

# Keep only the columns that both frames have in common.
train, test = train.align(test, join = 'inner', axis = 1)

# Stack train on top of test and remove the columns not used as features.
train_test = pd.concat([train, test])
train_test = train_test.drop(['year', 'week_of_year', 'Capture_Number'], axis = 1)

In [7]:
# Adding features
# Attach the capture-site metadata and build an "<CaptureSite>_<year_woy>" id per row.
train_test = train_test.merge(cap_cat, on = 'CaptureSite', how = 'left')
train_test['id'] = [
    site + '_' + str(week_key)
    for site, week_key in zip(train_test.CaptureSite, train_test.year_woy)
]

# Calendar features from the notebook's add_datepart (date column kept), then
# cyclic date features.
# NOTE(review): add_cyclic_datepart is not defined in this notebook; presumably it
# comes from the `fastai.tabular` star import — confirm.
add_datepart(train_test, 'Date_TimeCaught', drop=False)
add_cyclic_datepart(train_test, 'Date_TimeCaught')

# Columns to be treated as categorical features
categorical_features = [
    'Date_TimeCaughtMonth',
    'Date_TimeCaughtWeek',
    'Date_TimeCaughtDay',
    'Date_TimeCaughtDayofweek',
    'CaptureSite',
    'CaptureSiteCategory',
    'Type',
]

# Convert all of them to the pandas category dtype in one call
train_test = train_test.astype({col: 'category' for col in categorical_features})

In [8]:
# Separating the training and testing datasets
# The non-inplace drop() returns a fresh frame for each subset, so the separator
# column is removed (and the target added below) without modifying a filtered
# slice of train_test.
train = train_test[train_test.separator == 0].drop('separator', axis = 1)
test = train_test[train_test.separator == 1].drop('separator', axis = 1)

# Adding target variable to check for correlations.
# The values are attached by position, so this relies on the training rows still
# being in the order they had when `target` was extracted.
train['target'] = list(target)

In [10]:
# Training and making predictions
# Multiplier applied to every raw model prediction.
# NOTE(review): the origin of 1.229 is not documented in this notebook — confirm.
PREDICTION_SCALE = 1.229

# Feature matrices: neither the id nor the target may be fed to the model
y = train['target']
X = train.drop(columns=['id', 'target'])
tes = test.drop(columns='id')

# Fit a CatBoost regressor, declaring the categorical columns and a fixed seed.
cat = CatBoostRegressor(
    logging_level='Silent',
    cat_features=categorical_features,
    random_state=101,
)
cat.fit(X, y)

# Scale the predictions and write the submission file
preds = cat.predict(tes) * PREDICTION_SCALE
sub_df = pd.DataFrame({'ID': test.id, 'Captured_Number': preds})
sub_df.to_csv('prodel119.csv', index=False)