In [2]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import sklearn as sk

%config InlineBackend.figure_formats = {'png', 'retina'}

import matplotlib as mpl
import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D

import seaborn as sns
# Apply seaborn's default theme to matplotlib first; the calls below then adjust it.
sns.set()
# Override the axes style with the "whitegrid" preset.
sns.set_style("whitegrid")
# Make matplotlib's single-letter color shorthands ("b", "g", "r", ...) resolve to seaborn palette colors.
sns.set_color_codes()

# Raise pandas' display limits so long/wide frames are rendered without truncation.
pd.options.display.max_rows = 30000
pd.options.display.max_columns = 500
pd.options.display.width = 1000

  from pandas.core import datetools


# Shelter Animal Outcomes
- 유기 동물 보호소의 동물들이 어떤 결과(Adoption, Died, Euthanasia, Return_to_owner, Transfer)를 맞게 될지 예측
- 데이터 크기는 작지만 주제가 흥미롭다. 외부 데이터를 활용해 보거나, Breed, Color 같은 텍스트 데이터를 어떻게 처리할지 고민하는 것이 주요 포인트
- https://www.kaggle.com/c/shelter-animal-outcomes

In [5]:
# Load the shelter-outcomes training CSV.
df_train_animal = pd.read_csv(filepath_or_buffer="./shelter/all/train.csv")
# Report the frame's (rows, columns) before previewing it.
print(df_train_animal.shape)
# Preview the last five rows as the cell's displayed result.
df_train_animal.tail(n=5)

(26729, 10)


Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
26724,A702446,,2015-05-14 11:56:00,Transfer,Partner,Cat,Intact Male,1 month,Domestic Shorthair Mix,Brown Tabby/White
26725,A718934,,2016-01-20 18:59:00,Transfer,SCRP,Cat,Spayed Female,3 months,Domestic Shorthair Mix,Brown Tabby
26726,A698128,Zeus,2015-03-09 13:33:00,Adoption,,Dog,Neutered Male,4 years,Old English Bulldog Mix,White/Tan
26727,A677478,,2014-04-27 12:22:00,Transfer,Partner,Cat,Intact Male,4 weeks,Domestic Shorthair Mix,Black
26728,A706629,,2015-07-02 09:00:00,Transfer,SCRP,Cat,Intact Male,1 year,Domestic Shorthair Mix,Brown Tabby/White


In [7]:
# Load the shelter competition's sample submission file.
df_samplesub_animal = pd.read_csv(filepath_or_buffer="./shelter/all/sample_submission.csv")
# Report the frame's (rows, columns) before previewing it.
print(df_samplesub_animal.shape)
# Preview the last five rows as the cell's displayed result.
df_samplesub_animal.tail(n=5)

(11456, 6)


Unnamed: 0,ID,Adoption,Died,Euthanasia,Return_to_owner,Transfer
11451,11452,1,0,0,0,0
11452,11453,1,0,0,0,0
11453,11454,1,0,0,0,0
11454,11455,1,0,0,0,0
11455,11456,1,0,0,0,0


# Walmart Recruiting: Trip Type Classification
- Here's the decryption key: Work4WalmarT
- 마트 방문(VisitNumber)별로 고객이 구매한 상품 정보(Upc, DepartmentDescription, FinelineNumber 등)를 가지고 해당 방문의 TripType을 예측한다
- https://www.kaggle.com/c/walmart-recruiting-trip-type-classification

In [11]:
# Load the Walmart trip-type training CSV.
df_train_walmart = pd.read_csv(filepath_or_buffer="./walmart/train.csv")
# Report the frame's (rows, columns) before previewing it.
print(df_train_walmart.shape)
# Preview the last five rows as the cell's displayed result.
df_train_walmart.tail(n=5)

(647054, 7)


Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
647049,39,191346,Sunday,32390000000.0,1,PHARMACY OTC,1118.0
647050,39,191346,Sunday,7874205000.0,1,FROZEN FOODS,1752.0
647051,39,191346,Sunday,4072.0,1,PRODUCE,4170.0
647052,8,191347,Sunday,4190008000.0,1,DAIRY,1512.0
647053,8,191347,Sunday,3800060000.0,1,GROCERY DRY GOODS,3600.0


- 각 VisitNumber마다 TripType별 예측 확률(probability)을 제출

In [12]:
# Load the Walmart competition's sample submission file.
df_samplesub_walmart = pd.read_csv(filepath_or_buffer="./walmart/sample_submission.csv")
# Report the frame's (rows, columns) before previewing it.
print(df_samplesub_walmart.shape)
# Preview the last five rows as the cell's displayed result.
df_samplesub_walmart.tail(n=5)

(95674, 39)


Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,TripType_15,TripType_18,TripType_19,TripType_20,TripType_21,TripType_22,TripType_23,TripType_24,TripType_25,TripType_26,TripType_27,TripType_28,TripType_29,TripType_30,TripType_31,TripType_32,TripType_33,TripType_34,TripType_35,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
95669,191338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95670,191339,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95671,191340,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95672,191341,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
95673,191348,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


# Forest Cover Type Prediction
- 미국 콜로라도 북부 Roosevelt National Forest의 30m × 30m 구역별 숲 피복 유형(Cover_Type, 7개 클래스)을 분류하는 문제
- https://www.kaggle.com/c/forest-cover-type-prediction/data

In [18]:
# Load the forest cover-type training CSV.
df_train_forest = pd.read_csv(filepath_or_buffer="./forest/train.csv")
# Report the frame's (rows, columns) before previewing it.
print(df_train_forest.shape)
# Preview the last five rows as the cell's displayed result.
df_train_forest.tail(n=5)

(15120, 56)


Unnamed: 0,Id,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,Wilderness_Area1,Wilderness_Area2,Wilderness_Area3,Wilderness_Area4,Soil_Type1,Soil_Type2,Soil_Type3,Soil_Type4,Soil_Type5,Soil_Type6,Soil_Type7,Soil_Type8,Soil_Type9,Soil_Type10,Soil_Type11,Soil_Type12,Soil_Type13,Soil_Type14,Soil_Type15,Soil_Type16,Soil_Type17,Soil_Type18,Soil_Type19,Soil_Type20,Soil_Type21,Soil_Type22,Soil_Type23,Soil_Type24,Soil_Type25,Soil_Type26,Soil_Type27,Soil_Type28,Soil_Type29,Soil_Type30,Soil_Type31,Soil_Type32,Soil_Type33,Soil_Type34,Soil_Type35,Soil_Type36,Soil_Type37,Soil_Type38,Soil_Type39,Soil_Type40,Cover_Type
15115,15116,2607,243,23,258,7,660,170,251,214,1282,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
15116,15117,2603,121,19,633,195,618,249,221,91,1325,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
15117,15118,2492,134,25,365,117,335,250,220,83,1187,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
15118,15119,2487,167,28,218,101,242,229,237,119,932,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3
15119,15120,2475,197,34,319,78,270,189,244,164,914,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3


In [23]:
# List the distinct target labels, in order of first appearance in the training frame.
df_train_forest.loc[:, "Cover_Type"].unique()

array([5, 2, 1, 7, 3, 6, 4])