# Demo: SF Crimes

### Уменьшение обучающей выборки

In [54]:
import pandas

data = pandas.read_csv('./train.csv')

features_train = None
for category, objects in data.groupby('Category'):
    n = min(50, len(objects))

    rows = data.ix[objects.sample(n = n).index.values]

    if features_train is None:
        features_train = rows
    else:
        features_train = features_train.append(rows)

data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


### Удаление колонок, отсутствующих в тестовой выборке

In [55]:
from sklearn import preprocessing

def remove_test_columns(data):
    data_copy = data.copy()
    data_copy.drop(['Category', 'Descript', 'Resolution'], axis = 1, inplace = True)
    return data_copy


### Метод обучения градиентного бустинга

In [57]:
def search_boost_classifier(x, y):
    kf = cross_validation.StratifiedKFold(y, n_folds = 5, shuffle = True, random_state = 241)

    clf_search = grid_search.GridSearchCV(
        ensemble.GradientBoostingClassifier(),
        { 'n_estimators': [10, 20, 30] },
        cv = kf,
        scoring = 'log_loss',
        verbose = True
    )
    clf_search.fit(x, y)

    print(clf_search.best_estimator_)
    print(clf_search.best_score_)

## Обучение градиентного бустинга на сырых данных

In [56]:
from sklearn import cross_validation
from sklearn import grid_search
from sklearn import ensemble

le_category = preprocessing.LabelEncoder()
Y_train = le_category.fit_transform(features_train.Category.values)
    
X_train = remove_test_columns(features_train)
X_train.Dates = pandas.to_datetime(X_train.Dates).astype(int)
label_encoder = preprocessing.LabelEncoder()
X_train.DayOfWeek = label_encoder.fit_transform(X_train.DayOfWeek)
X_train.PdDistrict = label_encoder.fit_transform(X_train.PdDistrict)
X_train.Address = label_encoder.fit_transform(X_train.Address)

X_train.head()

Unnamed: 0,Dates,DayOfWeek,PdDistrict,Address,X,Y
154926,1366584000000000000,3,9,824,-122.413609,37.784697
831802,1061217600000000000,1,8,285,-122.455679,37.723702
747715,1096506720000000000,4,0,21,-122.40574,37.737417
633486,1148040000000000000,0,6,1196,-122.489539,37.772325
117512,1381962600000000000,6,5,1106,-122.435028,37.777747


In [58]:
search_boost_classifier(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   24.7s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
-3.68588133383


### Разбор дат

In [59]:
def build_dummy_dates(data):
    data_copy = data.copy()
    
    data_copy.Dates = pandas.to_datetime(data_copy.Dates)
    data_copy['Min'] = data_copy.Dates.apply(lambda x: x.minute)
    data_copy['Hour'] = data_copy.Dates.apply(lambda x: x.hour)
    data_copy['Day'] = data_copy.Dates.apply(lambda x: x.day)
    data_copy['Month'] = data_copy.Dates.apply(lambda x: x.month)
    data_copy['Year'] = data_copy.Dates.apply(lambda x: x.year)
    data_copy['WeekNumber'] = data_copy.Dates.apply(lambda x: x.isocalendar()[1])
    data_copy.drop('DayOfWeek', axis = 1, inplace = True)
    data_copy.drop('Dates', axis = 1, inplace = True)
    
    return data_copy

## Обучение градиентного бустинга с разобранными датами

In [61]:
X_train = remove_test_columns(features_train)
X_train = build_dummy_dates(X_train)
X_train.PdDistrict = label_encoder.fit_transform(X_train.PdDistrict)
X_train.Address = label_encoder.fit_transform(X_train.Address)

X_train.head()

Unnamed: 0,PdDistrict,Address,X,Y,Min,Hour,Day,Month,Year,WeekNumber
154926,9,824,-122.413609,37.784697,40,22,21,4,2013,16
831802,8,285,-122.455679,37.723702,40,14,18,8,2003,34
747715,0,21,-122.40574,37.737417,12,1,30,9,2004,40
633486,6,1196,-122.489539,37.772325,0,12,19,5,2006,20
117512,5,1106,-122.435028,37.777747,30,22,16,10,2013,42


In [62]:
search_boost_classifier(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   28.8s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
-3.61384864791


## Выделение признаков дня недели

In [64]:
X_train = pandas.concat([X_train, pandas.get_dummies(features_train.DayOfWeek)], axis = 1);
X_train.head()

Unnamed: 0,PdDistrict,Address,X,Y,Min,Hour,Day,Month,Year,WeekNumber,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday
154926,9,824,-122.413609,37.784697,40,22,21,4,2013,16,0,0,0,1,0,0,0
831802,8,285,-122.455679,37.723702,40,14,18,8,2003,34,0,1,0,0,0,0,0
747715,0,21,-122.40574,37.737417,12,1,30,9,2004,40,0,0,0,0,1,0,0
633486,6,1196,-122.489539,37.772325,0,12,19,5,2006,20,1,0,0,0,0,0,0
117512,5,1106,-122.435028,37.777747,30,22,16,10,2013,42,0,0,0,0,0,0,1


In [65]:
search_boost_classifier(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   34.8s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
-3.60585478249


## Выделение признаков департамента

In [67]:
X_train = pandas.concat([X_train, pandas.get_dummies(features_train.PdDistrict)], axis = 1); 
X_train.drop('PdDistrict', axis = 1, inplace = True)
X_train.head()

Unnamed: 0,Address,X,Y,Min,Hour,Day,Month,Year,WeekNumber,Friday,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
154926,824,-122.413609,37.784697,40,22,21,4,2013,16,0,...,0,0,0,0,0,0,0,0,0,1
831802,285,-122.455679,37.723702,40,14,18,8,2003,34,0,...,0,0,0,0,0,0,0,0,1,0
747715,21,-122.40574,37.737417,12,1,30,9,2004,40,0,...,1,0,0,0,0,0,0,0,0,0
633486,1196,-122.489539,37.772325,0,12,19,5,2006,20,1,...,0,0,0,0,0,0,1,0,0,0
117512,1106,-122.435028,37.777747,30,22,16,10,2013,42,0,...,0,0,0,0,0,1,0,0,0,0


In [68]:
search_boost_classifier(X_train, Y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   44.2s finished


GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=3, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)
-3.61097064667
