In [None]:
import pandas as pd
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
import numpy as np

# Carregar dataset

Nessa etapa, carregamos o dataset de treino baixado do kaggle utilizando pandas e analisamos o formato dos dados, numero de instancias e o numero de classes.

In [100]:
data_original = pd.read_csv("train_original.csv", sep=",")
data_original.shape

(878049, 9)

Pelo tamanho do numero de instancias, fica dificil utilizar um classificador como SVM, pois o dataset eh muito grande e o treinamento demoraria muito. Uma boa escolha pode ser o modelo random forest.

In [101]:
data_original.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


Podemos quebrar a data em atributos como dia, mes e ano e utilizar como features no classificador.

In [103]:
data = pd.read_csv("train.csv", sep=",")

In [104]:
data.shape

(878049, 8)

### Dataset modificado com a data transformada em atributos

In [105]:
data.head()

Unnamed: 0,Year,Month,Day,Hour,Category,DayOfWeek,X,Y
0,2015,5,13,23,WARRANTS,Wednesday,-122.425892,37.774599
1,2015,5,13,23,OTHER OFFENSES,Wednesday,-122.425892,37.774599
2,2015,5,13,23,OTHER OFFENSES,Wednesday,-122.424363,37.800414
3,2015,5,13,23,LARCENY/THEFT,Wednesday,-122.426995,37.800873
4,2015,5,13,23,LARCENY/THEFT,Wednesday,-122.438738,37.771541


## Analise do numero de classes

O objetivo para esses dados eh prever a categoria do crime. Analisando o numero de classes dos dados, podemos perceber outro desafio em relacao a esse banco. Sao 39 o numero de classes, e bastante desbalanceadas, o que torna ambos undersampling e oversampling desafiadores.

In [106]:
np.unique(data["Category"])

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [107]:
len(np.unique(data["Category"]))

39

### Categoria de crime por numero de instancias

In [79]:
data.groupby('Category')['Category'].count().sort_values(ascending=True)

Category
TREA                                6
PORNOGRAPHY/OBSCENE MAT            22
GAMBLING                          146
SEX OFFENSES NON FORCIBLE         148
EXTORTION                         256
BRIBERY                           289
BAD CHECKS                        406
FAMILY OFFENSES                   491
SUICIDE                           508
EMBEZZLEMENT                     1166
LOITERING                        1225
ARSON                            1513
LIQUOR LAWS                      1903
RUNAWAY                          1946
DRIVING UNDER THE INFLUENCE      2268
KIDNAPPING                       2341
RECOVERED VEHICLE                3138
DRUNKENNESS                      4280
DISORDERLY CONDUCT               4320
SEX OFFENSES FORCIBLE            4388
STOLEN PROPERTY                  4540
TRESPASS                         7326
PROSTITUTION                     7484
WEAPON LAWS                      8555
SECONDARY CODES                  9985
FORGERY/COUNTERFEITING          10609
FRA

# Predicao categoria de crime

Primeiramente, treinamos random forest apenas nos dados de localizacao (atributos X e Y) por um esquema 80% treino 20% teste, feitos estratificadamente.

In [80]:
X_train, X_test, y_train, y_test = train_test_split(data[["X","Y"]], data["Category"], test_size=0.2, stratify=data['Category'])
scaler = MinMaxScaler()

In [81]:
clf = make_pipeline(MinMaxScaler(), RandomForestClassifier(n_estimators=150,  max_depth=20, n_jobs=4))

In [82]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('randomforestclassifier', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split...n_jobs=4,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

### Log Loss

Como a metrica de desempenho pedido pelo Kaggle eh o log loss, temos que pegar como saida do classificador as probabilidades previstas.

In [83]:
pred = clf.predict_proba(X_test)

In [84]:
pred.shape

(175610, 39)

In [85]:
log_loss(y_test, pred)

2.582384925671992

### Ajustando parametros

Como nao sabemos os melhores parametros para o modelos, iremos ajusta-lo no esquema 3-fold cross validation utilizando busca em grid. Primeiramente utilizaremos tambem apenas os dados das coordenadas X e Y. O pipeline que  implementamos na classe ClassifierPipeline, aplica scaling de 0 a 1 nos dados das coordenadas e uma busca em grid do classificador desejado. Atentar que o output da funcao eh o log loss negado porque a busca em grid sempre maximiza o valor do score por default, portanto valores que devem ser minimizados possuem seus valores negados.

In [4]:
from model_tuning import ClassifierPipeline

Nos eh retornado a melhor combinacao de parametros do grid colocado. O resultado foi um pouco melhor do que o parametro que colocamos na divisao 80/20.

In [6]:
rf_parameters = {
        "n_estimators":[150, 200, 250],
        "max_depth": [15, 20, 30],
        "max_features": ["sqrt", None]
}
pipeline = ClassifierPipeline(RandomForestClassifier(), rf_parameters, n_jobs=2)
pipeline.fit(df[["X","Y"]], df["Category"])


Best score: -2.479
Best parameters set:
	clf__max_depth: 15
	clf__max_features: 'sqrt'
	clf__n_estimators: 250


Eh possivel tambem visualizar os resultados para cada combinacao desejada.

In [7]:
pipeline.get_cv_results()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__max_depth,param_clf__max_features,param_clf__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,132.528298,5.677148,17.942146,0.184381,15,sqrt,150,"{'clf__max_depth': 15, 'clf__max_features': 's...",-2.516932,-2.432867,-2.494367,-2.481389,0.035525,3
1,166.923617,0.921443,23.629863,0.133542,15,sqrt,200,"{'clf__max_depth': 15, 'clf__max_features': 's...",-2.511662,-2.428701,-2.502686,-2.481017,0.037173,2
2,208.159656,0.467584,29.101714,0.254426,15,sqrt,250,"{'clf__max_depth': 15, 'clf__max_features': 's...",-2.508593,-2.429264,-2.500248,-2.479369,0.035593,1
3,161.03331,1.73346,18.139715,0.190154,15,,150,"{'clf__max_depth': 15, 'clf__max_features': No...",-2.611707,-2.528077,-2.58328,-2.574355,0.034721,6
4,215.645164,2.339813,23.571972,0.249849,15,,200,"{'clf__max_depth': 15, 'clf__max_features': No...",-2.59858,-2.522828,-2.572802,-2.564737,0.031447,5
5,266.804282,4.873287,29.088432,0.133918,15,,250,"{'clf__max_depth': 15, 'clf__max_features': No...",-2.594712,-2.516247,-2.564324,-2.558428,0.032303,4
6,129.337336,1.199092,20.441652,0.153489,20,sqrt,150,"{'clf__max_depth': 20, 'clf__max_features': 's...",-2.849239,-2.673548,-2.786783,-2.769858,0.072718,9
7,171.990314,0.294694,26.701926,0.18793,20,sqrt,200,"{'clf__max_depth': 20, 'clf__max_features': 's...",-2.810398,-2.655004,-2.759082,-2.741496,0.064647,8
8,215.552766,0.338737,32.820485,0.363415,20,sqrt,250,"{'clf__max_depth': 20, 'clf__max_features': 's...",-2.79107,-2.635812,-2.722775,-2.716554,0.063537,7
9,167.961983,1.452461,17.634393,2.204374,20,,150,"{'clf__max_depth': 20, 'clf__max_features': No...",-3.106964,-3.010061,-3.081494,-3.066174,0.041017,12


## Utilizando Dia da Semana e Mes
Agora, queremos utilizar tambem os dados temporais. Para isso, temos que transformar as colunas categoricas para numericas, no caso utilizaremos inicialmente dia da semana e mes.

In [87]:
tempdf = data[['Category','DayOfWeek', 'Month', 'X', 'Y']]
tempdf.head()

Unnamed: 0,Category,DayOfWeek,Month,X,Y
0,WARRANTS,Wednesday,5,-122.425892,37.774599
1,OTHER OFFENSES,Wednesday,5,-122.425892,37.774599
2,OTHER OFFENSES,Wednesday,5,-122.424363,37.800414
3,LARCENY/THEFT,Wednesday,5,-122.426995,37.800873
4,LARCENY/THEFT,Wednesday,5,-122.438738,37.771541


Binarizando as features categoricas utilizando uma funcao de pandas.

In [88]:
df = pd.get_dummies(tempdf, columns=['Month', 'DayOfWeek'])
df.head()

Unnamed: 0,Category,X,Y,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,...,Month_10,Month_11,Month_12,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,WARRANTS,-122.425892,37.774599,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,OTHER OFFENSES,-122.425892,37.774599,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,OTHER OFFENSES,-122.424363,37.800414,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,LARCENY/THEFT,-122.426995,37.800873,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,LARCENY/THEFT,-122.438738,37.771541,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


### Ajustando parametros
Fazendo busca em grid para os melhores parametros.

In [None]:
rf_parameters = {
        "n_estimators":[200, 300, 400],
        "max_depth": [5, 10, 15],
        "max_features": ["sqrt"]
}
pipeline = ClassifierPipeline(RandomForestClassifier(), rf_parameters, n_jobs=2)
pipeline.fit(df.drop(columns=["Category"]), df['Category'])