## Loading data

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline

In [2]:
# Read the training set; the first CSV column is used as the row index.
df = pd.read_csv("train1.csv", index_col=0)

# Preview the first six rows.
df.head(6)

Unnamed: 0,officer_id,officer_initials,race,sex,investigative_findings,cases_handled,complaint_summary,target
1,17BR0001,CV,White,Male,Other than Sustained Finding,25,Cn hf treated unprofessionally unknown th dist...,malfeasance
2,17BR0002,RB,Black,Female,Sustained Finding,18,Dh bm receive proper service officers kb ma ac...,lack of service
3,17BR0003,AD,White,Male,Other than Sustained Finding,7,Ak bm improperly stopped three unidentified th...,malfeasance
4,17BR0004,TS,Black,Female,Sustained Finding,3,Eg stated receive proper service officers ts c...,lack of service
5,17BR0005,DV,Black,Male,Other than Sustained Finding,35,Iv bf exhusband detective dv physically abused...,malfeasance
6,17BR0007,RB,Black,Male,Other than Sustained Finding,9,Ms behalf ls bm sgt jm follow proper police pr...,departmental violations


## Training data missing value treatment 

In [3]:
# Replace the "UNK" placeholder in each column with a fixed stand-in value.
for column, stand_in in (('cases_handled', 1), ('race', "White"), ('sex', "Male")):
    unk_rows = df.index[df[column] == "UNK"]
    df.loc[unk_rows, column] = stand_in

# Once the placeholders are replaced, convert 'cases_handled' with pd.to_numeric.
df['cases_handled'] = pd.to_numeric(df['cases_handled'])

In [4]:
# Identifier columns and the label are left out of the feature frame.
non_feature_columns = ['officer_id', 'officer_initials', 'target']
X_train = df.drop(non_feature_columns, axis='columns')

# The label column on its own is the training target.
y_train = df.loc[:, 'target']

## Testing data missing value treatment

In [5]:
# Read the test set; the first CSV column is used as the row index.
test = pd.read_csv("test1.csv", index_col=0)

In [6]:
# Replace the "UNK" placeholder in each column with a fixed stand-in value.
for column, stand_in in (('cases_handled', 1), ('race', "White"), ('sex', "Male")):
    unk_rows = test.index[test[column] == "UNK"]
    test.loc[unk_rows, column] = stand_in

# NOTE(review): hard-coded override for the row labelled 45; the reason is not
# shown in this notebook. Presumably that row holds a value pd.to_numeric
# cannot parse. Confirm against test1.csv.
test.loc[45, 'cases_handled'] = 5

# Once the placeholders are replaced, convert 'cases_handled' with pd.to_numeric.
test['cases_handled'] = pd.to_numeric(test['cases_handled'])

In [7]:
# Identifier columns are left out of the test feature frame.
X_test = test.drop(['officer_id', 'officer_initials'], axis='columns')

In [8]:
# Columns that are NOT free text. Every other column passed to
# combine_text_columns is treated as text.
NUMERIC_COLUMNS = ['cases_handled']
CATEGORICAL_COLUMNS = ['race', 'sex', 'investigative_findings']

def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + CATEGORICAL_COLUMNS):
    """Join the text columns of each row into one space-separated string.

    :param data_frame: DataFrame holding the text columns and, optionally,
        some of the non-text columns named in ``to_drop``.
    :param to_drop: Names of non-text columns to exclude before joining.
        Names that are not present in ``data_frame`` are ignored. Defaults
        to the numeric plus the categorical feature columns.
    :return: Series indexed like ``data_frame`` with one combined string per
        row. Missing values contribute an empty string; values that are not
        strings are converted with ``str`` instead of raising ``TypeError``.
    """
    # Only drop the columns that are actually present, so an absent name is
    # not an error.
    unwanted = set(to_drop)
    present = [col for col in data_frame.columns if col in unwanted]
    text_data = data_frame.drop(present, axis=1)

    def _join_row(row):
        # NaN/None become "", everything else is stringified, then space-joined.
        return " ".join("" if pd.isna(value) else str(value) for value in row)

    return text_data.apply(_join_row, axis=1)

In [9]:
from sklearn.preprocessing import FunctionTransformer

def _select_numeric(frame):
    """Return only the NUMERIC_COLUMNS of ``frame``."""
    return frame[NUMERIC_COLUMNS]


def _select_categorical(frame):
    """Return only the CATEGORICAL_COLUMNS of ``frame``."""
    return frame[CATEGORICAL_COLUMNS]


# Stateless transformers that pick out each feature family from the input frame.
# validate=False passes the DataFrame through to the function unchanged.
get_text_data = FunctionTransformer(combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(_select_numeric, validate=False)
get_categorical_data = FunctionTransformer(_select_categorical, validate=False)

In [10]:
from features.SparseInteractions import SparseInteractions

In [11]:
from sklearn.feature_selection import chi2, SelectKBest

from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Imputer
from sklearn.feature_extraction.text import HashingVectorizer

from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import MaxAbsScaler

# Token pattern: a run of ASCII letters/digits that is immediately followed by
# at least one whitespace character. The whitespace is only a lookahead, so it
# is not part of the token. A final token with no whitespace after it does not
# match this pattern.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

In [12]:
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    ``fit`` learns the label -> integer mapping of every target column and
    ``transform`` re-applies that same mapping, so training and test frames
    are encoded consistently. If ``transform`` is called on an instance that
    was never fitted (or for a column that was not seen by ``fit``), it falls
    back to fitting a fresh encoder on the frame being transformed.
    """

    def __init__(self, columns=None):
        self.columns = columns  # column names to encode; None means every column

    def _target_columns(self, X):
        """Return the list of column names to encode for frame ``X``."""
        return list(X.columns) if self.columns is None else list(self.columns)

    def fit(self, X, y=None):
        """Fit one ``LabelEncoder`` per target column of ``X``.

        :param X: DataFrame containing the columns to encode.
        :param y: Ignored; accepted so the class can be used as a pipeline step.
        :return: ``self``.
        """
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the target columns replaced by integer codes.

        Columns listed in ``self.columns`` are encoded; if no columns were
        specified, every column of ``X`` is encoded. ``X`` itself is not modified.

        :raises ValueError: from ``LabelEncoder.transform`` when a fitted
            column contains a label that was not present during ``fit``.
        """
        output = X.copy()
        fitted = getattr(self, 'encoders_', None)
        for col in self._target_columns(output):
            if fitted is not None and col in fitted:
                # reuse the mapping learned in fit() so codes match across frames
                output[col] = fitted[col].transform(output[col])
            else:
                # not fitted for this column: encode from this frame alone
                output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)

In [13]:
%%time

# ignore deprecation warnings in sklearn
import warnings
warnings.filterwarnings("ignore")

# setting a reasonable number of features before adding interactions
chi_k = 15

# creating the pipeline object
pl = Pipeline([
        ('union', FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data)#,
                    #('imputer', Imputer())
                ])),
                
                ('categorical_features', Pipeline([
                    ('selector', get_categorical_data),
                    ('le', MultiColumnLabelEncoder())
                ])),
                
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', HashingVectorizer(token_pattern=TOKENS_ALPHANUMERIC,
                                                     non_negative=True, norm=None, binary=False,
                                                     ngram_range=(1, 2))),
                    ('dim_red', SelectKBest(chi2, chi_k))
                ]))
             ]
        )),
        ('int', SparseInteractions(degree=2)),
        ('scale', MaxAbsScaler()),
        ('clf', AdaBoostClassifier(DecisionTreeClassifier(), random_state = 42))
    ])

# fitting the pipeline to our training data
pl.fit(X_train, y_train.values)

# printing the score of our trained pipeline on our test set
#print("Logloss score of trained pipeline: ", log_loss_scorer(pl, X_test, y_test.values))

# Computing and printing accuracy
accuracy = pl.score(X_train, y_train)
print("\nAccuracy on test dataset: ", accuracy)


Accuracy on test dataset:  0.970238095238
Wall time: 11.4 s


In [14]:
# Predict a label for every row of the test features.
predictions = pl.predict(X_test)

# One-column frame of predicted labels, indexed by the test set's officer ids.
dt = {'target': predictions}
prediction_df = pd.DataFrame(dt, index=test['officer_id'])

# Write the predictions to "predictions12.csv".
prediction_df.to_csv("predictions12.csv")