# Classification using Logistic Regression

The variables related to the ticket amount (judgment, fine, and discount amounts),
the geographical position (lat, lon, address, zip_code), and the hearing waiting time will be used as features to distinguish between compliant and non-compliant tickets.

In [1]:
%matplotlib notebook
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from utils import *

In [2]:
# Columns taken from the raw tickets file: the disposition, the three amount
# columns, and the target ("compliance").
preliminary_variables = [
    "disposition",
    "fine_amount",
    "discount_amount",
    "judgment_amount",
    "compliance",
]

In [3]:
# Read the raw tickets, indexed by ticket_id, then keep only the preliminary
# columns and discard every row that has a missing value in any of them.
df = pd.read_csv(
    "train.csv",
    index_col="ticket_id",
    encoding="ISO-8859-1",
    low_memory=False,
)
df1 = df[preliminary_variables].dropna()

In [4]:
# Attach the geolocation info to each ticket (left join on the index) and
# drop the tickets for which the join left missing values.
df_geoloc = df_geog_info(verbose=True)
df1 = df1.join(df_geoloc, how="left").dropna()

# Keep the value returned by the lat/lon scaling helper for later use.
scalers_lat_lon = scale_lat_lon(df1)

rows in df with addresses and ticketsID: 311307
rows after joining by address with latlon df: 266319
rows with merged info (lat,lon, address) after Nas dropped: 266316


In [5]:
# Replace df1 with the frame returned by generate_street_zipcode; the info()
# listing below shows `street` and `zip_code` columns alongside `address`.
df1 = generate_street_zipcode(df1)
# Summarise columns, non-null counts and dtypes of the resulting frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138117 entries, 37852 to 284333
Data columns (total 12 columns):
disposition        138117 non-null object
fine_amount        138117 non-null float64
discount_amount    138117 non-null float64
judgment_amount    138117 non-null float64
compliance         138117 non-null float64
lat                138117 non-null float64
lon                138117 non-null float64
address            138117 non-null object
scaled_lat         138117 non-null float64
scaled_lon         138117 non-null float64
street             138117 non-null object
zip_code           138117 non-null object
dtypes: float64(8), object(4)
memory usage: 13.7+ MB


### Encoding  categorical features

In [6]:
# Replace df1 with the encoded frame. The helper prints, per categorical
# variable, the number of unique values and of binary-encoded features generated.
# NOTE(review): presumably the original zip_code/street/address columns are
# replaced by <name>_<bit> columns — confirm in utils.
df1 = encode_categorical(df1)

Variable zip_code:
- 17666 unique values
- max. binary representation: 0b100010100000010
- binary-encoded generated features: 16

Variable street:
- 1696 unique values
- max. binary representation: 0b11010100000
- binary-encoded generated features: 12

Variable address:
- 64932 unique values
- max. binary representation: 0b1111110110100100
- binary-encoded generated features: 17



## Split dataset for training/cv and test

In [7]:
# The target is the compliance flag; the raw lat/lon columns are excluded from
# the features together with the target itself.
target = 'compliance'
ff = ["compliance", "lat", "lon"]
features = df1.columns.drop(ff)

# 85% of the rows go to training/cross-validation, the rest to the test set.
X_train, X_test, y_train, y_test = train_test_split(
    df1[features],
    df1[target],
    random_state=0,
    train_size=0.85,
)



### Scaling  features related with fines amounts
- The features related to fine amounts are normalized but not centered around the mean.

In [8]:
# Scale the amount columns of the training set and keep the returned scalers.
# NOTE(review): the cells below read scaled_fine_amount / scaled_discount_amount /
# scaled_judgment_amount from X_train, so the helper presumably adds those
# columns in place — confirm in utils.
scalers_amounts = scale_fine_amounts(X_train)
# Drop the unscaled originals so only the scaled versions remain as features.
# NOTE(review): X_test receives no matching transformation in this cell; verify
# it is scaled with scalers_amounts (and the same columns dropped) before predict().
X_train.drop(columns=["fine_amount","discount_amount","judgment_amount"],inplace=True) 

#### Outliers 

In [9]:
# One histogram (log-scaled counts) per scaled amount column. Each title
# reports how many training rows exceed 15 for that column.
#
# `outliers_fines` accumulates the index labels of the outliers of ALL three
# columns. Previously it was overwritten on every iteration, so only the
# outliers of the last column were available to later cells.
amount_columns = ['scaled_discount_amount', 'scaled_fine_amount', 'scaled_judgment_amount']
outliers_fines = X_train.index[:0]  # empty index with the same dtype as X_train.index

plt.subplots(len(amount_columns),1)
for position, column in enumerate(amount_columns):
    column_outliers = X_train[column].where( X_train[column]>15).dropna().index
    outliers_fines = outliers_fines.union(column_outliers)

    plt.subplot(len(amount_columns),1,position+1)
    sns.distplot(X_train[column],kde=False,norm_hist=False,bins=35)
    plt.gca().set_yscale("log")
    plt.grid(linestyle='--',axis="y")
    plt.gca().set_yticks([1e1,1e2,1e3,1e4,1e5])
    plt.title(" number of cases with scaled amount > 15: {}".format( len(column_outliers) ) )

# Lay out once, after every title is set, so titles and axes do not overlap.
plt.tight_layout()

<IPython.core.display.Javascript object>

### Outliers in geographical distribution (Remote points) 

All data points with a scaled lat-lon distance > 2 from the origin are dropped for training:

In [10]:
def origin_distance(x):
    """Distance of (scaled_lat, scaled_lon) from the origin.

    `x` may be a single row or a whole frame; only the "scaled_lat" and
    "scaled_lon" entries are read.
    """
    return np.sqrt(x["scaled_lat"] * x["scaled_lat"] + x["scaled_lon"] * x["scaled_lon"])


# Rows farther than 2 (in scaled units) from the origin are the geographic
# outliers; rows containing missing values are excluded from the selection.
is_remote = origin_distance(X_train) > 2
outliers_geog = X_train.loc[is_remote].dropna().index
print("compliance values for outliers: {}".format(y_train.loc[outliers_geog].unique()))

compliance values for outliers: [0.0]
Categories (1, float64): [0.0]


In [11]:
# Scatter of all training points (blue) with the geographic outliers overlaid
# in red; the title reports the number of outlier rows.
outlier_points = X_train.loc[outliers_geog]
n_outliers = outlier_points.count().unique()[0]

fig, ax = plt.subplots()
ax.scatter(
    X_train.scaled_lat, X_train.scaled_lon,
    marker='o', c="b", s=0.5,
    label="preserved values (origin distance <=2)",
)
ax.scatter(
    outlier_points.scaled_lat, outlier_points.scaled_lon,
    marker='*', c="r", s=1,
    label="outliers (origin distance > 2)",
)
ax.set_xlabel("scaled latitude")
ax.set_ylabel("scaled longitude")
ax.set_title("Number of outliers for training set: {}".format(n_outliers))
ax.grid(linestyle='--')
ax.legend(loc="lower right")
fig.tight_layout()

<IPython.core.display.Javascript object>

### final features:

In [12]:
# List the columns, non-null counts and dtypes of the training features.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117399 entries, 107479 to 116671
Data columns (total 54 columns):
address_0                       117399 non-null int64
address_1                       117399 non-null int64
address_2                       117399 non-null int64
address_3                       117399 non-null int64
address_4                       117399 non-null int64
address_5                       117399 non-null int64
address_6                       117399 non-null int64
address_7                       117399 non-null int64
address_8                       117399 non-null int64
address_9                       117399 non-null int64
address_10                      117399 non-null int64
address_11                      117399 non-null int64
address_12                      117399 non-null int64
address_13                      117399 non-null int64
address_14                      117399 non-null int64
address_15                      117399 non-null int64
address_16          

# Model development

In [14]:
# The grid searches both 'l1' and 'l2' penalties, so the solver must support
# both. "liblinear" does (and was the default in older scikit-learn releases),
# whereas the current default "lbfgs" rejects penalty='l1'.
logit = LogisticRegression(solver="liblinear")

# C is the inverse regularisation strength; every C is tried with each penalty.
grid_values = {'C': [1e-2,1e-1, 1, 10, 50], 'penalty':['l1','l2']}

### Logit using full X_train set

In [15]:
# Grid-search the logistic regression on the full training set, scoring by ROC AUC.
grid_clf = train_and_cross_validate(
    logit,
    X_train,
    y_train,
    grid_values,
    scoring="roc_auc",
)
print("best score: {}".format(grid_clf.best_score_))

# Evaluate on the held-out test set.
y_test_pred = grid_clf.predict(X_test)
confusion_mat(y_true=y_test, y_pred=y_test_pred)



--- 275.36456823349 seconds ---
best score: 0.784365166022268


NameError: name 'selected_features' is not defined

In [16]:
# Confusion matrix of the fitted grid search on the held-out test set.
# NOTE(review): the recorded output classifies almost every ticket as 1, and
# not a single true 0 as 0 — check that X_test has gone through the same
# scaling and column selection as X_train before predict().
confusion_mat(y_true=y_test,y_pred=grid_clf.predict(X_test))

Unnamed: 0,Predicted 0,Predicted 1
True 0,0,19244
True 1,15,1459


In [29]:
# Show the normalised coefficients of the best estimator whose magnitude exceeds
# the default tolerance.
# NOTE(review): feature_importances_logit is defined in the cell that follows
# this one (executed earlier, as In [28]); on a fresh kernel that cell must run first.
feature_importances_logit(grid_clf,X_train)

Feature Importances:


{-0.22741530206635424: 'zip_code_4',
 -0.09243048986286799: 'zip_code_14',
 -0.07137554402353757: 'responsible_by_determination',
 -0.04643886935681358: 'responsible_by_default',
 -0.03504903371156633: 'scaled_fine_amount',
 -0.01673291559639852: 'zip_code_15',
 -0.014048177075117397: 'zip_code_2',
 -0.007186272045318873: 'street_7',
 -0.006837135900985471: 'address_9',
 0.007127651261680294: 'zip_code_12',
 0.008535540738080862: 'zip_code_8',
 0.018680063184995325: 'zip_code_3',
 0.039017088210676794: 'address_1',
 0.1762013720300823: 'address_2',
 0.19296197745584007: 'address_13'}

In [28]:
def feature_importances_logit(grid_clf,X_train,tolerance=5e-3):
    """Return the dominant, L1-normalised coefficients of a fitted linear model.

    Each coefficient of ``grid_clf.best_estimator_.coef_[0]`` is divided by the
    sum of the absolute values of all coefficients and paired with the column
    of ``X_train`` at the same position.

    The previous implementation used the raw coefficients as dictionary keys
    before pairing them with the column names. Repeated coefficients (e.g. the
    zeros produced by an L1 penalty) collapsed into a single key, which shifted
    every later coefficient onto the wrong feature name and excluded the
    duplicates from the norm. Coefficients and names are now paired directly.

    Parameters
    ----------
    grid_clf : object
        Fitted search object exposing ``best_estimator_.coef_``; only the
        first row of ``coef_`` is used.
    X_train : pandas.DataFrame
        Frame whose columns are in the order used to fit the model.
    tolerance : float, default 5e-3
        Only features with ``abs(normalised coefficient) > tolerance`` are kept.

    Returns
    -------
    dict
        ``{normalised_coefficient: feature_name}``. Returns an empty dict when
        every coefficient is zero. If two retained features share exactly the
        same normalised coefficient, the later column overwrites the earlier.
    """
    print("Feature Importances:")
    coefficients = np.asarray(grid_clf.best_estimator_.coef_[0], dtype=float)
    norm = np.abs(coefficients).sum()
    if norm == 0:
        # Nothing to normalise by: no feature can exceed the tolerance.
        return {}

    feature_importances = {}
    for weight, name in zip(coefficients / norm, X_train.columns.to_numpy()):
        if abs(weight) > tolerance:
            feature_importances[float(weight)] = name
    return feature_importances

### Logit without outliers and F1-score

In [1]:
# Remove both kinds of outliers from the training features. union() merges the
# two label sets without duplicates.
outlier_rows = outliers_geog.union(outliers_fines)
X_train_no_outl = X_train.drop(index=outlier_rows)

# The targets must cover exactly the same rows as the features. Passing the
# full y_train alongside the reduced X_train gives X and y different lengths.
y_train_no_outl = y_train.loc[X_train_no_outl.index]

grid_clf = train_and_cross_validate(model=logit,X_train=X_train_no_outl,y_train=y_train_no_outl,parameters=grid_values,scoring="f1")
confusion_mat(y_true=y_test,y_pred=grid_clf.predict(X_test))

NameError: name 'X_train' is not defined

In [None]:
# Normalised coefficients of the model fitted on the outlier-free training set.
feature_importances_logit(grid_clf,X_train_no_outl)

### Logit without address and zip_code features

In [None]:
# Drop every feature whose name contains "address" or "zip_code", then repeat
# the ROC-AUC grid search on the reduced training set.
location_code_columns = [
    column
    for column in X_train.columns
    if "address" in column or "zip_code" in column
]
X_train_red = X_train.drop(columns=location_code_columns)
train_and_cross_validate(
    model=logit,
    X_train=X_train_red,
    y_train=y_train,
    parameters=grid_values,
    scoring="roc_auc",
)