In [67]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
import download
import mlpipeline
import helper

In [75]:
# Run configuration for the eviction-prediction notebook.
# Keys read by the cells visible in this notebook:
#   top_x_percent, prediction_window -> mlpipeline.create_label
#   discrete_bins, cats              -> mlpipeline.process_df
#   model_to_run, model_params       -> mlpipeline.run_model
# The remaining keys (to_exclude, date_col, start_time, end_time, len_train, outcome)
# are not read by any cell shown here; they are kept unchanged for other consumers.
params = {
    'to_exclude': ['year', 'label'],
    'top_x_percent': 0.1,
    'date_col': 'filing_year',
    'prediction_window': 12,
    'start_time': '2010-01-01',
    'end_time': '2016-01-01',
    'len_train': 36,
    'discrete_bins': 3,
    'cats': ['low', 'medium', 'high'],
    'outcome': 'label',
    'model_to_run': 'LR',
    # Hyper-parameters keyed by model name; 'model_to_run' selects the entry to use.
    'model_params': {
        'LR': {
            'C': 10,
            'class_weight': None,
            'penalty': 'l2',
            'fit_intercept': True,
            'intercept_scaling': 1,
            'max_iter': 100,
            # 'warn' was only a placeholder default in scikit-learn 0.20/0.21: it resolved
            # to 'ovr' and is rejected by later releases. Spell the value out explicitly.
            'multi_class': 'ovr',
            'n_jobs': None,
            'random_state': 1234,
            # Same story as multi_class: the 'warn' placeholder resolved to 'liblinear'.
            'solver': 'liblinear',
            'tol': 0.0001,
            'verbose': 0,
            'warm_start': False,
        },
    },
}

In [13]:
# Load each tract-level input table from ../inputs through its dedicated loader
# in the local `download` module.
eviction = download.load_evict('../inputs/eviction_data_tract.csv')
crime = download.load_crime('../inputs/crime_by_tract.csv')
buildings = download.load_building('../inputs/building_violation_by_tract.csv')
acs = download.load_acs('../inputs/acs_year_tract.csv')
education = download.load_education('../inputs/educ_year_tract.csv')
tracts = download.load_tract('../inputs/ch_opdat/tracts.csv')
# Combine all six tables into the single frame used by every later cell.
# NOTE(review): join keys and join type live inside download.join_bases — confirm there.
eviction_df = download.join_bases(eviction, acs, education, crime, buildings, tracts)

In [14]:
# Build the binary outcome column from 'eviction_filings_rate', using 'year' as the time
# column. The threshold argument is the complement of the configured top share.
# NOTE(review): exact labelling rule is defined in mlpipeline.create_label — confirm there.
label_quantile = 1 - params['top_x_percent']
eviction_df = mlpipeline.create_label(
    eviction_df,
    'year',
    'eviction_filings_rate',
    label_quantile,
    params['prediction_window'],
)

# Drop the two look-ahead helper columns so they cannot be picked up as features.
eviction_df = eviction_df.drop(columns=['eviction_filings_rate_next_year', 'next_year'])

In [108]:
# Hold out the 2017 rows as the test set; every other year is used for training.
# Both sides are copied so later column assignments do not touch eviction_df.
is_holdout_year = eviction_df['year'] == 2017
train_set = eviction_df.loc[~is_holdout_year].copy()
test_set = eviction_df.loc[is_holdout_year].copy()

In [109]:
# Columns that mlpipeline reports as continuous; these are handed to process_df for binning.
cols_to_discretize = mlpipeline.get_continuous_variables(eviction_df)
# One "<column>_group" name per continuous column, in the same order.
# A comprehension replaces the append loop and keeps the loop variable out of the
# notebook's global namespace.
cols_to_binary = [col + "_group" for col in cols_to_discretize]

In [110]:
# Run the shared preprocessing step on the training rows.
# NOTE(review): what process_df does with the bin count, bin labels and "_group" names
# is defined in mlpipeline — confirm there.
n_bins = params['discrete_bins']
bin_labels = params['cats']
process_train = mlpipeline.process_df(
    train_set, cols_to_discretize, n_bins, bin_labels, cols_to_binary
)

perc_increase_total_crime_class_Violent Crime                             799
perc_increase_total_crime_class_Property Crime                            797
perc_increase_total_crime_class_Less serious offences_mean_by_commarea    796
perc_increase_bv_mean_by_commarea                                         796
perc_increase_bv                                                          796
perc_increase_total_crime_class_Less serious offences                     796
perc_increase_total_crime                                                 796
perc_increase_total_crime_class_Violent Crime_mean_by_commarea            796
perc_increase_total_crime_class_Property Crime_mean_by_commarea           796
perc_increase_total_crime_mean_by_commarea                                796
population_race_asian                                                      21
population_race_latinx                                                     21
population_race_other                                           

In [111]:
# Keep only the columns whose every value is 0 or 1 (the indicator-style columns).
binary_mask = process_train.apply(lambda column: column.isin([0, 1]).all())
selected_features = binary_mask.index[binary_mask.values].tolist()
# The outcome is itself 0/1, so it passes the filter above; take it out of the predictors.
selected_features.remove('label')
# Alias kept for any code that refers to the feature list as `predictors`.
predictors = selected_features

In [112]:
# Apply the same preprocessing call to the held-out rows, with the same arguments
# that were used for the training frame.
# NOTE(review): the test frame is processed independently of the training frame, so any
# bin edges process_df derives from its input would differ between the two — confirm.
shared_process_args = (
    cols_to_discretize,
    params['discrete_bins'],
    params['cats'],
    cols_to_binary,
)
process_test = mlpipeline.process_df(test_set, *shared_process_args)

race                                                              4
housing_units_other                                               4
population_race_asian                                             4
population_race_latinx                                            4
population_race_black                                             4
population_race_white                                             4
population_poverty_above                                          4
population_poverty_below                                          4
population_race_other                                             4
housing_units_rental                                              4
perc_increase_total_crime_class_Violent Crime                     1
total_primary_type_WEAPONS VIOLATION                              0
total_primary_type_OTHER OFFENSE                                  0
total_primary_type_PUBLIC INDECENCY                               0
total_primary_type_PUBLIC PEACE VIOLATION       

In [113]:
# Training predictors and outcome.
x_train = process_train[selected_features]
y_train = process_train['label']
# Align the test frame to the training feature list instead of indexing it directly:
# the test rows are processed on their own, so an indicator column that exists in the
# training frame may be missing here, and plain [] indexing would raise KeyError.
# A missing indicator is filled with 0; when every column is present the result is
# the same selection as before, in the same column order.
# NOTE(review): assumes an absent column means "no test row in that category" — confirm
# against mlpipeline.process_df.
x_test = process_test.reindex(columns=selected_features, fill_value=0)

In [137]:
# Fit the configured model and score the test rows; the hyper-parameters are looked up
# under the same model name that is passed to run_model.
chosen_model = params['model_to_run']
lr_pred = mlpipeline.run_model(
    x_train,
    y_train,
    x_test,
    chosen_model,
    params['model_params'][chosen_model],
)



In [115]:
# from sklearn.linear_model import LogisticRegression

In [116]:
# clf = LogisticRegression(C= 10,
#                         class_weight= None,
#                         penalty= 'l2',
#                         fit_intercept= True,
#                         intercept_scaling= 1,
#                         max_iter= 100,
#                         multi_class= 'warn',
#                         n_jobs= None,
#                         random_state= 1234,
#                         solver= 'warn',
#                         tol= 0.0001,
#                         verbose= 0,
#                         warm_start= False)
# clf.fit(x_train, y_train)
# y_pred_probs = clf.predict_proba(x_test)



In [119]:
# Attach the model score to the held-out rows.
# This cell used to read `y_pred_probs`, a name that is only assigned inside the
# commented-out scikit-learn cell, so a fresh Restart & Run All stopped here with a
# NameError. It now uses `lr_pred`, the result of the mlpipeline.run_model cell.
# NOTE(review): run_model's return shape is not visible in this notebook. A 2-D
# predict_proba-style result contributes its second column (the original [:, 1]
# indexing); anything else is used as-is. Confirm against mlpipeline.run_model.
positive_scores = lr_pred[:, 1] if getattr(lr_pred, 'ndim', 1) == 2 else lr_pred
test_set['predicted_score'] = positive_scores

In [120]:
# Order the held-out rows from highest to lowest score (sorted in place), then let the
# helper turn the ordered scores into a 0/1 flag.
# NOTE(review): the meaning of the 10 (count vs. percent) is defined in
# helper.generate_binary_at_k — confirm there.
test_set.sort_values(by='predicted_score', ascending=False, inplace=True)
top_k_flags = helper.generate_binary_at_k(test_set['predicted_score'], 10)
test_set['prediction'] = top_k_flags

In [129]:
# Show the tracts that were flagged (prediction == 1), limited to the identifying columns.
is_flagged = test_set['prediction'] == 1
display_cols = ['tract', 'year', 'prediction']
test_set.loc[is_flagged, display_cols]

Unnamed: 0,tract,year,prediction
4376,440101,2017,1
4303,431302,2017,1
4377,440102,2017,1
4381,440600,2017,1
6166,720300,2017,1
5734,670700,2017,1
5737,671100,2017,1
5738,671200,2017,1
2450,231500,2017,1
2452,836700,2017,1
