# In [None]:
import pandas as pd
import numpy as nps
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder





def _encode_shared_labels(train_df, test_df, column):
    """Integer-encode ``column`` in both frames using one shared LabelEncoder.

    The encoder is fitted on the union of the training and test values so
    that a category present in only one of the frames still gets a code.
    Both frames are modified in place.
    """
    encoder = LabelEncoder()
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    encoder.fit(pd.concat([train_df[column], test_df[column]], ignore_index=True))
    train_df[column] = encoder.transform(train_df[column])
    test_df[column] = encoder.transform(test_df[column])


def blight_model():
    """Train a random-forest model on the blight data and score the test set.

    Reads ``train.csv``, ``test.csv``, ``addresses.csv`` and ``latlons.csv``
    from the current working directory, prepares the features, tunes a
    ``RandomForestRegressor`` with a grid search scored by ROC AUC, and
    predicts on the test rows.

    Returns:
        pandas.DataFrame: one column (labelled ``0``) holding the model's
        predictions for the test rows, indexed by their ``ticket_id``.
    """
    # Import the data.
    train_df = pd.read_csv('train.csv', engine='python')
    test_df = pd.read_csv('test.csv', engine='python')
    addresses = pd.read_csv('addresses.csv', encoding="latin1")
    latlons = pd.read_csv('latlons.csv', engine='python')

    # Pre-process the data.

    # Remove any row whose compliance value is NaN (no target to learn from).
    train_df = train_df.dropna(subset=['compliance'])

    # Attach lat/lon to every ticket. The address -> coordinates table is the
    # same for both frames, so build it once and reuse it.
    geo = pd.merge(addresses, latlons, on='address')
    train_df = pd.merge(train_df, geo, on='ticket_id')
    test_df = pd.merge(test_df, geo, on='ticket_id')

    # Drop columns that are not present in the testing file.
    train_df = train_df.drop(columns=[
        'payment_amount', 'balance_due', 'payment_date',
        'payment_status', 'collection_status', 'compliance_detail',
    ])

    # Drop columns that are not used as features.
    train_df = train_df.drop(columns=[
        'agency_name', 'inspector_name', 'violator_name', 'non_us_str_code',
        'ticket_issued_date', 'hearing_date',
        'mailing_address_str_number', 'mailing_address_str_name',
        'clean_up_cost', 'violation_description',
        'grafitti_status', 'state_fee', 'admin_fee', 'violation_zip_code',
        'violation_street_number', 'zip_code', 'violation_street_name',
        'city', 'state', 'country', 'address',
    ])

    # Encode the remaining string columns as integers.
    _encode_shared_labels(train_df, test_df, 'disposition')
    _encode_shared_labels(train_df, test_df, 'violation_code')

    # Fill missing coordinates with the mean of the respective frame.
    for frame in (train_df, test_df):
        for coord in ('lat', 'lon'):
            frame[coord] = frame[coord].fillna(frame[coord].mean())

    # Separate features from the target and restrict the test frame to the
    # same feature columns, in the same order.
    # (DataFrame.drop replaces the .ix indexer, which was removed in pandas 1.0.)
    features = train_df.drop(columns='compliance')
    target = train_df['compliance']
    test_df = test_df[list(features.columns)]

    # Train the model. Only the training part of the split is fitted; the
    # held-out part is not used in this function.
    X_train, _, y_train, _ = train_test_split(features, target)
    regr_rf = RandomForestRegressor()

    # NOTE(review): scoring='roc_auc' on a regressor relies on the scorer
    # accepting predict() output -- confirm with the installed scikit-learn.
    grid_values = {'n_estimators': [10, 100], 'max_depth': [None, 30, 50]}
    grid_clf_auc = GridSearchCV(regr_rf, param_grid=grid_values, scoring='roc_auc')
    grid_clf_auc.fit(X_train, y_train)

    return pd.DataFrame(grid_clf_auc.predict(test_df), index=test_df.ticket_id)
# Run the whole pipeline when this cell/script is executed; the returned
# predictions are not assigned to a name here.
blight_model()