In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix 

from sklearn import tree
from matplotlib import pyplot as plt 
import random
from sklearn import preprocessing

import warnings
from sklearn.exceptions import DataConversionWarning
# Silence sklearn's DataConversionWarning for every cell that runs after this one.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)
# Turn off pandas' chained-assignment warning globally (None = no warning, no error).
pd.set_option('mode.chained_assignment', None)

In [3]:
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [4]:
def _report_targets(train_data, train_targets, report_data, report_targets,
                    df_cols, predict_cols, split_type, name, extra_labels=()):
    """Train one model per target column and print/plot/export its evaluation.

    Columns whose name contains "disp" (case-insensitive) are treated as
    regression targets; every other column is treated as a classification
    target.

    Parameters
    ----------
    train_data, train_targets : DataFrame
        Rows used to fit the models (features and targets share an index).
    report_data, report_targets : DataFrame
        Rows the fitted classifier is applied to when writing the CSV report.
    df_cols : list of str
        Feature column names.
    predict_cols : list of str
        Target column names.
    split_type, name : str
        Forwarded to ``output_csv`` to build the output file name.
    extra_labels : tuple of str
        Extra words appended to the per-target header line that is printed.
    """
    for target in predict_cols:
        print("=============================================================")
        if "disp" in target.lower():
            print(target, "regression", *extra_labels)
            model, stats = train_model(train_data[df_cols], train_targets, df_cols, target, "regression")
            # Only the histogram is trimmed to the 1st-99th percentile of the
            # residuals; describe() below still reports the untrimmed column.
            lower = stats["diff"].quantile(0.01)
            upper = stats["diff"].quantile(0.99)
            diff = stats.loc[(stats["diff"] >= lower) & (stats["diff"] <= upper)]["diff"]
            print("description")
            print(stats['diff'].describe())
            plt.hist(diff)
            plt.show()
        else:
            print(target, "classification", *extra_labels)
            # Named `matrix` so the imported sklearn `confusion_matrix`
            # function is not shadowed.
            model, stats, matrix = train_model(train_data[df_cols], train_targets, df_cols, target, "classification")
            print("accuracy score", stats)
            print("confusion matrix")
            print(matrix)
            output_csv(model, report_data, df_cols, report_targets[target], split_type, name)


def run_single_geography_model(data_df, predict_df, df_cols, predict_cols, name, random_state=None):
    """Fit and report models for one geography, on all rows and on a 50/50 subset.

    The target columns are joined onto ``data_df`` by index, rows containing
    any null are dropped, and one model per target is trained on the full set
    ("Normal set"). A second pass trains on a subset holding equally many rows
    with ``predict_cols[0] == 1`` and rows without, and the resulting
    classifiers are reported against the full set ("Even split").

    Note that predict_cols[0] must be equal to the binary gentrification variable.

    Parameters
    ----------
    data_df : DataFrame
        Feature table; must contain ``df_cols`` and a "geo_fips" column.
    predict_df : DataFrame
        Table holding the target columns, indexed like ``data_df``.
    df_cols : list of str
        Feature column names.
    predict_cols : list of str
        Target column names.
    name : str
        Label used in the output CSV file names.
    random_state : int or None, optional
        Seed for drawing the balanced subset. ``None`` (the default) uses the
        global ``random`` module state, as before.
    """
    df_cols = list(df_cols)
    predict_cols = list(predict_cols)

    # Join the targets onto the features (by index), then drop incomplete rows.
    merged = data_df.join(pd.DataFrame(data={col: predict_df[col] for col in predict_cols}))
    shape_before = merged.shape
    merged = merged.dropna()
    print("previous to dropping", shape_before, "after dropping", merged.shape)

    feature_cols = df_cols if "geo_fips" in df_cols else df_cols + ["geo_fips"]
    targets = merged[predict_cols].copy()
    features = merged[feature_cols].copy()
    print(targets.shape, features.shape)
    assert targets.shape[0] == features.shape[0], "after removing nulls, predict and data don't have the same # of rows"

    # normal
    _report_targets(features, targets, features, targets,
                    df_cols, predict_cols, "Normal set", name)

    # equal number of gentrified and non-gent geometries
    is_gent = targets[predict_cols[0]] == 1
    gent_indicies = targets.index[is_gent]
    other_indicies = list(targets.index[~is_gent])
    # Balance on the smaller class so sampling can never ask for more rows
    # than exist (random.sample raises ValueError in that case).
    sample_size = min(len(gent_indicies), len(other_indicies))
    if sample_size == 0:
        print("skipping 50/50 split: need both gentrified and non-gentrified rows")
        return
    rng = random if random_state is None else random.Random(random_state)
    non_gent_selected_indicies = rng.sample(other_indicies, sample_size)
    if len(gent_indicies) > sample_size:
        gent_indicies = rng.sample(list(gent_indicies), sample_size)
    assert len(non_gent_selected_indicies) == len(gent_indicies), "gent data is not split 50/50"
    print(list(non_gent_selected_indicies)[:5], type(non_gent_selected_indicies), type(gent_indicies))

    balanced_index = non_gent_selected_indicies + list(gent_indicies)
    balanced_features = features.loc[balanced_index]
    balanced_targets = targets.loc[balanced_index]

    print("=============================================================")
    print("=============================================================")
    print("=============================================================")
    print("=====================50/50 split=============================")
    # Train on the balanced subset but report against every complete row.
    _report_targets(balanced_features, balanced_targets, features, targets,
                    df_cols, predict_cols, "Even split", name,
                    extra_labels=("50/50 split",))

        

In [5]:
def optimize_classification(x1_train, y1_train):
    """Pick the decision-tree depth whose training confusion matrix has the largest determinant.

    Entropy-criterion classifiers are fitted for max_depth 10, 15, ..., 45.
    Each one is scored on the same data it was trained on, using the
    determinant of its confusion matrix.

    Parameters
    ----------
    x1_train : array-like
        Training features.
    y1_train : array-like
        Training labels.

    Returns
    -------
    (clf, max_depth)
        The fitted classifier with the highest determinant and its depth.
        On ties the shallowest candidate wins.
    """
    def helper(depth):
        clf = tree.DecisionTreeClassifier(max_depth=depth, criterion="entropy")
        clf = clf.fit(x1_train, y1_train)
        z = clf.predict(x1_train)
        return clf, confusion_matrix(y1_train, z)

    best_clf = None
    best_depth = None
    # Start below any possible determinant so a model is always selected; a
    # start value of 0 returned (None, None) whenever no determinant was
    # strictly positive, and the caller then failed on `None.predict`.
    best_determinant = -np.inf
    for depth in range(10, 50, 5):
        model, matrix = helper(depth)
        det = np.linalg.det(matrix)
        if det > best_determinant:
            best_clf = model
            best_depth = depth
            best_determinant = det
    return best_clf, best_depth
    

In [6]:
def optimize_regression(x1_train, y1_train):
    """Pick the regression-tree depth with the highest training R^2.

    Regressors are fitted for max_depth 10, 15, ..., 45 and scored with
    ``score`` (R^2) on the data they were trained on.

    The previous body could not run: "entropy" is a classification criterion
    that DecisionTreeRegressor rejects, and accuracy_score / confusion_matrix
    do not accept continuous targets.

    NOTE(review): scoring on the training data favours deeper trees; a
    held-out or cross-validated score would be a sounder basis — confirm
    before relying on the chosen depth.

    Parameters
    ----------
    x1_train : array-like
        Training features.
    y1_train : array-like
        Continuous training targets.

    Returns
    -------
    (clf, max_depth)
        The fitted regressor with the best score and its depth. On ties the
        shallowest candidate wins.
    """
    def helper(depth):
        reg = tree.DecisionTreeRegressor(max_depth=depth)
        reg = reg.fit(x1_train, y1_train)
        return reg, reg.score(x1_train, y1_train)

    best_reg = None
    best_depth = None
    best_score = -np.inf
    for depth in range(10, 50, 5):
        model, score = helper(depth)
        if score > best_score:
            best_reg = model
            best_depth = depth
            best_score = score
    return best_reg, best_depth
    

In [7]:
def train_model(df, predict_df, df_cols, predict_col, model_type):
    """Scale the features, split 67/33, and fit a decision tree for one target.

    Parameters
    ----------
    df : DataFrame
        Feature values, one column per entry of ``df_cols``.
    predict_df : DataFrame
        Table containing ``predict_col``; rows correspond to the rows of ``df``.
    df_cols : list of str
        Feature column names (used to label the scaled frame).
    predict_col : str
        Name of the target column.
    model_type : str
        "regression" fits a DecisionTreeRegressor; any other value fits a
        classifier chosen by ``optimize_classification``.

    Returns
    -------
    regression : (clf, stats)
        ``stats`` is a DataFrame with "predicted", "actual" and "diff"
        (actual - predicted) for the validation rows.
    classification : (clf, accuracy, confusion)
        Validation accuracy and confusion matrix. The fitted tree is also
        rendered inline through graphviz/pydotplus.

    NOTE(review): the scaler is fitted on all rows before the split, and
    ``clean_data`` trims the training labels to their 10th-90th percentile
    even for binary targets — confirm both are intended.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    # Keep df's index on the scaled frame. clean_data() attaches the labels to
    # the features by index, so a fresh 0..n-1 index here paired features with
    # the wrong labels whenever df's index was not already 0..n-1 (e.g. after
    # dropna() or for the reordered 50/50 subset).
    df_norm = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df_cols,
                           index=getattr(df, "index", None))
    X = df_norm
    Y = predict_df[predict_col]
    x1_train, x1_val, y1_train, y1_val = train_test_split(X, Y, test_size=0.33, random_state=42)
    print("size of training set", str(y1_train.shape), "size of testing set", len(y1_val), "original", str(Y.shape))
    # clean Data
    x1_train, y1_train = clean_data(x1_train, y1_train)

    if model_type == "regression":
        clf = tree.DecisionTreeRegressor()
        clf = clf.fit(x1_train, y1_train)
        z = clf.predict(x1_val)
        stats = pd.DataFrame({"predicted": z, "actual": y1_val, "diff": (y1_val - z)})
        return clf, stats
    else:
        clf, _ = optimize_classification(x1_train, y1_train)
        z = clf.predict(x1_val)
        # Plotting dependencies stay function-local so regression runs do not
        # need graphviz/pydotplus. io.StringIO replaces sklearn.externals.six,
        # which is no longer shipped with scikit-learn.
        from io import StringIO
        from IPython.display import Image, display
        from sklearn.tree import export_graphviz
        import pydotplus
        dot_data = StringIO()
        export_graphviz(clf, out_file=dot_data,
                        filled=True, rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        display(Image(graph.create_png()))

        stats = accuracy_score(z, y1_val)
        confusion = confusion_matrix(y1_val, z)
        return clf, stats, confusion

In [8]:
def confusion(a, b):
    """
    Encode one (actual, predicted) pair of binary labels as a category code.

    a must be from data, b must be from model.

    Returns 0 for a true positive, 1 for a false negative, 2 for a false
    positive and 3 for a true negative. Raises AssertionError when either
    value is not 0 or 1.
    """
    assert a in [0, 1] and b in [0, 1], "not binary 0/1"
    if a:
        # actual positive: TP when the model agrees, FN otherwise
        return 0 if b else 1
    # actual negative: FP when the model says positive, TN otherwise
    return 2 if b else 3

In [9]:
def output_csv(clf, data, df_cols, predict, split_type, name):
    """Write per-row actual vs. predicted labels to ``output/<name> <split_type>.csv``.

    Parameters
    ----------
    clf : fitted classifier
        Must expose ``predict``.
    data : DataFrame
        Must contain ``df_cols`` and a "geo_fips" column.
    df_cols : list of str
        Feature columns passed to ``clf.predict``.
    predict : Series
        Actual binary labels, indexed like ``data``.
    split_type, name : str
        Combined into the output file name.

    The written "false_positive" column holds the code returned by
    ``confusion`` (0 = TP, 1 = FN, 2 = FP, 3 = TN); the column name is kept
    unchanged for existing consumers of the CSV.

    NOTE(review): ``train_model`` fits the classifier on MinMax-scaled
    features, while the features passed to ``predict`` here are unscaled —
    confirm whether ``data`` should be scaled the same way first.
    """
    import os

    z = clf.predict(data[df_cols])
    df = pd.DataFrame({"actual": predict, "predicted": z, "geo_fips": data["geo_fips"]})
    # Walk the two columns once instead of doing a label lookup per row; this
    # is also safe when the index contains repeated labels.
    df["false_positive"] = [
        confusion(actual, predicted)
        for actual, predicted in zip(df["actual"], df["predicted"])
    ]

    # to_csv does not create missing directories.
    os.makedirs("output", exist_ok=True)
    df.to_csv("output/" + name + " " + split_type + ".csv")

In [10]:
def clean_data(x_train, y_train):
    """Keep only training rows whose label lies within its 10th-90th percentile.

    Parameters
    ----------
    x_train : DataFrame
        Training features. Left unmodified.
    y_train : Series or array-like
        Training labels; a Series is matched to ``x_train`` by index.

    Returns
    -------
    (DataFrame, Series)
        The retained feature rows (same columns as ``x_train``) and their
        labels.
    """
    # assign() returns a new frame, so the caller's x_train does not gain a
    # temporary "y_train" column.
    labelled = x_train.assign(y_train=y_train)
    first = labelled["y_train"].quantile(0.1)
    second = labelled["y_train"].quantile(0.9)
    kept = labelled.loc[(labelled["y_train"] >= first) & (labelled["y_train"] <= second)]
    return kept.drop(columns=["y_train"]), kept["y_train"]

In [11]:
def stack_data(df1, cols1, df2, cols2, predict, predict_cols1, predict_cols2):
    """Stack two feature tables, and two target selections from one table, row-wise.

    The ``cols1`` values of ``df1`` are placed above the ``cols2`` values of
    ``df2``; the ``predict_cols1`` values of ``predict`` are placed above its
    ``predict_cols2`` values. Both results are labelled with the first set of
    column names and get a fresh 0..n-1 index.

    Raises AssertionError when ``cols1`` and ``cols2`` differ in length.

    Returns (stacked_data, stacked_predict) as DataFrames.
    """
    assert len(cols1) == len(cols2), "input sizes are not the same"
    feature_rows = np.vstack((df1[cols1].values, df2[cols2].values))
    target_rows = np.vstack((predict[predict_cols1].values, predict[predict_cols2].values))
    features = pd.DataFrame(feature_rows, columns=cols1)
    targets = pd.DataFrame(target_rows, columns=predict_cols1)
    return features, targets

In [12]:
def stack_data_diff_geographies(df1, cols1, df2, cols2, predict1, predict2, predict_cols1, predict_cols2):
    """Stack feature and target tables that come from two separate geographies.

    The ``cols1`` values of ``df1`` are placed above the ``cols2`` values of
    ``df2``; the ``predict_cols1`` values of ``predict1`` are placed above the
    ``predict_cols2`` values of ``predict2``. Both results are labelled with
    the first set of column names and get a fresh 0..n-1 index.

    Raises AssertionError when ``cols1`` and ``cols2`` differ in length.

    Returns (stacked_data, stacked_predict) as DataFrames.
    """
    assert len(cols1) == len(cols2), "input sizes are not the same"
    feature_rows = np.vstack((df1[cols1].values, df2[cols2].values))
    target_rows = np.vstack((predict1[predict_cols1].values, predict2[predict_cols2].values))
    features = pd.DataFrame(feature_rows, columns=cols1)
    targets = pd.DataFrame(target_rows, columns=predict_cols1)
    return features, targets

In [20]:
# Load the Memphis Stata extract and list its column names as the cell output.
memphis_data = pd.read_stata('clean_Memphis_merge_081319.dta')
pd.Series(memphis_data.columns).values

array(['trtid10', 'hinc_00', 'pop_00', 'nhwhite_00', 'nhblk_00',
       'asian_00', 'hisp_00', 'hh_00', 'hu_00', 'ohu_00', 'rhu_00',
       'per_nonwhite_00', 'per_nhblk_00', 'per_hisp_00', 'per_asian_00',
       'col_00', 'per_col_00', 'per_carcommute_00', 'per_rent_00',
       '_merge_00', 'mrent_90', 'mhval_90', 'rentocc_90', 'ownocc_90',
       '_merge_90', 'pop_90', 'nhwhite_90', 'nhblk_90', 'asian_90',
       'hisp_90', 'hh_90', 'hinc_90', 'hu_90', 'ohu_90', 'rhu_90',
       'per_nonwhite_90', 'per_nhblk_90', 'per_hisp_90', 'per_asian_90',
       'col_90', 'per_col_90', 'per_carcommute_90', 'per_rent_90',
       'units_pre50_90', 'per_units_pre50_90', '_merge_17', 'pop_17',
       'pop_17_se', 'nhwhite_17', 'hh_17', 'hinc_17', 'hinc_17_se',
       'hu_17', 'ohu_17', 'rhu_17', 'hu_17_se', 'rhu_17_se', 'mrent_17',
       'mrent_17_se', 'mhval_17', 'mhval_17_se', 'per_nonwhite_17',
       'per_nhblk_17', 'per_hisp_17', 'per_asian_17', 'col_17',
       'per_col_17', 'per_carcommute_1

In [None]:
# NY feature column names for the 2000 and 1990 data. The two lists appear to be
# paired position by position (stack_data asserts only that their lengths match).
ny_00_cols = ["TOD", "downtown", "per_asian_00", "mhval00", "empd02", "hh00", "hinc00", "hu_00", "per_black_00", "per_built_90_00", "per_car_commute_00", "per_col00", "per_hhwchild_00", "per_latino_00", "per_nonwhite00", "per_owners_00", "per_rent00", "per_units_pre50", "pop00"]
ny_90_cols = ["TOD", "downtown", "per_asian_90", "mhval90", "empd02", "hh_90", "hinc90", "hu_90", "per_black_90", "per_built_80_90", "per_car_commute_90", "per_col90", "per_hhwchild_90", "per_latino_90", "per_nonwhite90", "per_owners_90", "per_rent90", "per_units_pre50", "pop90"]

# Keeping track of which variables can and cannot be found in the Memphis dataset.
sf_00_cols = ["tod", "emp_density00", "per_built90_00", "per_hhwchild_00"]
sf_90_cols = ["tod",  "emp_density90", "per_built80_90", "per_hhwchild_90"]

# Memphis feature column names for 2000 and 1990.
# NOTE(review): several names here (e.g. "hinc00", "pop00", "mhval90") are spelled
# differently from the Memphis columns printed earlier in the notebook
# ('hinc_00', 'pop_00', 'mhval_90'), and "hhwchild_00" / "hhwchild90" are not
# spelled consistently with each other — confirm against the real column names.
mem_00_cols = ['per_nonwhite_00',"downtown","per_asian_00","mhval00","hh00", "hinc00", "per_hu_00", "per_nhblk_00", "per_carcommute_00", "per_col_00","hhwchild_00", "per_hisp_00", "per_own_00", "per_rent_00", "pop00",'per_units_pre50_00']
mem_90_cols = ['per_nonwhite_90',"downtown","per_asian_90","mhval90", "hh90", "hinc90", "per_hu_90", "per_nhblk_90", "per_carcommute_90","per_col_90", "hhwchild90","per_hisp_90", "per_own_90", "per_rent_90", "pop90",'per_units_pre50_90' ]


In [None]:
# Feature sheets of the NY workbook, one frame per decade.
ny_00_df = pd.read_excel("UDP_NY_2016_7.13.19_selectedNYNJcounties.xlsx", sheet_name="2000 data")
ny_90_df = pd.read_excel("UDP_NY_2016_7.13.19_selectedNYNJcounties.xlsx", sheet_name="1990 data")

# Feature sheets of the SF workbook, one frame per decade.
sf_00_df = pd.read_excel("UDP_SF_2015_6.6.19.xlsx", sheet_name="2000 data")
sf_90_df = pd.read_excel("UDP_SF_2015_6.6.19.xlsx", sheet_name="1990 data")

# Memphis features. The matching target table is not loaded: the line below is
# commented out, so `to_predict_mem` is never assigned in this cell.
mem_df = pd.read_stata('clean_Memphis_merge_081319.dta')
#to_predict_mem = pd.read_stata('clean_Memphis_merge_081319.dta')


# Target ("to predict") sheet of the NY workbook.
to_predict_ny = pd.read_excel("UDP_NY_2016_7.13.19_selectedNYNJcounties.xlsx", sheet_name="to predict")

# Keep each pair of column lists below in the same order.
# NOTE(review): predict_ny_00_cols mixes "00_16" and "00_15" suffixes — confirm
# these match the sheet's actual column names.
predict_ny_90_cols = ["gent90_00", "Disp_index_90_00_count", "Disp_index_90_00_pctch_count", "Disp_index_90_00_pctch_per"]
predict_ny_00_cols = ["gent00_16", "Disp_index_00_16_count", "Disp_index_00_15_pctch_count", "Disp_index_00_15_pctch_per"]

predict_sf_90_cols = ["gent90_00_v2", "Disp_index_90_00_count", "Disp_index_90_00_pctch_count", "Disp_index_90_00_pctch_per"]
predict_sf_00_cols = ["gent00_15_v2", "Disp_index_00_15_count", "Disp_index_00_15_pctch_count", "Disp_index_00_15_pctch_per"]

# What the Memphis target lists would look like in theory; there is currently
# no Memphis target table to use them with (nothing to predict on).
predict_mem_90_cols = ["gent90_00", "Disp_index_90_00_count", "Disp_index_90_00_pctch_count", "Disp_index_90_00_pctch_per"]
predict_mem_00_cols = ["gent00_16", "Disp_index_00_16_count", "Disp_index_00_15_pctch_count", "Disp_index_00_15_pctch_per"]

# Target ("to predict") sheet of the SF workbook.
to_predict_sf = pd.read_excel("UDP_SF_2015_6.6.19.xlsx", sheet_name="to predict")
#predict_ny_00_cols = ["gent00_15_v2", "Disp_index_90_00_count", "Disp_index_90_00_pctch_count", "Disp_index_90_00_pctch_per", "Disp_index_00_15_count", "Disp_index_00_15_pctch_count", "Disp_index_00_15_pctch_per", "Gent_index_90_00", "Gent_index_00_15"]



In [None]:
# What this WOULD look like for Memphis. The Memphis target table
# (`to_predict_mem`) is not loaded yet — its read is commented out in the
# loading cell — so the call is guarded to keep Restart & Run All from
# stopping here with a NameError.
if "to_predict_mem" in globals():
    run_single_geography_model(mem_df, to_predict_mem, mem_90_cols, predict_mem_90_cols, "1990s Memphis")
else:
    print("Skipping 1990s Memphis model: `to_predict_mem` has not been loaded.")

In [None]:
# Re-reads the same Stata file that an earlier cell already loaded into `memphis_data`.
memphis_data = pd.read_stata('clean_Memphis_merge_081319.dta')

In [None]:
# Load the typology input CSV; `mem_type` is not used in any cell visible here.
mem_type = pd.read_csv('typology_input.csv')

In [None]:
# Load the Memphis typology Stata file; `memphis` is not used in any cell visible here.
memphis = pd.read_stata('memphis_typology_081319.dta')

In [None]:
# No sheet_name is given, so pandas reads its default sheet; the result is not
# assigned and is only shown as the cell output.
pd.read_excel('UDP_NY_2016_7.13.19_selectedNYNJcounties.xlsx')