In [172]:
import pandas as pd
import numpy as np

In [173]:
pd.set_option('display.max_rows', 900)
pd.set_option('display.max_columns', 900)
pd.set_option('display.width', 850)

In [174]:
def get_start_stop_indices_state(df):
    state_idxs = df['STATE'].str.findall(r'\w\w')

    # build list of start and stop indices for states
    state_start_end_indices = []
    for s in state_idxs.items():
        if s[1]:
            state_start_end_indices.append(s[0])

    # create list of (start,end) tuples
    l_of_start_end_tuples = []
    for i in range(len(state_start_end_indices)-1):
        l_of_start_end_tuples.append((state_start_end_indices[i], state_start_end_indices[i+1]))
        
    # need to account for last index of state and add last tuple
    l_of_start_end_tuples.append((l_of_start_end_tuples[-1][1],len(state_idxs)))  #######
    
    return l_of_start_end_tuples  

In [175]:
def get_start_stop_indices_counties_in_state(state_df):
    #print('IN GET_START_STOP_INDICES_COUNTIES_IN_STATE()')
    #print("state_df:",state_df)
    cty_idxs = state_df['County_name'].str.findall(r'(\w+\s?\w.*)')
    #print("cty_idxs:",cty_idxs)
    
    n_total_rows_for_all_counties = len(list(cty_idxs))
    #print("n_total_rows_for_all_counties:", n_total_rows_for_all_counties)
    
    cty_start_end_indices = []
    for c in cty_idxs.items():
        if c[1]:
            cty_start_end_indices.append(c[0])

    n_counties = len(cty_start_end_indices)
    #print("n_counties:",n_counties)

    # create list of (start,end) tuples
    l_of_start_end_tuples = []
    for i in range(n_counties-1):    
        l_of_start_end_tuples.append((cty_start_end_indices[i], cty_start_end_indices[i+1]))
        #print("l_of_start_end_tuples:", l_of_start_end_tuples)

    #print("l_of_start_end_tuples:", l_of_start_end_tuples)
    l_of_start_end_tuples.append( (l_of_start_end_tuples[-1][1], n_total_rows_for_all_counties) )

    return l_of_start_end_tuples 

In [176]:
df = pd.read_csv("data/homes_example.csv")
df = df.replace(np.nan, '', regex=True)
df = df.drop(columns=['Count of FACID'])     # TODO add back in later

# fill forward blank spaces in ratings column with previous value
col = df['Comp5star']
col = col.replace(r'', np.nan, regex=True)
col = col.fillna(method='ffill')
col = col.astype(int)
df['Comp5star'] = col

df

Unnamed: 0,STATE,County_name,Comp5star,PROVNUM,PROVNAME,Sum of MDScensus_avg,Sum of Beds_Avail
0,AK,Anchorage,4,25018,A,10,16
1,,,5,25025,B,11,17
2,,,5,25036,C,12,18
3,,Fairbanks North Star,3,25020,D,13,19
4,WY,Laramie,1,535013,E,17,28
5,,,1,535025,F,19,77
6,,,5,535032,G,21,14
7,,,5,535044,H,23,56
8,,Natrona,4,535024,K,45,56
9,,,4,535049,L,93,26


In [177]:
def add_new_columns(state_df):
    state_df['sum_r12'] = ''
    state_df['sum_r345'] = ''
    state_df['relocate'] = ''
    return state_df

In [178]:
def process_county(county_df):
    # check if rating of 1 or 2 does not exist first
    ratings = county_df['Comp5star']
    r = ratings[ratings != '']
    #print("ratings:\n",r)
    
    if (1 not in r.values) and (2 not in r.values):
        #print("NO 1 and NO 2")
        county_df.loc[0:0,'relocate'] = 'N/A'
        return county_df
    else:
        print("PROCESSING COUNTY:",county_df['County_name'][0])
        
        # ------- get indicies of '1s and 2s'
        i_1_2 = ratings.isin([1,2])
        i_1_2 = i_1_2[i_1_2].index
                
        sum_1_2 = 0
        for i in i_1_2:
            print("idx_of_1_or_2:", i, "->", county_df['Sum of MDScensus_avg'][i])
            sum_1_2 =  sum_1_2 + county_df['Sum of MDScensus_avg'][i]
            
        print("sum of 1s & 2s:", sum_1_2)
        # insert sum into same line as name of the county
        county_df['sum_r12'][0] = sum_1_2
        
        
        # ------ get indicies of '3s, 4s, and 5s'
        i_3_4_5 = ratings.isin([3,4,5])
        i_3_4_5 = i_3_4_5[i_3_4_5].index
                
        sum_3_4_5 = 0
        for i in i_3_4_5:
            print("idx_of_3_or_4_or_5:", i, "->", county_df['Sum of Beds_Avail'][i])
            sum_3_4_5 =  sum_3_4_5 + county_df['Sum of Beds_Avail'][i]
            
        print("sum of 3s & 4s & 5s:", sum_3_4_5)
        # insert sum into same line as name of the county
        county_df['sum_r345'][0] = sum_3_4_5
        
        # enough beds available, populate relocate column
        if sum_3_4_5 >= sum_1_2:
            county_df['relocate'][0] = 'RELOCATE'
            # TODO - include provname and provnum
        else:
            county_df['relocate'][0] = 'NO_SPACE'
        
        
        print("------------------------------------------------")
        return county_df

In [179]:
state_idxs = get_start_stop_indices_state(df)
print("state start-end indicies:",state_idxs)
print()

l_df_out = []
for idx in state_idxs:
    start = idx[0]
    end   = idx[1]
    #print("////////////////////////////////////////////////////////////////////////////////////////////\n")
    state_df = (df[start:end]).reset_index(drop=True)
    state_df = add_new_columns(state_df)
    #print(state_df)
    #print("============================================================================================\n")
    
    counties_idxs = get_start_stop_indices_counties_in_state(state_df)
    #print("\tcounty start-end indicies:", counties_idxs)
    for idx in counties_idxs:
        start  = idx[0]
        end    = idx[1]
        county_df = (state_df[start:end]).reset_index(drop=True)
        county = process_county(county_df)
        
        l_df_out.append(county)

# rebuild original dataframe with new columns and write out to disk
df_out = (pd.concat(l_df_out)).reset_index(drop=True)
print(df_out.to_string())
df_out.to_csv("df_out.csv", index=False)


state start-end indicies: [(0, 4), (4, 10), (10, 18), (18, 22)]

PROCESSING COUNTY: Laramie
idx_of_1_or_2: 0 -> 17
idx_of_1_or_2: 1 -> 19
sum of 1s & 2s: 36
idx_of_3_or_4_or_5: 2 -> 14
idx_of_3_or_4_or_5: 3 -> 56
sum of 3s & 4s & 5s: 70
------------------------------------------------
PROCESSING COUNTY: Chittenden
idx_of_1_or_2: 0 -> 123
idx_of_1_or_2: 1 -> 45
sum of 1s & 2s: 168
idx_of_3_or_4_or_5: 2 -> 57
idx_of_3_or_4_or_5: 3 -> 12
idx_of_3_or_4_or_5: 4 -> 3
sum of 3s & 4s & 5s: 72
------------------------------------------------
PROCESSING COUNTY: Franklin
idx_of_1_or_2: 0 -> 77
sum of 1s & 2s: 77
idx_of_3_or_4_or_5: 1 -> 55
idx_of_3_or_4_or_5: 2 -> 66
sum of 3s & 4s & 5s: 121
------------------------------------------------
PROCESSING COUNTY: Apple
idx_of_1_or_2: 0 -> 10
idx_of_1_or_2: 1 -> 11
idx_of_1_or_2: 2 -> 12
sum of 1s & 2s: 33
sum of 3s & 4s & 5s: 0
------------------------------------------------
   STATE           County_name  Comp5star  PROVNUM PROVNAME  Sum of MDScensu

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
