In [75]:
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import numpy as np

In [195]:
out_df = pd.DataFrame(columns=['Year', 'Total', 'ModFIPS'])

In [210]:
pop = pd.read_csv('population.csv',sep=',', dtype={'Id2': 'object'})
grouped = pop.groupby('Id2')

In [231]:
for county, group in grouped:
    ada = grouped.get_group(county)
    data = ada[['Year', 'Total']].dropna().astype('float')
    if data.index.size == 0:
        print('missing data: ' + county)
        continue
    model = smf.ols(formula="Total ~ Year", data=data)
    result = model.fit()
    future_years = pd.DataFrame(np.arange(2016,2026).astype('float'), columns=['Year'])
    sm.add_constant(future_years)
    new_data = pd.DataFrame({ 'Total': result.predict(future_years), 'Year': future_years['Year'],
               'ModFIPS': np.repeat(county, 10)  })
    existing_data = pd.DataFrame({'Total': data['Total'], 'Year': data['Year'],
                              'ModFIPS': np.repeat(county, data['Year'].size)})
    out_df = out_df.append(existing_data)
    out_df = out_df.append(new_data)

missing data: Id2


In [232]:
out_df['Year'] = out_df['Year'].astype('int')
out_df.to_csv('predicted_population.csv', columns=['Total','Year','ModFIPS'], index=False)
predicted_population = out_df

In [190]:
water = pd.read_csv('water.csv',sep=',', dtype={'ModFIPS': 'object', 'FIPS': 'int'})

In [177]:
grouped = water.groupby('FIPS')
out_df = pd.DataFrame(columns=['Water', 'Year', 'FIPS'])
missing = 0
for county, ada in grouped:
    ada = grouped.get_group(county)
    data = ada[['YEAR', 'TO-WTotl']].dropna().astype('float')
    data.columns = ['Year', 'Water']
    model = smf.ols(formula="Water ~ Year", data=data)
    result = model.fit()
    future_years = pd.DataFrame(np.arange(2016,2026).astype('float'), columns=['Year'])
    sm.add_constant(future_years)
    new_data = pd.DataFrame({ 'Water': result.predict(future_years), 'Year': future_years['Year'],
           'FIPS': np.repeat(county.astype('int'), 10)  })
    w = np.array(data['Water'])
    y = np.array(data['Year'])
    nan = np.nan
    if w.size != 3:
        missing = missing + 1
        continue
        
    existing_data = pd.DataFrame({
        'Water': [w[0], nan, nan, nan, nan, w[1], nan, nan, nan, nan, w[2]] },
                             index=[y[0], 2001, 2002, 2003, 2004, y[1], 2006, 2007, 2008, 2009, y[2]])
    existing_data = existing_data.interpolate(method='index')
    existing_data = pd.DataFrame({ 'Water': existing_data['Water'], 'Year': existing_data.index,
                             'FIPS': np.repeat(county.astype('int'), 11) })
    out_df = out_df.append(existing_data)
    out_df = out_df.append(new_data)

In [182]:
out_df['FIPS'] = out_df['FIPS'].astype('int')
out_df['Year'] = out_df['Year'].astype('int')

In [193]:
mod_fips = pd.DataFrame({'FIPS': water['FIPS'], 'ModFIPS': water['ModFIPS']})
out_with_mod_fips = out_df.join(mod_fips.set_index('FIPS'), on='FIPS')

In [194]:
out_with_mod_fips.to_csv('predicted_water.csv', columns=['Water','Year','FIPS', 'ModFIPS'], index=False)


Unnamed: 0,ModFIPS,Total,Year,FIPS,Water
0,01003,160354.000000,2005,1003.0,69.010000
1,01003,160354.000000,2005,1003.0,69.010000
2,01003,160354.000000,2005,1003.0,69.010000
3,01003,169162.000000,2006,1003.0,68.080000
4,01003,169162.000000,2006,1003.0,68.080000
5,01003,169162.000000,2006,1003.0,68.080000
6,01003,171769.000000,2007,1003.0,67.150000
7,01003,171769.000000,2007,1003.0,67.150000
8,01003,171769.000000,2007,1003.0,67.150000
9,01003,174439.000000,2008,1003.0,66.220000
