In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
import pickle

In [2]:
def feature_select(input_df):
    """Return only the columns used by the prediction pipeline.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing at least every column named below.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the named columns, in the order listed.

    Raises
    ------
    KeyError
        If any named column is missing from ``input_df``.
    """
    selected_columns = [
        # area-level attributes and state / banner indicator columns
        'MedianIncome', 'MedianAge', 'Family', 'Nonfamily',
        'Householdsize0', 'Householdsize2',
        'Pop Age 15+: Married', 'Pop Age 15+: Nevermarried',
        'Pop Age 15+: Divorced',
        "Pop Age 25+: Bachelor's Degree", 'Pop Age 25 Grad Professional',
        'Adult', 'Blackpop', 'Whitepop', 'Hispanicpop', 'Mobility', 'AREA',
        'Population_Density', 'Female_density', 'GDP',
        'Is_Florida', 'Is_Tennessee', 'Is_Alabama',
        'TJ.Maxx', 'Marshalls', 'HomeGoods',
        # "<retailer>_3m" and "<retailer>_visits_3m" columns
        "Lowe's_3m", 'The Home Depot_3m', 'Walmart_3m', 'Burlington_3m',
        "Kohl's_3m", 'Ross Dress for Less_3m',
        "Lowe's_visits_3m", 'The Home Depot_visits_3m', 'Walmart_visits_3m',
        'Burlington_visits_3m', "Kohl's_visits_3m",
        'Ross Dress for Less_visits_3m',
        # "<retailer>_1m" and "<retailer>_visits_1m" columns
        "Lowe's_1m", 'The Home Depot_1m', 'Burlington_1m', "Kohl's_1m",
        'Ross Dress for Less_1m',
        "Lowe's_visits_1m", 'The Home Depot_visits_1m',
        'Burlington_visits_1m', "Kohl's_visits_1m",
        'Ross Dress for Less_visits_1m',
        # grocery "<chain>_3m" and "<chain>_visits_3m" columns
        "Aldi's_3m", 'Publix_3m', 'The Fresh Market_3m',
        'Whole Foods Market_3m', 'Winn-Dixie_3m',
        "Aldi's_visits_3m", 'Publix_visits_3m', 'The Fresh Market_visits_3m',
        'Whole Foods Market_visits_3m', 'Winn-Dixie_visits_3m',
        'num_star',
        # the three banners' own "_3m" / "_visits_3m" columns
        'tjmaxx_3m', 'Marshalls_3m', 'Homegoods_3m',
        'tjmaxx_visits_3m', 'Marshalls_visits_3m', 'Homegoods_visits_3m',
    ]
    return input_df[selected_columns]

########################################################################################
def winsorize(input_df):
    """Clip every column of ``input_df`` to mean +/- 3 standard deviations.

    The per-column mean and standard deviation are read from ``mean.pkl`` and
    ``std.pkl`` in the current working directory. Each pickle must hold a
    label-indexed object (e.g. a pandas Series) that includes a
    ``'Population'`` entry, which is dropped, plus one entry per column of
    ``input_df``.

    The previous implementation built its bounds around each value itself
    (``value +/- 3*std``), so no value was ever clipped and the loaded means
    were unused. The bounds are now centred on the stored mean.
    NOTE(review): if the model was fitted on data that went through the old
    no-op version, confirm that clipping at inference time is what you want.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Numeric frame to winsorize. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``input_df``.

    Raises
    ------
    FileNotFoundError
        If either pickle file is missing.
    KeyError
        If ``'Population'`` is absent from a pickle, or a column of
        ``input_df`` has no stored mean / standard deviation.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only load statistics files that come from a trusted source.
    with open('mean.pkl', 'rb') as file:
        df_mean = pickle.load(file).drop(labels=['Population'])
    with open('std.pkl', 'rb') as file:
        df_std = pickle.load(file).drop(labels=['Population'])

    # Restrict the statistics to the incoming columns (KeyError if one is missing).
    columns = input_df.columns
    center = df_mean.loc[columns]
    spread = 3 * df_std.loc[columns]

    # axis=1 aligns the per-column bounds with the frame's column labels.
    return input_df.clip(lower=center - spread, upper=center + spread, axis=1)

########################################################################################
def min_max(df):
    """Rescale ``df`` column-wise with stored minimum and maximum values.

    The minimum and maximum are read from ``min.pkl`` and ``max.pkl`` in the
    current working directory. Each pickle must hold a label-indexed object
    (e.g. a pandas Series) that includes a ``'Population'`` entry, which is
    dropped before use.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``(df - min) / (max - min)``, aligned on column labels. There is no
        guard for ``max == min``; such columns divide by zero.

    Raises
    ------
    FileNotFoundError
        If either pickle file is missing.
    KeyError
        If ``'Population'`` is absent from a pickle.
    """
    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only load statistics files that come from a trusted source.
    # `with` guarantees the handle is closed even if loading or drop() raises.
    with open('max.pkl', 'rb') as file:
        df_max = pickle.load(file).drop(labels=['Population'])
    with open('min.pkl', 'rb') as file:
        df_min = pickle.load(file).drop(labels=['Population'])

    output_df = (df - df_min) / (df_max - df_min)
    return output_df



In [3]:
def model_output(df):
    """Run the full preprocessing pipeline on ``df`` and return predictions.

    The frame goes through ``feature_select``, then ``winsorize``, then
    ``min_max``. The result is passed to the ``predict`` method of the object
    unpickled from ``rf.pkl`` in the current working directory. The model is
    re-read from disk on every call.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame containing every column ``feature_select`` needs.

    Returns
    -------
    Whatever ``rf.predict`` returns for the preprocessed frame.

    Raises
    ------
    FileNotFoundError
        If ``rf.pkl`` is missing (or a statistics pickle read by the
        preprocessing steps is missing).
    """
    preprocessed_df = min_max(winsorize(feature_select(df)))

    # SECURITY: pickle.load executes arbitrary code embedded in the file;
    # only load a model file that comes from a trusted source.
    # `with` guarantees the handle is closed even if unpickling raises.
    with open('rf.pkl', 'rb') as file:
        rf = pickle.load(file)

    return rf.predict(preprocessed_df)

In [4]:
# Load the input data from the working directory. The column pandas labels
# 'Unnamed: 0' becomes the row index (presumably an index saved by an earlier
# to_csv call -- confirm).
df = pd.read_csv(
    'dataset.csv',
    index_col='Unnamed: 0',
)

In [5]:
# Predict for TJ.Maxx, using the banner indicator columns exactly as loaded.
# NOTE(review): presumably dataset.csv already has TJ.Maxx=1, Marshalls=0 and
# HomeGoods=0 on every row -- confirm against the data.
# This cell must run on a freshly loaded `df`: later cells overwrite the
# 'TJ.Maxx' / 'Marshalls' / 'HomeGoods' columns in place.
tjmaxx_out = model_output(df)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
# Predict for Marshalls.
# Set all three banner indicators explicitly so the scenario is a clean
# one-hot (Marshalls only) no matter what the columns held before this cell
# ran; previously 'HomeGoods' was left at whatever value it already had.
df['TJ.Maxx'] = 0
df['Marshalls'] = 1
df['HomeGoods'] = 0
marshall_out = model_output(df)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# Predict for HomeGoods.
# Set all three banner indicators explicitly so this cell does not rely on an
# earlier cell having already zeroed 'TJ.Maxx'; the scenario is a clean
# one-hot (HomeGoods only) regardless of execution order.
df['TJ.Maxx'] = 0
df['Marshalls'] = 0
df['HomeGoods'] = 1
homegoods_out = model_output(df)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Overwrite each banner column with that banner's predictions so the cells
# below can rank rows by it. After this cell the three columns no longer hold
# 0/1 indicators, so reload `df` before re-running any prediction cell.
predictions_by_banner = {
    'TJ.Maxx': tjmaxx_out,
    'Marshalls': marshall_out,
    'HomeGoods': homegoods_out,
}
for banner_column, predicted in predictions_by_banner.items():
    df[banner_column] = predicted

In [12]:
# Alabama: keep the rows flagged Is_Alabama, then take the top rows per banner
# column and tag each selection with one-hot is_* columns.
alabama = df[df['Is_Alabama'] == 1]

# TJ.Maxx -- 5 stores
ala_tj = alabama.nlargest(5, columns='TJ.Maxx').assign(
    is_TJMaxx=1, is_Marshalls=0, is_HomeGoods=0
)

# Marshalls -- 3 stores
ala_ma = alabama.nlargest(3, columns='Marshalls').assign(
    is_TJMaxx=0, is_Marshalls=1, is_HomeGoods=0
)

# HomeGoods -- 3 stores
ala_hg = alabama.nlargest(3, columns='HomeGoods').assign(
    is_TJMaxx=0, is_Marshalls=0, is_HomeGoods=1
)

In [13]:
# Florida: keep the rows flagged Is_Florida, then take the top rows per banner
# column and tag each selection with one-hot is_* columns.
florida = df[df['Is_Florida'] == 1]

# TJ.Maxx -- 10 stores
fl_tj = florida.nlargest(10, columns='TJ.Maxx').assign(
    is_TJMaxx=1, is_Marshalls=0, is_HomeGoods=0
)

# Marshalls -- 10 stores
fl_ma = florida.nlargest(10, columns='Marshalls').assign(
    is_TJMaxx=0, is_Marshalls=1, is_HomeGoods=0
)

# HomeGoods -- 10 stores
fl_hg = florida.nlargest(10, columns='HomeGoods').assign(
    is_TJMaxx=0, is_Marshalls=0, is_HomeGoods=1
)

In [14]:
# Tennessee: keep the rows flagged Is_Tennessee, then take the top rows per
# banner column and tag each selection with one-hot is_* columns.
tennessee = df[df['Is_Tennessee'] == 1]

# TJ.Maxx -- 5 stores
tn_tj = tennessee.nlargest(5, columns='TJ.Maxx').assign(
    is_TJMaxx=1, is_Marshalls=0, is_HomeGoods=0
)

# Marshalls -- 3 stores
tn_ma = tennessee.nlargest(3, columns='Marshalls').assign(
    is_TJMaxx=0, is_Marshalls=1, is_HomeGoods=0
)

# HomeGoods -- 3 stores
tn_hg = tennessee.nlargest(3, columns='HomeGoods').assign(
    is_TJMaxx=0, is_Marshalls=0, is_HomeGoods=1
)

In [15]:
# Stack the nine per-state, per-banner selections row-wise into one frame.
# Rows keep their original index labels; a row picked for several banners
# appears once per selection.
dfList = [
    ala_tj, ala_ma, ala_hg,   # Alabama
    fl_tj, fl_ma, fl_hg,      # Florida
    tn_tj, tn_ma, tn_hg,      # Tennessee
]
new_df = pd.concat(dfList)

In [16]:
# Write the combined selections to the working directory, the same place
# 'dataset.csv' and the pickles are read from. The previous hard-coded
# absolute path under one user's home directory fails on any other machine.
new_df.to_csv("new_dataset.csv")