In [1]:
import numpy as np
import pandas as pd
from is_slum2 import get_slum_val
import matplotlib.pyplot as plt

In [2]:
%%capture
from tqdm.notebook import tqdm
# Register `progress_apply` / `progress_map` on pandas objects.
# `pandas` is a classmethod, so it is called on the class itself; building a
# throwaway `tqdm()` instance first would create a progress bar that is never
# closed.
tqdm.pandas()

In [3]:
# Cities that ship as a single cleaned CSV.
rio = pd.read_csv('../data/rio/cleaned_2.csv')
hyderabad = pd.read_csv('../data/hyderabad/cleaned.csv')
chennai = pd.read_csv('../data/chennai/cleaned.csv')

# Mumbai is split across two CSVs (housing.com and 99acres.com).
mumbai1 = pd.read_csv('../data/mumbai/cleaned_housing.com.csv')
mumbai2 = pd.read_csv('../data/mumbai/cleaned_99acres.com.csv')

# Stack the two Mumbai tables on a shared set of columns: the 99acres price
# column is renamed to `price` first, then lat/lng are renamed to the
# latitude/longitude names that Prepare() reads.
mumbai = pd.concat(
    [
        mumbai1[['lat', 'lng', 'price', 'coord']],
        mumbai2[['lat', 'lng', 'price_per_month', 'coord']].rename(
            columns={'price_per_month': 'price'}
        ),
    ]
).rename(columns={'lat': 'latitude', 'lng': 'longitude'})

In [8]:
def Prepare(df):
    """Fold latitude/longitude into a single ``coord`` column and de-duplicate.

    The frame is modified in place:

    * a ``coord`` column of ``(latitude, longitude)`` tuples is written,
      overwriting any ``coord`` column that already exists;
    * the ``latitude`` and ``longitude`` columns are removed;
    * the number of fully duplicated rows is printed and those rows are
      dropped (the first occurrence is kept, the index is not reset).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``latitude`` and ``longitude`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    coord_pairs = list(zip(df['latitude'], df['longitude']))
    df['coord'] = coord_pairs
    df.drop(['latitude', 'longitude'], axis=1, inplace=True)

    # Count before dropping so the message reports what is about to be removed.
    n_duplicates = df.duplicated().sum()
    print(n_duplicates, 'duplicates dropped.')
    df.drop_duplicates(inplace=True)
    return df

In [9]:
# Prepare() drops latitude/longitude in place, so this cell cannot be re-run
# on the same frame without reloading the CSV first (second run: KeyError).
rio = Prepare(rio)

137 duplicates dropped.


In [10]:
# Rebuilds `coord` from latitude/longitude (replacing the `coord` column
# carried over from the two Mumbai CSVs) and drops exact duplicate rows.
mumbai = Prepare(mumbai)

7488 duplicates dropped.


In [11]:
# Same preparation as the other cities: build `coord`, drop exact duplicates.
hyderabad = Prepare(hyderabad)

33 duplicates dropped.


### Classifying

In [12]:
def get_classes(df, city):
    """Label each row of ``df`` with the slum class of its coordinate and save it.

    ``get_slum_val`` is called once per distinct value in ``df['coord']``; the
    results are then mapped back onto every row as a new ``class`` column
    (added to ``df`` in place). The labelled frame is written to
    ``classified_{city}.csv`` in the current working directory and the
    normalized class frequencies are printed.

    Lookups are best-effort: if ``get_slum_val`` raises for a coordinate, that
    coordinate is labelled ``NaN`` and processing continues. The number of
    such failures is printed, because ``value_counts`` ignores ``NaN`` and
    they would otherwise be invisible in the summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``coord`` column of hashable, indexable pairs
        (element 0 and element 1 are passed on to ``get_slum_val``).
    city : str
        Passed to ``get_slum_val`` and used in the output file name.

    Returns
    -------
    None

    Notes
    -----
    ``progress_apply`` only exists after tqdm's pandas integration has been
    registered (``tqdm.pandas()``).
    """
    class_map = {}
    uniques = list(set(df['coord']))
    n_failed = 0
    for coord in tqdm(uniques):
        try:
            class_map[coord] = get_slum_val(city, (coord[0], coord[1]))
        except Exception:
            # Catch Exception rather than using a bare `except:` so that
            # KeyboardInterrupt / SystemExit still stop this long loop.
            class_map[coord] = np.nan
            n_failed += 1
    print(len(uniques), 'unique locations classified. mapping back to dataset...')
    if n_failed:
        print(n_failed, 'locations could not be classified and were set to NaN.')
    df['class'] = df.coord.progress_apply(lambda x: class_map[x])
    print('COMPLETE. saving to CSV...')
    df.to_csv(f'classified_{city}.csv', index=False)
    # `.values` strips the class labels; frequencies are printed in descending order.
    print('saved. value counts:', df['class'].value_counts(normalize=True).values)

In [13]:
# Adds a `class` column to `rio` in place and writes classified_rio.csv
# to the current working directory.
get_classes(rio, 'rio')

HBox(children=(FloatProgress(value=0.0, max=1617.0), HTML(value='')))


1617 unique locations classified. mapping back to dataset...


HBox(children=(FloatProgress(value=0.0, max=7541.0), HTML(value='')))


COMPLETE. saving to CSV...
saved. value counts: [0.95574163 0.04425837]


In [14]:
# Adds a `class` column to `mumbai` in place and writes classified_mumbai.csv
# to the current working directory.
get_classes(mumbai, 'mumbai')

HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))


3649 unique locations classified. mapping back to dataset...


HBox(children=(FloatProgress(value=0.0, max=25244.0), HTML(value='')))


COMPLETE. saving to CSV...
saved. value counts: [0.9603811 0.0396189]


In [15]:
# Adds a `class` column to `hyderabad` in place and writes
# classified_hyderabad.csv to the current working directory.
get_classes(hyderabad, 'hyderabad')

HBox(children=(FloatProgress(value=0.0, max=1270.0), HTML(value='')))


1270 unique locations classified. mapping back to dataset...


HBox(children=(FloatProgress(value=0.0, max=2276.0), HTML(value='')))


COMPLETE. saving to CSV...
saved. value counts: [0.97494505 0.02505495]
