# Redoing Classification as Regression (with more data pre-processing in the mix)

Rather than framing this as a classification problem over discrete classes, we can recast it as a regression problem that attempts to predict GPS coordinates.

This is how we'll do that:

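1. Extract the GPS centroid of each administrative area (the polygons in the ADM4 shapefile, matched on district and sub-county name)
2. Use the x and y coordinates of those centroids as the targets of a regression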




In [3]:
import pandas as pd
import geopandas as gpd
import sys
sys.path.append("../classifier")
from scripts.data_processor import DataProcessor, process_data
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Lasso, LassoCV, MultiTaskLassoCV

from yellowbrick.model_selection import LearningCurve


In [5]:
# Read the processed names table from its feather file into a DataFrame.
names = pd.read_feather(path="../data/processed/after_stata_comp_region_gaul.feather")


In [33]:
# Renames applied to the shapefile's ADM2_EN column before it is merged
# against the `district` column of the names data.
# NOTE(review): 'Mbarara' -> 'Kabale' looks out of place next to the other
# entries mapped to Kabale — confirm this mapping is intended.
adm2_en_name_change = {'Kwania' : 'Apac',
                       'Madi Okollo' : 'Arua',
                       'Omoro' : 'Gulu',
                       'Kikuube' : 'Hoima',
                       'Bugweri' : 'Iganga',
                       'Mbarara' : 'Kabale',
                       'Rubanda' : 'Kabale',
                       'Rukiga' : 'Kabale',
                       'Bunyangabu' : 'Kabarole',
                       'Namisindwa' : 'Manafwa',
                       'Kazo' : 'Kiruhura',
                       'Karenga' : 'Kaabong',
                       'Kapelebyong' : 'Amuria'}

# Renames applied to the shapefile's ADM4_EN column before it is merged
# against the `sub_county` column of the names data.
# NOTE(review): keys containing "Divison" ('Central Divison', 'Ishaka Divison',
# 'Mparo Divison') can never match, because that misspelling is already
# corrected by the `.str.replace` in the loading chain below.
adm4_en_name_change = {'Chawente' : 'Cawente',
                       'Agulu Division' : 'Apac Town Council',
                       'Akere Division' : 'Apac Town Council',
                       'Arocha Division' : 'Apac Town Council',
                       'Atik Division' : 'Apac Town Council',
                       'Aii-Vu' : 'Aii',
                       'Bileafe' : 'Beleafe',
                       'Arua Hill Division' : 'Arua Hill',
                       'River Oli Division' : 'Oli River',
                       'Harugale' : 'Harugali',
                       'Nduguto' : 'Ndugutu',
                       'Central Divison' : 'Central Division',
                       'Ishaka Divison' : 'Ishaka Division',
                       'Bar Dege Division' : 'Bar',
                       'Laroo Division' : 'Laroo',
                       'Layibi Division' : 'Layibi',
                       'Pece Division' : 'Pece',
                       'Mparo Divison' : 'Mparo Division',
                       'Bulamogi' : 'Bulamagi',
                       'Masese Walukuba' : 'Walukuba',
                       'Bubaare' : 'Bubare',
                       'Labori' : 'Labor',
                       'Nyamweru' : 'Nyamweeru',
                       'Masiira' : 'Masira',
                       'Bulaago' : 'Bulago',
                       'Lwajje' : 'Lwaje',
                       'Sigulu Island' : 'Sigulu Islands',
                       'Madi Opei' : 'Madi',
                       'Palabek Gem' : 'Palabek',
                       'Palabek Kal' : 'Palabek',
                       'Palabek Ogili' : 'Ogili',
                       'Wakisi Division' : 'Wakisi',
                       'Ssi-Bukunja' : "Ssi",
                       'Kabweza Kyegegwa' : 'Kyegegwa',
                       'Oluffe' : 'Olufe',
                       'Njeru Division' : "Njeru Town Council",
                       'Nyenga Division' : 'Nyenga',
                       'Katebwa' : 'Kateebwa',
                       'Hakibaale' : 'Hakibale',
                       'Harugogo' : 'Harugongo',
                       'Kagango Division' : 'Kagango',
                       'Kabwohe Division' : 'Kabwohe',
                       'Sheema Central Division' : 'Sheema Town Council',
                       'Dokolo Tc' : 'Dokolo Town Council',
                       'Iceme' : 'Icheme',
                       'Kashozi Division' : 'Kashozi',
                       'Shuuku Town Council' : 'Shuuku',
                       'Bala' : 'Balla',
                       'Katiira' : 'Katira',
                       'Agwata' : 'Agwatta',
                       'Attiak' : 'Atiak',
                       'Pabo' : 'Pabbo',
                       'Kkingo' : 'Kingo',
                       'Lopeei' : 'Lopei',
                       'Anaka Payira' : 'Anaka',
                       'Anaka Town Council' : 'Anaka',
                       'Kito' : 'Kitto',
                       'Butalangu Town Council' : 'Nakaseke Butalangu Town Counc',
                       'Kasangombe' : 'Kaasangombe',
                       'Koch-Goma' : 'Koch',
                       'Bukokho' : 'Bukoho',
                       'Mpumudde Division' : 'Kimaka',
                       'Lira Palwo' : 'Lira',
                       'Kotomol'  : 'Kotomor',
                       'Kikaatsi' : 'Kikatsi',
                       'Acaba' : 'Achaba',
                       'Rubiriziri Town Council' : 'Rubirizi Town Council',
                       'Nakawa Division' : 'Nakawa',
                       'Karenga Town Council' : 'Karenga',
                       'Mabende' : 'Kabende',
                       'Kitholhu' : 'Kitholu',
                       'Endiinzi' : 'Endinzi',
                       'Bisheshe Division' : 'Bisheshe',
                       'Mpondwe-Lhubiriha' : 'Mpondwe',
                       'Katwe Kabatoro Town Council' : 'Lake Kabatoro Town Council',
                       'Bukwo' : 'Bukwa',
                       'Olio' : 'Serere',
                       'Pacara' : 'Pachara',
                       'Awelo' : 'Awello'}

# (ADM2_EN, current ADM4_EN, new ADM4_EN): renames that only apply inside one
# district, because the same ADM4_EN value also occurs in other districts.
adm4_en_district_overrides = [
    ('Kabale', 'Northern Division', 'Kabale Northern'),
    ('Kabale', 'Southern Division', 'Kabale Southern'),
    ('Kabale', 'Central Division', 'Kabale Central'),
    ('Koboko', 'Southern Division', 'South Division'),
    ('Kampala', 'Central Division', 'Kampala Central'),
    ('Kapchorwa', 'Eastern Division', 'East Division'),
    ('Kapchorwa', 'Western Division', 'West Division'),
]

# Load the ADM4 boundary shapefile, attach each polygon's centroid and its
# x/y coordinates, and normalise the ADM2/ADM4 names.
subcounties = (
    gpd.read_file("../data/raw/district_2020_shapefile/uga_admbnda_adm4_ubos_20200824.shp")
    # Compute the centroid once. `df.centroid` would resolve to the
    # GeoDataFrame property (recomputing the centroid on every access) rather
    # than to the new column, so the column is read with `df['centroid']`.
    .assign(centroid = lambda df: df.geometry.centroid)
    .assign(x_coord = lambda df: df['centroid'].x,
            y_coord = lambda df: df['centroid'].y,
            ADM2_EN = lambda df: df['ADM2_EN'].str.strip(),
            ADM4_EN = lambda df: (df['ADM4_EN']
                                  .str.strip()
                                  .str.replace("Divison", 'Division')
                                  .str.replace("  ", ' ')))
    .replace({'ADM2_EN' : adm2_en_name_change})
    .replace({'ADM4_EN' : adm4_en_name_change})
) # not reprojecting because it's not really changing much

# Apply the district-specific renames in listed order.
for adm2_name, adm4_old_name, adm4_new_name in adm4_en_district_overrides:
    override_mask = (subcounties['ADM2_EN'] == adm2_name) & (subcounties['ADM4_EN'] == adm4_old_name)
    subcounties.loc[override_mask, 'ADM4_EN'] = adm4_new_name


  .assign(centroid = lambda df: df.geometry.centroid,

  x_coord = lambda df: df.centroid.x,

  y_coord = lambda df: df.centroid.y,


In [34]:
# Spelling fixes applied to the names data so that its `district` and
# `sub_county` values line up with the shapefile's ADM2_EN / ADM4_EN values.
district_name_change = {'Luweero' : 'Luwero'}
subcounty_name_change = {'Jinja Central' : 'Central Division'}

# Title-case the two join columns, collapse double spaces, apply the spelling
# fixes and keep only the columns listed below.
# NOTE(review): the original comment here said the names are de-duplicated,
# but nothing in this cell removes duplicate rows — confirm whether a
# de-duplication step is missing.
processed_names = (
    names
    .assign(
        district = lambda df: df['district'].str.title().str.replace('  ', ' '),
        sub_county = lambda df: df['sub_county'].str.title().str.replace('  ', ' '),
    )
    .replace({'district' : district_name_change,
              'sub_county' : subcounty_name_change})
    [['district', 'sub_county', 'constituency', 'surname', 'region_gaul_comp', 'gaul']]
)

In [35]:
# Outer-join the shapefile rows to the names rows on (district, sub-county).
# `indicator=True` adds a `_merge` column recording which side each row came from.
merged_geo_df = subcounties.merge(
    processed_names,
    how='outer',
    left_on=['ADM2_EN', 'ADM4_EN'],
    right_on=['district', 'sub_county'],
    indicator=True,
)

In [36]:
# How many rows matched on both sides versus only one side of the merge.
merged_geo_df['_merge'].value_counts()

both          12822026
right_only     1835031
left_only          310
Name: _merge, dtype: int64

In [32]:
# Shapefile rows with no match in the names data, narrowed to those whose
# ADM4_EN contains 'Acowa'.
(
    merged_geo_df
    .loc[lambda df: df['_merge'] == 'left_only', ['ADM1_EN', 'ADM2_EN', 'ADM3_EN', 'ADM4_EN']]
    .loc[lambda df: df['ADM4_EN'].str.contains('Acowa')]
)

Unnamed: 0,ADM1_EN,ADM2_EN,ADM3_EN,ADM4_EN
123280,Eastern,Kapelebyong,Kapelebyong,Acowa


In [18]:
# Distinct (district, sub_county, constituency) combinations from the names
# data that found no matching shapefile row.
(
    merged_geo_df
    .loc[lambda df: df['_merge'] == 'right_only', ['district', 'sub_county', 'constituency']]
    .drop_duplicates()
)

Unnamed: 0,district,sub_county,constituency
12779738,Adjumani,Pachara,ADJUMANI WEST COUNTY
12783483,Amolatar,Awello,KIOGA COUNTY
12788520,Amuria,Acowa,KAPELEBYONG COUNTY
12797176,Amuria,Okungur,KAPELEBYONG COUNTY
12802160,Amuria,Kapelebyong,KAPELEBYONG COUNTY
...,...,...,...
14477395,Wakiso,Katabi,BUSIRO COUNTY SOUTH
14529161,Wakiso,Ssisa,BUSIRO COUNTY SOUTH
14576735,Wakiso,Masuliita Town Council,BUSIRO COUNTY NORTH
14583136,Wakiso,Nangabo,KYADONDO COUNTY EAST


In [10]:
# Run the merged frame through `process_data` (keyed on 'surname') and keep
# only the model input and the two coordinate targets.
# NOTE(review): 'freq' is selected here but is not created by any cell
# visible in this notebook — on a fresh kernel this selection would raise a
# KeyError. Confirm where 'freq' is supposed to come from.
df = process_data(
    merged_geo_df[['geometry', 'district', 'surname', 'freq', 'centroid', 'x_coord', 'y_coord']],
    'surname',
)[['surname', 'x_coord', 'y_coord']]

  df[features]


In [11]:
# Character n-gram (1-3, word-boundary aware) TF-IDF features built from the
# surname strings; case is preserved and n-grams seen in fewer than 3
# documents are dropped.
feature_pipeline = Pipeline(
    [
        ('tfidf', TfidfVectorizer(ngram_range = (1,3), analyzer='char_wb', lowercase=False, min_df=3)),
    ]
)

# Not used by the regression cells visible in this notebook; kept in case a
# later cell still refers to it.
labels = LabelEncoder()

# The outer merge leaves rows that lack a surname (shapefile-only rows) or
# coordinates (names-only rows); neither the vectorizer nor the regressor
# accepts missing values. `process_data` may already drop them, in which case
# this is a no-op.
df_complete = df.dropna(subset=['surname', 'x_coord', 'y_coord'])

# Seeded so the sample (and everything fitted on it) is reproducible, and
# capped at the frame length so a small `df` does not raise.
df_sample = df_complete.sample(n=min(10000, len(df_complete)), random_state=8)

X_train, X_test, y_train, y_test = train_test_split(
    df_sample['surname'], df_sample[['x_coord', 'y_coord']], random_state=8, test_size=.3
)

# Fit the vectorizer on the training surnames only, then reuse it for the test set.
X_train = feature_pipeline.fit_transform(X_train)
X_test = feature_pipeline.transform(X_test)


In [12]:
# Fit one cross-validated multi-task Lasso on both coordinate targets at once.
# The sparse TF-IDF matrix is converted to a dense array before fitting.
X_train_dense = X_train.toarray()
model = MultiTaskLassoCV().fit(X_train_dense, y_train)