# Project Phase 4: Data Mining
Group 3:
- Jason Lam
- Angus Young
- Ann Soong

In [4]:
import pandas as pd
import numpy as np

# For displaying graphs
import matplotlib.pyplot as plt

# For data preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold # https://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.impute import KNNImputer

# For data splitting
from sklearn.model_selection import train_test_split

# For model building: Decision Tree, Gradient Boosting and Random Forest
# From:
#   https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#       https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_text.html
#       https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# For model analysis
from sklearn.metrics import confusion_matrix

## Part A
Since our database is a local database, we decided to use the .csv files instead, in order to remove the limitations of requiring a local database.

In [8]:
# Get the data from .csv files
df_fact_table = pd.read_csv('clean_data/fact_table.csv')
df_country = pd.read_csv('clean_data/country.csv')
df_education = pd.read_csv('clean_data/education.csv')
df_event = pd.read_csv('clean_data/event.csv')
df_health = pd.read_csv('clean_data/health.csv')
df_month = pd.read_csv('clean_data/month.csv')
df_population = pd.read_csv('clean_data/population.csv')
df_qualityOfLife = pd.read_csv('clean_data/quality_of_life.csv')

# Country and Event both have 'Name' columns. Rename to be specific.
df_country = df_country.rename(
    columns={
        'Name': 'country_name'
    }
)
df_event = df_event.rename(
    columns={
        'Name': 'event_name'
    }
)

# Join all the data into one dataframe
df_tmp = pd.merge(df_fact_table, df_country,  how='left', left_on=['country_key'], right_on = ['Country_key'])
df_tmp = pd.merge(df_tmp, df_month,  how='left', left_on=['month_key'], right_on = ['Month_Key'])
df_tmp = pd.merge(df_tmp, df_education,  how='left', left_on=['education_key'], right_on = ['education_key'])
df_tmp = pd.merge(df_tmp, df_event,  how='left', left_on=['Event_key'], right_on = ['Event_key'])
df_tmp = pd.merge(df_tmp, df_health,  how='left', left_on=['health_key'], right_on = ['health_key'])
df_tmp = pd.merge(df_tmp, df_population,  how='left', left_on=['population_key'], right_on = ['population_key'])
df_tmp = pd.merge(df_tmp, df_qualityOfLife,  how='left', left_on=['qol_key'], right_on = ['qol_key'])
# Drop keys
df_dataset = df_tmp.drop(columns=['country_key', 'Country_key', 'month_key', 'Month_Key', 'education_key', 'Event_key', 'health_key', 'population_key', 'qol_key'])

# Dropping attributes that we feel are not relevant to our data mining
df_dataset = df_dataset.drop(columns=[
    'country_name', 'Region', 'Continent', 'Currency', 'Capital',
    'Month', 'Year', 'Quarter', 'Decade',
    'event_name', 'Start_date','End_date', 'Start_month', 'End_month'])

df_dataset.head(2446)
# df_data.isnull()

Unnamed: 0,qol,di,hdi,Total_population,Population_growth_percent,Total_labour_force,Birth_rate,Death_rate,GNI_per_capita,Income_group,...,unemployment_total,people_basic_handwashing_facilities,people_basic_handwashing_facilities_rural,people_basic_handwashing_facilities_urban,people_using_at_least_basic_sanitation_services,people_using_at_least_basic_sanitation_services_rural,people_using_at_least_basic_sanitation_services_urban,people_using_at_least_basic_drinking_water_services,people_using_at_least_basic_drinking_water_services_rural,people_using_at_least_basic_drinking_water_services_urban
0,4.0,2.0,4.0,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
1,4.0,2.0,4.0,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
2,4.0,2.0,4.0,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
3,4.0,2.0,4.0,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
4,4.0,2.0,4.0,139035505.0,1.484747,53123178.0,24.053,6.205,550.0,Lower middle income,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,1.0,1.0,2.0,329484123.0,0.350911,163738061.0,11.955,8.947,64530.0,High income,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2442,1.0,1.0,2.0,329484123.0,0.350911,163738061.0,11.955,8.947,64530.0,High income,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2443,1.0,1.0,2.0,329484123.0,0.350911,163738061.0,11.955,8.947,64530.0,High income,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2444,1.0,1.0,2.0,329484123.0,0.350911,163738061.0,11.955,8.947,64530.0,High income,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042


In [9]:
# For debugging only
df_dataset.to_csv('data.csv', index=False)

### Data summarization

In [4]:
# Data summarization part goes here!

### Data transformation

In [6]:
# 


# Check to see if we having missing values in our data
missing_rows = df_dataset[df_dataset.isnull().any(axis=1)]
print(missing_rows.shape)
# On the chance we missed missing values, we impute them using knn imputer
imputer_knn = KNNImputer(n_neighbors=5)
# data_imputed_knn = imputer_knn.fit_transform(data_for_impute)

# Split the data into training and testing

# separate the features and the labels to be used in model development
# data = df_train.drop(columns=['split', 'category'])
# labels = df_train['category'].to_numpy(copy=True)

(923, 85)
