# Project Phase 4: Data Mining
Group 3:
- Jason Lam
- Angus Young
- Ann Soong

In [1]:
import pandas as pd
import numpy as np

# For displaying graphs
import matplotlib.pyplot as plt

# For data preprocessing
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold # https://scikit-learn.org/stable/modules/feature_selection.html
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For data splitting
from sklearn.model_selection import train_test_split

# For model building: Decision Tree, Gradient Boosting and Random Forest
# From:
#   https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
#       https://scikit-learn.org/stable/modules/generated/sklearn.tree.export_text.html
#       https://scikit-learn.org/stable/modules/generated/sklearn.tree.plot_tree.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
#   https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

# For model analysis
from sklearn.metrics import confusion_matrix

np.random.seed(3)

## Part A
Since our database is a local database, we decided to use the .csv files instead, in order to remove the limitations of requiring a local database.

In [2]:
# Get the data from .csv files
df_fact_table = pd.read_csv('clean_data/fact_table.csv')
df_country = pd.read_csv('clean_data/country.csv')
df_education = pd.read_csv('clean_data/education.csv')
df_event = pd.read_csv('clean_data/event.csv')
df_health = pd.read_csv('clean_data/health.csv')
df_month = pd.read_csv('clean_data/month.csv')
df_population = pd.read_csv('clean_data/population.csv')
df_qualityOfLife = pd.read_csv('clean_data/quality_of_life.csv')

# Country and Event both have 'Name' columns. Rename to be specific.
df_country = df_country.rename(
    columns={
        'Name': 'country_name'
    }
)
df_event = df_event.rename(
    columns={
        'Name': 'event_name'
    }
)

# Join all the data into one dataframe
df_tmp = pd.merge(df_fact_table, df_country,  how='left', left_on=['country_key'], right_on = ['Country_key'])
df_tmp = pd.merge(df_tmp, df_month,  how='left', left_on=['month_key'], right_on = ['Month_Key'])
df_tmp = pd.merge(df_tmp, df_education,  how='left', left_on=['education_key'], right_on = ['education_key'])
df_tmp = pd.merge(df_tmp, df_event,  how='left', left_on=['Event_key'], right_on = ['Event_key'])
df_tmp = pd.merge(df_tmp, df_health,  how='left', left_on=['health_key'], right_on = ['health_key'])
df_tmp = pd.merge(df_tmp, df_population,  how='left', left_on=['population_key'], right_on = ['population_key'])
df_tmp = pd.merge(df_tmp, df_qualityOfLife,  how='left', left_on=['qol_key'], right_on = ['qol_key'])
# Drop keys
df_dataset = df_tmp.drop(columns=['country_key', 'Country_key', 'month_key', 'Month_Key', 'education_key', 'Event_key', 'health_key', 'population_key', 'qol_key'])

df_dataset.head(2446)

Unnamed: 0,qol,di,hdi,country_name,Region,Continent,Currency,Capital,Total_population,Population_growth_percent,...,unemployment_total,people_basic_handwashing_facilities,people_basic_handwashing_facilities_rural,people_basic_handwashing_facilities_urban,people_using_at_least_basic_sanitation_services,people_using_at_least_basic_sanitation_services_rural,people_using_at_least_basic_sanitation_services_urban,people_using_at_least_basic_drinking_water_services,people_using_at_least_basic_drinking_water_services_rural,people_using_at_least_basic_drinking_water_services_urban
0,4.0,2.0,4.0,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
1,4.0,2.0,4.0,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
2,4.0,2.0,4.0,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
3,4.0,2.0,4.0,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
4,4.0,2.0,4.0,Bangladesh,South Asia,Asia,Bangladeshi taka,Dhaka,139035505.0,1.484747,...,4.119,31.433841,23.044502,49.266924,42.114532,39.003389,48.727843,96.691589,96.110444,97.926927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2441,1.0,1.0,2.0,United States,North America,North America,U.S. dollar,"Washington, D.C.",329484123.0,0.350911,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2442,1.0,1.0,2.0,United States,North America,North America,U.S. dollar,"Washington, D.C.",329484123.0,0.350911,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2443,1.0,1.0,2.0,United States,North America,North America,U.S. dollar,"Washington, D.C.",329484123.0,0.350911,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042
2444,1.0,1.0,2.0,United States,North America,North America,U.S. dollar,"Washington, D.C.",329484123.0,0.350911,...,4.620,100.000000,100.000000,100.000000,99.881540,99.996330,99.853020,98.860837,95.511316,99.693042


### Data summarization

In [3]:
# Data summarization part goes here!

### Data transformation

In [4]:
# Dropping attributes that we feel are not relevant or redundant to our data mining
df_processing = df_dataset.drop(columns=[
    'country_name', 'Region', 'Continent', 'Currency', 'Capital', 'Income_group',
    'Month', 'Year', 'Quarter', 'Decade',
    'event_name', 'Disaster_group', 'Disaster_subgroup', 'Start_date','End_date', 'Start_month', 'End_month'
])

# Remove attributes that have insufficient data
df_processing = df_processing.drop(columns=[
    'primary_completion_rate_percent_m', 'primary_completion_rate_percent', 'primary_enroll_rate_percent_net_f', 'primary_enroll_rate_percent_net_m', 'spending_education_percent_gdp',
    'people_using_safely_managed_drinking_water_services', 'community_health_workers', 'specialist_surgical_workforce',
    'incidence_of_malaria', 'intermittent_preventive_treatment_of_malaria_in_pregnancy','use_of_insecticide_treated_bed_nets', 'children_with_fever_receiving_antimalarial_drugs', 'malaria_cases_reported',
    'people_using_safely_managed_sanitation_services', 'external_health_expenditure_per_capita'
])

# Fill categorical Null event data with 'Not Specified'
df_processing = df_processing.fillna(value={'Disaster_type':'Not Specified'})

# Separate the features and the labels to be used in model development
df_data = df_processing.drop(columns=['qol', 'di', 'hdi'])
df_labels = df_processing['qol'].to_numpy(copy=True)

# Split the data into training and testing
train_data, test_data, train_labels, test_labels = train_test_split(df_data, df_labels, test_size=0.2, random_state=69) 

# State numerical columns and categorical columns
categorical_columns = ['Disaster_type']
numerical_columns = df_data.drop(columns=categorical_columns).columns

# Use pipelines to transform our data
numerical_pipeline = Pipeline([
    ('imputer', KNNImputer(n_neighbors=5)), # For imputing missing values
    ('std_scaler', StandardScaler()), # To scale the values so all attributes have more equal weight
])

full_pipeline = ColumnTransformer([
    ("num", numerical_pipeline, numerical_columns), # Numerical values set to the numerical pipeline
    ("cat", OneHotEncoder(), categorical_columns), # Categorical values get encoded
])

# Remove numerical columns where values are 20% the same
selector = VarianceThreshold(threshold=(0.8))
df_numerical = df_data.drop(columns=categorical_columns)
selector.fit(df_numerical)
low_var_col = [
    column for column in df_numerical.columns
        if column not in df_numerical.columns[selector.get_support()]
]

for features in low_var_col:
    print('COLUMN TO BE DROPPED: ' + features)

train_data.drop(low_var_col,axis=1)
test_data.drop(low_var_col,axis=1)

# Transform the data via the Pipelines
train_data = full_pipeline.fit_transform(train_data)
test_data = full_pipeline.fit_transform(test_data)

COLUMN TO BE DROPPED: Population_growth_percent
COLUMN TO BE DROPPED: population_growth


In [5]:
# For debugging only
# train_data.to_csv('data.csv', index=False)\
# print(train_data)

## Part B

### Decision Tree Model

In [6]:
# Train data is called: train_data
# Train labels is called: train_labels
# Test data is called: test_data
# Test labels is called: test_labels

### Gradient Boosting Model

### Random Forest Model