# Chart Recommender
This notebook provides an example of running the chart recommender, including the single-chart recommender and the multiple-view (MV) recommender, which recommends a set of charts.  
The input is a data table given as a `pandas.DataFrame`.  

### Single-Chart Recommender
The output is the recommended charts, described by the column selection and chart type.     

### MV Recommender
The output is the recommended MV (a multiple-view visualization), described as a list of charts.

Limitations:
- A chart can encode at most 4 data columns.
- An MV can contain at most 12 charts.
- The predicted chart type is limited to one of ('area', 'bar', 'scatter', 'line', 'pie').

In [5]:
import pandas as pd
import json
import numpy as np
import itertools
import sys
import re
import altair as alt

import torch
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
import torch.nn.functional as nnf

from model.encodingModel import ChartTypeNN, ChartTypeLSTM, ScoreNetLSTM
# from utils.helper import softmax, get_data_feature_by_column, get_embed_feature_by_column, get_all_charts_scores, charts_to_features


In [6]:
from utils.ChartRecommender import ChartRecommender
from utils.VegaLiteRender import VegaLiteRender

In [2]:
from PIL import Image

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before each
# cell is executed, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

## Load pretrained word-embedding model

In [4]:
word_embedding_model_path = 'utils/en-50d-200000words.vec'

# Maps each word to its embedding values as read from the text file:
# one row per word, "<word> <v1> <v2> ...", separated by whitespace.
word_embedding_dict = {}
with open(word_embedding_model_path, encoding="utf8") as file_in:
    ## line 0 is invalid (not an embedding row), so consume it before the loop
    next(file_in, None)
    for line in file_in:
        tokens = line.split()
        if not tokens:
            # A blank line (e.g. a trailing newline at the end of the file) has no
            # word to unpack and would raise ValueError below, so skip it.
            continue
        word, *features = tokens
        # The values are stored exactly as before, i.e. as an array of the strings
        # read from the file.
        # NOTE(review): consumers presumably cast these to float -- confirm before
        # changing the dtype here.
        word_embedding_dict[word] = np.array(features)

## Load trained single-chart assessment model and chart type prediction model

In [5]:
# Use the first CUDA device when one is available and fall back to the CPU otherwise,
# so this cell no longer fails on machines without a GPU. The variable keeps the name
# `gpu` because later cells reference it.
# NOTE(review): project code outside this notebook may still assume CUDA -- confirm.
gpu = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Single-chart assessment (column-selection scoring) model.
column_score_model = ScoreNetLSTM(input_size=96, seq_length = 4, batch_size=2, pack = True).to(gpu)
# map_location remaps the saved tensors onto the selected device while loading.
column_score_model.load_state_dict(torch.load('trainedModel/singleChartModel.pt', map_location=gpu))
column_score_model.eval()  # switch to inference mode

# Chart type prediction model.
chart_type_model = ChartTypeLSTM(input_size = 96, hidden_size = 400, seq_length = 4, num_class = 9, bidirectional = True).to(gpu)
chart_type_model.load_state_dict(torch.load('trainedModel/chartType.pt', map_location=gpu))
# Last expression of the cell: switches to inference mode and displays the module summary.
chart_type_model.eval()

ChartTypeLSTM(
  (lstm): LSTM(96, 400, batch_first=True, bidirectional=True)
  (dense): Sequential(
    (linear0): Linear(in_features=3200, out_features=2000, bias=True)
    (rulu0): LeakyReLU(negative_slope=0.01)
    (dropout0): Dropout(p=0.4, inplace=False)
    (linear1): Linear(in_features=2000, out_features=1600, bias=True)
    (rulu1): LeakyReLU(negative_slope=0.01)
    (dropout1): Dropout(p=0.4, inplace=False)
    (linear2): Linear(in_features=1600, out_features=1200, bias=True)
    (rulu2): LeakyReLU(negative_slope=0.01)
    (dropout2): Dropout(p=0.4, inplace=False)
    (linear4): Linear(in_features=1200, out_features=9, bias=True)
    (output): ReLU()
  )
)

## Data loader and pre-processing

In [82]:
# Columns removed before any analysis (names, contact details, identifiers, etc.).
ignored_columns = [
    'First Name', 'Middle Name', 'Last Name', 'Street Address', 'createdAt', 'Email',
    'formSubmissionID', 'Date of Birth', 'Telephone', 'Call Number', 'Zipcode', 'City',
    'Major', 'Race (Check all that apply, optional)',
]

# GPA bin edges and the label attached to each bin.
# With pd.cut's defaults (right=True, include_lowest=False) the intervals are
# (0, 2.0], (2.0, 2.5], ... (3.5, 4.0]: a GPA of exactly 2.0 is labelled 'Below 2.0',
# and a GPA of exactly 0 (or anything above 4.0) becomes NaN.
bins = [0, 2.0, 2.5, 3.0, 3.5, 4.0]
labels = ['Below 2.0', '2.0-2.5', '2.5-3.0', '3.0-3.5', '3.5-4.0']

df = (
    pd.read_csv('csv/STU2_TAPDINTO-STEM Student Demographic Form_responses.csv')
    .drop(columns=ignored_columns)
    # the literal string 'none' marks a missing answer anywhere in the table
    .replace(to_replace=['none'], value=np.nan)
    # parse GPA as a number, then replace it with its labelled bin
    .assign(GPA=lambda frame: pd.cut(pd.to_numeric(frame['GPA']), bins=bins, labels=labels))
)

In [83]:
## Alternative datasets: uncomment one of these to replace `df` before building the recommender.
# df = pd.read_csv('csv/penguins.csv')
# df = pd.read_csv('csv/countries.csv') #flawed
# df = pd.read_csv('csv/seattle-weather.csv')
# df = pd.read_csv('csv/STU3_TAPDINTO-STEM Student Awareness and Opportunities Form_responses.csv').drop(columns=['createdAt', 'formSubmissionID'])

## Build the recommender from the data table, the word embeddings and the two trained models.
chartRecommender = ChartRecommender(
    df,
    word_embedding_dict,
    column_score_model,
    chart_type_model,
)

In [84]:
## number of rows in the dataset
df.shape[0]


151

In [85]:
## the dataset held by the recommender: preview of its first rows
chartRecommender.df.head()

Unnamed: 0,State,Call Type,Gender,US Citizen,US Permanent Resident,Year,GPA,Degree Completion Date,Transfer Student,Veteran,Ethnicity (optional),Male parent/guardian highest educational degree,Female parent/guardian highest educational degree,Parent/Household Annual Income (optional)
0,FL,Voice,Female,Yes,No,Sophomore,3.0-3.5,2025-05-10,No,No,Not Hispanic or Latino,Technical training,"Master's degree (M.A., M.S., etc.)","Between $80,000 and $100,000"
1,Ohio,Voice,Female,Yes,Yes,Senior,3.5-4.0,2023-05-11,No,No,Not Hispanic or Latino,Associates degree,Bachelors degree (B.A. or B.S.),I don't know
2,HI,Voice,Female,Yes,Yes,Sophomore,3.0-3.5,2023-05-26,Yes,No,Not Hispanic or Latino,High school or GED,High school or GED,"Between $20,000 and $40,000"
3,CA,Voice,Female,Yes,Yes,Sophomore,3.5-4.0,2025-05-15,No,No,Not Hispanic or Latino,"Doctorate (Ph.D., M.D., D.D.S., D.V.M., J.S.D....","Master's degree (M.A., M.S., etc.)",Prefer not to answer
4,Nevada,Voice,Female,Yes,No,Sophomore,3.0-3.5,2025-06-12,No,No,Not Hispanic or Latino,Associates degree,High school or GED,"Between $40,000 and $60,000"


In [86]:
## the fields/columns of the dataset; per the output below, each entry carries
## the column's name, its index and its type
chartRecommender.fields

{0: {'name': 'State', 'index': 0, 'type': 'nominal'},
 1: {'name': 'Call Type', 'index': 1, 'type': 'nominal'},
 2: {'name': 'Gender', 'index': 2, 'type': 'nominal'},
 3: {'name': 'US Citizen', 'index': 3, 'type': 'nominal'},
 4: {'name': 'US Permanent Resident', 'index': 4, 'type': 'nominal'},
 5: {'name': 'Year', 'index': 5, 'type': 'nominal'},
 6: {'name': 'GPA', 'index': 6, 'type': 'nominal'},
 7: {'name': 'Degree Completion Date', 'index': 7, 'type': 'temporal'},
 8: {'name': 'Transfer Student', 'index': 8, 'type': 'nominal'},
 9: {'name': 'Veteran', 'index': 9, 'type': 'nominal'},
 10: {'name': 'Ethnicity (optional)', 'index': 10, 'type': 'nominal'},
 11: {'name': 'Male parent/guardian highest educational degree',
  'index': 11,
  'type': 'nominal'},
 12: {'name': 'Female parent/guardian highest educational degree',
  'index': 12,
  'type': 'nominal'},
 13: {'name': 'Parent/Household Annual Income (optional)',
  'index': 13,
  'type': 'nominal'}}

In [87]:
## computed features for each field/column (that are fed into the DL models),
## summarised one item per line
print(
    type(chartRecommender.feature_dict),               # the container type
    chartRecommender.feature_dict.keys(),              # the keys of the feature container
    np.array(chartRecommender.feature_dict[0]).shape,  # the size of the feature of the first field/column
    sep='\n',
)

<class 'dict'>
dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
(96,)


## Single chart recommender
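The candidate charts are shown as a DataFrame with one row per chart and the following columns:
- indices: the column indices encoded by this chart
- fields: the field descriptors (name, index, type) of those columns
- column_selection_score: the predicted score for the column selection, min-max normalized
- chart_type: the chart type ('area', 'bar', 'scatter', 'line', 'pie')
- chart_type_prob: the likelihood that the selected columns are encoded by the chart type
- final_score: the overall score, computed as column_selection_score * chart_type_prob
- n_column: the number of columns encoded by this chart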

In [88]:
## put every candidate chart into one table, best final_score first,
## and show the five highest-ranked charts
recommended_charts = (
    pd.DataFrame.from_records(chartRecommender.charts)
    .sort_values(by='final_score', ascending=False)
    .reset_index(drop=True)
)
recommended_charts.head()

Unnamed: 0,indices,fields,column_selection_score,chart_type,chart_type_prob,final_score,n_column
0,"(8, 9, 11)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.774161,bar,0.999962,0.774132,3
1,"(8, 9, 12)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.772159,bar,0.999958,0.772126,3
2,"(7, 8, 13)","[{'name': 'Degree Completion Date', 'index': 7...",0.898115,area,0.857789,0.770393,3
3,"(8, 9, 13)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.770414,bar,0.999954,0.770379,3
4,"(4, 8, 13)","[{'name': 'US Permanent Resident', 'index': 4,...",0.839053,area,0.910792,0.764203,3


In [94]:
## number of candidate charts in the ranked table
recommended_charts.shape[0]

1435

In [89]:
## the ten highest-ranked charts
recommended_charts.iloc[:10]

Unnamed: 0,indices,fields,column_selection_score,chart_type,chart_type_prob,final_score,n_column
0,"(8, 9, 11)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.774161,bar,0.999962,0.774132,3
1,"(8, 9, 12)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.772159,bar,0.999958,0.772126,3
2,"(7, 8, 13)","[{'name': 'Degree Completion Date', 'index': 7...",0.898115,area,0.857789,0.770393,3
3,"(8, 9, 13)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.770414,bar,0.999954,0.770379,3
4,"(4, 8, 13)","[{'name': 'US Permanent Resident', 'index': 4,...",0.839053,area,0.910792,0.764203,3
5,"(3, 8, 13)","[{'name': 'US Citizen', 'index': 3, 'type': 'n...",0.856015,area,0.88045,0.753679,3
6,"(8, 9, 10)","[{'name': 'Transfer Student', 'index': 8, 'typ...",0.749636,bar,0.999961,0.749607,3
7,"(5, 8, 13)","[{'name': 'Year', 'index': 5, 'type': 'nominal...",0.849381,area,0.87433,0.742639,3
8,"(4, 8, 11)","[{'name': 'US Permanent Resident', 'index': 4,...",0.84043,area,0.872863,0.733581,3
9,"(6, 8, 13)","[{'name': 'GPA', 'index': 6, 'type': 'nominal'...",0.854868,area,0.849246,0.725993,3


In [90]:
## the field descriptors of the first row of the ranked table
recommended_charts.iloc[0]['fields']

[{'name': 'Transfer Student', 'index': 8, 'type': 'nominal'},
 {'name': 'Veteran', 'index': 9, 'type': 'nominal'},
 {'name': 'Male parent/guardian highest educational degree',
  'index': 11,
  'type': 'nominal'}]

In [80]:
## Hand-written field descriptors in the same name/index/type shape as a chart's `fields` list.
## NOTE(review): 'year', 'fertility' and 'country' do not match the columns of the student
## dataset loaded above; they appear to belong to a different dataset -- confirm before use.
sample_fields = [
    dict(name='year', index=1, type='nominal'),
    dict(name='fertility', index=2, type='quantitative'),
    dict(name='country', index=6, type='nominal'),
]

In [99]:
## select one recommended chart by its rank and render it by VegaLiteRender
## (this picks row 50 of the ranked table, not the top-ranked chart)
recommend_chart = recommended_charts.iloc[50]
vr = VegaLiteRender(
    chart_type=recommend_chart['chart_type'],
    columns=recommend_chart['fields'],
    data=chartRecommender.df.to_dict('records'),
)
## alternatives: force a scatter chart, optionally with hand-written field descriptors
# vr = VegaLiteRender(chart_type = 'scatter', columns = recommend_chart['fields'], data = chartRecommender.df.to_dict('records'))
# vr = VegaLiteRender(chart_type = 'scatter', columns = sample_fields, data = chartRecommender.df.to_dict('records'))

## build an Altair chart from the renderer's spec and show it
chart_display = alt.Chart.from_dict(vr.vSpec)
chart_display.display()

[{'name': 'US Permanent Resident', 'index': 4, 'type': 'nominal'}, {'name': 'Transfer Student', 'index': 8, 'type': 'nominal'}, {'name': 'Ethnicity (optional)', 'index': 10, 'type': 'nominal'}]
{'y': {'aggregate': 'count', 'type': 'quantitative'}, 'x': {'field': 'US Permanent Resident', 'type': 'nominal'}, 'color': {'field': 'Transfer Student', 'type': 'nominal'}, 'column': {'field': 'Ethnicity (optional)', 'type': 'nominal'}}


In [40]:
## write the displayed chart's JSON to sample_chart.json in the current working directory
with open('sample_chart.json', 'w') as spec_file:
    spec_json = chart_display.to_json()
    spec_file.write(spec_json)

## MV Recommender
Returns an MV.
- An MV is described as a list of charts (each in the same format as the chart records above).
- current_mv: optional. The charts already in the MV; the recommendation is conditioned on them.
- max_charts: the number of charts in the returned MV.

In [68]:
## load the trained MV model onto the device selected earlier (`gpu`)
mv_model = ScoreNetLSTM(
    input_size=9,
    seq_length=12,
).to(gpu)
mv_model.load_state_dict(
    torch.load('trainedModel/mvModel.pt', map_location=gpu)
)
## last expression of the cell: switch to inference mode and display the module summary
mv_model.eval()

ScoreNetLSTM(
  (lstm): LSTM(9, 200, batch_first=True)
  (linear): Linear(in_features=2400, out_features=1, bias=True)
)

In [69]:
## Rebuild the recommender from the current `df` and the single-chart models.
## NOTE(review): the saved outputs of the following cells list fields such as 'date' and
## 'precipitation', which are not columns of the student dataset loaded above; they appear
## to come from an earlier run on a different table -- re-run these cells to refresh them.
chartRecommender = ChartRecommender(df, word_embedding_dict, column_score_model, chart_type_model)

In [70]:
## Recommending an MV conditioned on current_mv:
## start from a one-chart MV and ask for exactly one more chart
current_mv = [dict(indices=(1,), chart_type='pie')]
chartRecommender.recommend_mv(
    mv_model,
    current_mv=current_mv,
    max_charts=len(current_mv) + 1,
)

[{'indices': (1,), 'chart_type': 'pie'},
 {'indices': (0, 1, 2),
  'fields': [{'name': 'date', 'index': 0, 'type': 'temporal'},
   {'name': 'precipitation', 'index': 1, 'type': 'quantitative'},
   {'name': 'temp_max', 'index': 2, 'type': 'quantitative'}],
  'column_selection_score': 0.6392783051902111,
  'chart_type': 'scatter',
  'chart_type_prob': 1.393182937052802e-33,
  'final_score': 8.90631626819036e-34,
  'n_column': 3}]

In [71]:
## number of candidate charts held by the recommender
len(chartRecommender.charts)

145

In [72]:
## Recommending an MV without conditions (empty current_mv), with max_charts = 4.
## Note: this rebinds `recommended_charts` -- a ranked DataFrame earlier in the notebook --
## to the list of chart dicts returned by recommend_mv.
recommended_charts = chartRecommender.recommend_mv(
    mv_model,
    current_mv=[],
    max_charts=4,
)
recommended_charts

[{'indices': (0, 1, 2),
  'fields': [{'name': 'date', 'index': 0, 'type': 'temporal'},
   {'name': 'precipitation', 'index': 1, 'type': 'quantitative'},
   {'name': 'temp_max', 'index': 2, 'type': 'quantitative'}],
  'column_selection_score': 0.6392783051902111,
  'chart_type': 'bar',
  'chart_type_prob': 1.5659515933202327e-08,
  'final_score': 1.0010788805876691e-08,
  'n_column': 3},
 {'indices': (0, 1, 3),
  'fields': [{'name': 'date', 'index': 0, 'type': 'temporal'},
   {'name': 'precipitation', 'index': 1, 'type': 'quantitative'},
   {'name': 'temp_min', 'index': 3, 'type': 'quantitative'}],
  'column_selection_score': 0.6264488889318648,
  'chart_type': 'scatter',
  'chart_type_prob': 2.3942730420211285e-33,
  'final_score': 1.499889686973652e-33,
  'n_column': 3},
 {'indices': (0, 1, 4),
  'fields': [{'name': 'date', 'index': 0, 'type': 'temporal'},
   {'name': 'precipitation', 'index': 1, 'type': 'quantitative'},
   {'name': 'wind', 'index': 4, 'type': 'quantitative'}],
  'col

In [73]:
## render every chart of the recommended MV by VegaLiteRender, one after another
## (to render a single chart instead, index the list, e.g. recommended_charts[3])
for recommend_chart in recommended_charts:
    vr = VegaLiteRender(
        chart_type=recommend_chart['chart_type'],
        columns=recommend_chart['fields'],
        data=chartRecommender.df.to_dict('records'),
    )
    alt.Chart.from_dict(vr.vSpec).display()

[{'name': 'date', 'index': 0, 'type': 'temporal'}, {'name': 'precipitation', 'index': 1, 'type': 'quantitative'}, {'name': 'temp_max', 'index': 2, 'type': 'quantitative'}]
{'x': {'field': 'date', 'type': 'temporal'}, 'y': {'field': 'precipitation', 'type': 'quantitative'}, 'size': {'field': 'temp_max', 'type': 'quantitative'}}


[{'name': 'date', 'index': 0, 'type': 'temporal'}, {'name': 'precipitation', 'index': 1, 'type': 'quantitative'}, {'name': 'temp_min', 'index': 3, 'type': 'quantitative'}]
{'x': {'field': 'date', 'type': 'temporal'}, 'y': {'field': 'precipitation', 'type': 'quantitative'}, 'size': {'field': 'temp_min', 'type': 'quantitative'}}


[{'name': 'date', 'index': 0, 'type': 'temporal'}, {'name': 'precipitation', 'index': 1, 'type': 'quantitative'}, {'name': 'wind', 'index': 4, 'type': 'quantitative'}]
{'x': {'field': 'date', 'type': 'temporal'}, 'y': {'field': 'precipitation', 'type': 'quantitative'}, 'size': {'field': 'wind', 'type': 'quantitative'}}


[{'name': 'date', 'index': 0, 'type': 'temporal'}, {'name': 'precipitation', 'index': 1, 'type': 'quantitative'}, {'name': 'weather', 'index': 5, 'type': 'nominal'}]
{'x': {'field': 'date', 'type': 'temporal'}, 'y': {'field': 'weather', 'type': 'nominal'}, 'size': {'field': 'precipitation', 'type': 'quantitative'}}
