In [1]:
import pandas as pd
import sqlalchemy as sq

Create a connection to the database. This will eventually create a .db file in the directory of the notebook once we write something to the connection.

In [20]:
# SQLAlchemy engine pointing at the SQLite file 'OKCdatabase.db' (relative path)
engine = sq.create_engine(
    'sqlite:///OKCdatabase.db',
)

Load the dataframes

In [3]:
# Main frame of user answers; every column is read as text (dtype=str)
df_main = pd.read_csv(
    'user_data_public.csv',
    index_col=None,
    dtype=str,
)

In [4]:
# General question metadata (semicolon-delimited file), ordered by the 'N'
# column from largest to smallest
df_qs = (
    pd.read_csv('question_data.csv', sep=';', index_col=None)
    .sort_values('N', ascending=False)
)

In [5]:
# Cognitive test question metadata
df_ts = pd.read_csv(
    'test_items.csv',
    index_col=None,
)

Examining the question set, we see that there are 5 types of questions:

1. Personality questions, denoted by columns named qXXX (where X is numeric)
2. Test questions, also denoted by columns named qXXX
3. User's match preferences, denoted by columns named lf_ZZZZ (where Z is alphabetic)
4. User attributes, denoted by columns named d_ZZZZ
5. OKC personality scale scores, denoted by columns named p_ZZZZ

Since different question types have different kinds of answers, we will break the main table into 5 smaller tables

In [7]:
# All question headers: the first column of df_qs, in df_qs's current row order
# (df_qs is sorted by its 'N' column, descending, when it is loaded).
headers_all = list(df_qs.iloc[:, 0])

# Split the headers by their leading character. str.startswith is used instead
# of name[0] so an empty header is simply skipped rather than raising IndexError.
# Headers starting with 'q' (question columns, qXXX)
headers_q = [name for name in headers_all if name.startswith('q')]
# Headers starting with 'l' (match-preference columns, lf_ZZZZ)
headers_lf = [name for name in headers_all if name.startswith('l')]
# Headers starting with 'd' (user-attribute columns, d_ZZZZ)
headers_d = [name for name in headers_all if name.startswith('d')]
# Headers starting with 'p' (personality-scale columns, p_ZZZZ)
headers_p = [name for name in headers_all if name.startswith('p')]

# Cognitive test question headers: the first column of df_ts
headers_ts = list(df_ts.iloc[:, 0])

# Question headers that are not cognitive test questions. Membership is checked
# against a set so the filter is linear rather than scanning headers_ts for
# every candidate; the order of headers_q is preserved.
headers_ts_set = set(headers_ts)
headers_qns = [name for name in headers_q if name not in headers_ts_set]

In [8]:
# OKC personality scores: the columns of the main frame listed in headers_p
tab_per = df_main[headers_p]

In [9]:
# User attributes: the columns of the main frame listed in headers_d
tab_att = df_main[headers_d]

In [10]:
# User match preferences: the columns of the main frame listed in headers_lf
tab_prf = df_main[headers_lf]

In [11]:
# Cognitive test answers: the columns of the main frame listed in headers_ts
tab_tst = df_main[headers_ts]

The table that stores the other questions needs to be broken up into smaller chunks. We'll just store the top 1000 questions in 5 tables, and use the first one for our MVP. We can add more features to [possibly] improve the model scores afterwards.

In [12]:
# Other questions, chunk 1: the first 200 entries of headers_qns
tab_qn1 = df_main[headers_qns[:200]]

In [13]:
# Other questions, chunk 2: entries 200 up to (not including) 400 of headers_qns
tab_qn2 = df_main[headers_qns[200:400]]

In [14]:
# Other questions, chunk 3: entries 400 up to (not including) 600 of headers_qns
tab_qn3 = df_main[headers_qns[400:600]]

In [15]:
# Other questions, chunk 4: entries 600 up to (not including) 800 of headers_qns
tab_qn4 = df_main[headers_qns[600:800]]

In [16]:
# Other questions, chunk 5: entries 800 up to (not including) 1000 of headers_qns
tab_qn5 = df_main[headers_qns[800:1000]]

Tables for the two question keys

In [17]:
# Key table for the cognitive test questions (alias of df_ts, not a copy)
tab_keyt = df_ts
# Key table for the general questions (alias of df_qs, not a copy)
tab_keyq = df_qs
# NOTE(review): neither key table is written to the database in the cells
# visible here; confirm whether that is intended.

In [18]:
# Cognitive score column 'CA' of the main frame, kept as a Series.
# Explicit bracket lookup is used instead of attribute access (df_main.CA) so
# the column cannot be shadowed by a DataFrame attribute of the same name.
tab_CA = df_main['CA']

Now we write these tables into the SQL database connection set up by the 'create_engine' statement

In [21]:
# Write the personality scores table. if_exists='replace' lets this cell be
# re-run against an existing database (the default 'fail' raises ValueError).
tab_per.to_sql('personality_scores', engine, index=False, if_exists='replace')

In [22]:
# Write the cognitive score column. if_exists='replace' lets this cell be
# re-run against an existing database (the default 'fail' raises ValueError).
tab_CA.to_sql('cognitive_scores', engine, index=False, if_exists='replace')

In [23]:
# Write the user attributes table. if_exists='replace' lets this cell be
# re-run against an existing database (the default 'fail' raises ValueError).
tab_att.to_sql('user_info', engine, index=False, if_exists='replace')

In [24]:
# Write the match preferences table. if_exists='replace' lets this cell be
# re-run against an existing database (the default 'fail' raises ValueError).
tab_prf.to_sql('match_preferences', engine, index=False, if_exists='replace')

In [25]:
# Write the cognitive test answers table. if_exists='replace' lets this cell be
# re-run against an existing database (the default 'fail' raises ValueError).
tab_tst.to_sql('test_answers', engine, index=False, if_exists='replace')

In [26]:
# Write question chunk 1. if_exists='replace' lets this cell be re-run
# against an existing database (the default 'fail' raises ValueError).
tab_qn1.to_sql('question_responses1', engine, index=False, if_exists='replace')

In [27]:
# Write question chunk 2. if_exists='replace' lets this cell be re-run
# against an existing database (the default 'fail' raises ValueError).
tab_qn2.to_sql('question_responses2', engine, index=False, if_exists='replace')

In [28]:
# Write question chunk 3. if_exists='replace' lets this cell be re-run
# against an existing database (the default 'fail' raises ValueError).
tab_qn3.to_sql('question_responses3', engine, index=False, if_exists='replace')

In [29]:
# Write question chunk 4. if_exists='replace' lets this cell be re-run
# against an existing database (the default 'fail' raises ValueError).
tab_qn4.to_sql('question_responses4', engine, index=False, if_exists='replace')

In [30]:
# Write question chunk 5. if_exists='replace' lets this cell be re-run
# against an existing database (the default 'fail' raises ValueError).
tab_qn5.to_sql('question_responses5', engine, index=False, if_exists='replace')