In [1]:
from datetime import datetime, timedelta
import os
import configparser
import requests
import pandas as pd
import numpy as np
import boto3
import io

In [2]:
# Load the parameter file that maps logical dataset names to file locations.
CONFIG_PATH = '../configs/param.cfg'
config = configparser.RawConfigParser()

# RawConfigParser.read() silently skips files it cannot open and returns the
# list of files it actually parsed. Fail here with a clear message instead of
# with an opaque KeyError on 'FILE_LOCATION' further down.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError(f'Could not read config file: {CONFIG_PATH}')

# Every location used by this notebook lives in the FILE_LOCATION section.
file_location = config['FILE_LOCATION']

# Raw data
path_question = file_location['loc_so_question']
path_question_tag = file_location['loc_so_question_tag']

# Output data
path_dim_date = file_location['loc_dim_date']
path_dim_tag = file_location['loc_dim_tag']
path_fact_question = file_location['loc_fact_question']
path_fact_question_tag = file_location['loc_fact_question_tag']

In [3]:
# Read the downloaded questions file (zip-compressed CSV) as a DataFrame
df_question = pd.read_csv(path_question, compression='zip')

# Question file profiling: summary statistics of the numeric columns
question_stats = df_question.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = question_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = question_stats.loc['count'].astype(int).astype(str)
print(desc)

                    Id         Score     OwnerUserId AnswerCount
count         17203824      17203824        13088517    15463823
mean   22655681.906628      1.204167  2274603.883636    1.417859
std    11753988.576403     15.645736  1911360.680041    1.498361
min           1.000000   -154.000000        1.000000   -5.000000
25%    12841969.750000      0.000000   692377.000000    1.000000
50%    23316485.500000      0.000000  1697528.000000    1.000000
75%    32817102.250000      1.000000  3546476.000000    2.000000
max    42069071.000000  16902.000000  7520803.000000  518.000000


In [4]:
# Read the downloaded question_tags file (zip-compressed CSV) as a DataFrame
df_question_tag = pd.read_csv(path_question_tag, compression='zip')

# Question_tags file profiling: summary statistics of the numeric columns
question_tag_stats = df_question_tag.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = question_tag_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = question_tag_stats.loc['count'].astype(int).astype(str)
print(desc)

                    Id
count         50576842
mean   22799501.591920
std    11701022.581574
min           1.000000
25%    13074746.000000
50%    23503290.000000
75%    32885768.750000
max    42069071.000000


In [6]:
# Read dimDate as DataFrame
# NOTE(review): the printed profile contains an 'Unnamed: 0' column running
# 0..N-1, which looks like a row index persisted when the CSV was written.
# Confirm, and consider writing the file with index=False.
df_dim_date = pd.read_csv(path_dim_date)

# dimDate profiling: summary statistics of the numeric columns
dim_date_stats = df_dim_date.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = dim_date_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = dim_date_stats.loc['count'].astype(int).astype(str)
print(desc)

        Unnamed: 0        day   weekday      month   quarter         year
count         3112       3112      3112       3112      3112         3112
mean   1555.500000  15.714332  3.000000   6.628535  2.544023  2012.330656
std     898.501345   8.815642  2.000161   3.491027  1.125944     2.478370
min       0.000000   1.000000  0.000000   1.000000  1.000000  2008.000000
25%     777.750000   8.000000  1.000000   4.000000  2.000000  2010.000000
50%    1555.500000  16.000000  3.000000   7.000000  3.000000  2012.000000
75%    2333.250000  23.000000  5.000000  10.000000  4.000000  2014.000000
max    3111.000000  31.000000  6.000000  12.000000  4.000000  2017.000000


In [7]:
# Read dimTag as DataFrame
# NOTE(review): the printed profile contains an 'Unnamed: 0' column whose
# statistics match TagID exactly, which looks like a row index persisted when
# the CSV was written. Confirm, and consider writing the file with index=False.
df_dim_tag = pd.read_csv(path_dim_tag)

# dimTag profiling: summary statistics of the numeric columns
dim_tag_stats = df_dim_tag.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = dim_tag_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = dim_tag_stats.loc['count'].astype(int).astype(str)
print(desc)

         Unnamed: 0         TagID
count         20993         20993
mean   10496.000000  10496.000000
std     6060.301436   6060.301436
min        0.000000      0.000000
25%     5248.000000   5248.000000
50%    10496.000000  10496.000000
75%    15744.000000  15744.000000
max    20992.000000  20992.000000


In [8]:
# Read factQuestion as DataFrame
# NOTE(review): the printed profile contains 'Unnamed: 0' and 'Unnamed: 0.1'
# columns running 0..N-1, which look like row indexes persisted by two
# successive CSV writes. Confirm, and consider writing with index=False.
df_fact_question = pd.read_csv(path_fact_question)

# factQuestion profiling: summary statistics of the numeric columns
# NOTE(review): hash_key is summarised as a float here; its mean/std/quantiles
# are presumably not meaningful for an identifier. Verify whether it should
# be excluded from the profile.
fact_question_stats = df_fact_question.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = fact_question_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = fact_question_stats.loc['count'].astype(int).astype(str)
print(desc)

          Unnamed: 0   Unnamed: 0.1              Id         Score  \
count        1000000        1000000         1000000       1000000   
mean   499999.500000  499999.500000  2037626.596164      7.190304   
std    288675.278932  288675.278932  1033195.976199     53.045970   
min         0.000000       0.000000        1.000000    -73.000000   
25%    249999.750000  249999.750000  1186752.250000      0.000000   
50%    499999.500000  499999.500000  2113644.000000      1.000000   
75%    749999.250000  749999.250000  2934619.250000      4.000000   
max    999999.000000  999999.000000  3705576.000000  12968.000000   

          OwnerUserId AnswerCount                     hash_key  
count          815443      956609                      1000000  
mean    185260.235396    2.843447   9222765394941554688.000000  
std     237989.582327    3.309818   5323723884527627264.000000  
min          1.000000   -5.000000         4448756865451.000000  
25%      54964.000000    1.000000   46120571594805360

In [10]:
# Read factQuestionTag as DataFrame
# NOTE(review): the printed profile contains 'Unnamed: 0' and 'Unnamed: 0.1'
# columns running 0..N-1, which look like row indexes persisted by two
# successive CSV writes. Confirm, and consider writing with index=False.
df_fact_question_tag = pd.read_csv(path_fact_question_tag)

# factQuestionTag profiling: summary statistics of the numeric columns
# NOTE(review): hash_key is summarised as a float here; its mean/std/quantiles
# are presumably not meaningful for an identifier. Verify whether it should
# be excluded from the profile.
fact_question_tag_stats = df_fact_question_tag.describe()

# Render every statistic as a fixed six-decimal string. Series.map is used
# column by column because DataFrame.applymap is deprecated since pandas 2.1.
desc = fact_question_tag_stats.apply(lambda col: col.map('{:.6f}'.format))

# Counts are whole numbers, so show them without decimals. The table already
# holds strings at this point, so no float column is overwritten with text.
desc.loc['count'] = fact_question_tag_stats.loc['count'].astype(int).astype(str)
print(desc)

          Unnamed: 0   Unnamed: 0.1              Id  \
count        1000000        1000000         1000000   
mean   499999.500000  499999.500000   862725.535469   
std    288675.278932  288675.278932   442611.999472   
min         0.000000       0.000000        1.000000   
25%    249999.750000  249999.750000   496695.750000   
50%    499999.500000  499999.500000   889155.000000   
75%    749999.250000  749999.250000  1247532.000000   
max    999999.000000  999999.000000  1578386.000000   

                          hash_key         TagID  
count                      1000000       1000000  
mean    9218010219384241152.000000   1645.099865  
std     5321647588159413248.000000   3137.369669  
min          40246445872800.000000      0.000000  
25%     4607302913044857856.000000     80.000000  
50%     9213508065146435584.000000    392.000000  
75%    13822761688794650624.000000   1501.000000  
max    18446743771010238464.000000  20992.000000  
