In [50]:
import pandas as pd
import numpy as np

# a) Generate a Dataset

Randomly generate a dataset (dataframe) with eight columns and 50,000 rows. Each column should be a categorical variable (of arbitrary name) with three levels (of arbitrary names) in approximately equal proportions.

## i. Random Generation

In [51]:
examples=50000
features=8
num_categories=3

#Encode each of the three categories as integers
#Generate random floats from 0-1. Multiply by num_categories to get random numbers over the right range, then round down to get integers.
rand_arr = np.floor(np.random.rand(examples,features)*num_categories).astype(int)
rand_arr

array([[2, 2, 0, ..., 1, 2, 2],
       [0, 1, 2, ..., 1, 0, 1],
       [0, 0, 1, ..., 1, 1, 1],
       ...,
       [2, 0, 2, ..., 1, 2, 1],
       [0, 0, 1, ..., 0, 0, 2],
       [1, 0, 0, ..., 0, 0, 2]])

## ii. Convert to a Pandas Dataframe

In [52]:
df = pd.DataFrame(rand_arr)

#Rename the columns for clarity
col_numbers = range(0,8)
col_names = [f"Column {col}" for col in col_numbers]
#generate a dictionary of column numbers and names to pass to the dataframe rename method
columns_rename = {number:name for (number,name) in zip(col_numbers, col_names)}
df.rename(columns=columns_rename, inplace=True)

df


Unnamed: 0,Column 0,Column 1,Column 2,Column 3,Column 4,Column 5,Column 6,Column 7
0,2,2,0,0,0,1,2,2
1,0,1,2,1,1,1,0,1
2,0,0,1,2,2,1,1,1
3,2,0,1,0,1,2,0,0
4,1,0,1,2,1,1,1,1
...,...,...,...,...,...,...,...,...
49995,2,2,0,0,2,2,2,0
49996,0,1,2,2,1,1,1,1
49997,2,0,2,1,1,1,2,1
49998,0,0,1,2,0,0,0,2


# b) Verify Distribution

Veryify that the proportions of each value are similar for each of the eight columns.

In [53]:
df_counts = pd.DataFrame()

#for each of the integer encoded labels, create a series with the number of instances of that label for each column and append that to a new dataframe
for val in range(0,num_categories):
    df_counts = df_counts.append(df[df == val].count(), ignore_index=True)

df_counts.index.set_names('Category Label', inplace=True)
print("Count of Category Labels by Column:")
df_counts

Count of Category Labels by Column:


Unnamed: 0_level_0,Column 0,Column 1,Column 2,Column 3,Column 4,Column 5,Column 6,Column 7
Category Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,16664.0,16542.0,16566.0,16857.0,16536.0,16598.0,16757.0,16827.0
1,16651.0,16736.0,16627.0,16509.0,16752.0,16791.0,16562.0,16706.0
2,16685.0,16722.0,16807.0,16634.0,16712.0,16611.0,16681.0,16467.0


The proportions of each label are are similar for each column: within expectations of what we would see from random generation.

# C) Unique Combinations

How many unique rows (i.e., permutations of category levels) are possible?

In [54]:
print("There are {} unique permutations.".format(num_categories**features))

There are 6561 unique permutations.


# d) Permutation Frequency

Produce a table and appropriate graph which show the frequencies (numbers of groups) by permutation group sizes up to group size of 12. That is, how many groups are unique combinations (group size = 1), how many groups are made up of a pair of matching combinations (group size = 2), how many groups are made up three the same, etc?


In [55]:
#Get a count of the number of instaces of each permutation
s_perm_frequency = df.pivot_table(index=col_names, aggfunc='size').rename("Count")
s_perm_frequency


Column 0  Column 1  Column 2  Column 3  Column 4  Column 5  Column 6  Column 7
0         0         0         0         0         0         0         0            6
                                                                      1           10
                                                                      2            8
                                                            1         0            6
                                                                      1            3
                                                                                  ..
2         2         2         2         2         2         1         1            6
                                                                      2            7
                                                            2         0           10
                                                                      1            7
                                                                      2

In [65]:
#Group by the permutation frequency and count them
s_permutation_count = s_perm_frequency.value_counts().sort_index()
#convert to dataframe
df_permutation_count = s_permutation_count.to_frame()
#name the index
df_permutation_count.index.set_names("Group Size", inplace=True)
# Cut this off after group size of 12
max_group_size = 12
df_permutation_count = df_permutation_count.loc[range(1,max_group_size+1)]
df_permutation_count

Unnamed: 0_level_0,Count
Group Size,Unnamed: 1_level_1
1,31
2,86
3,229
4,467
5,651
6,877
7,930
8,960
9,794
10,570


In [75]:
import plotly.express as px

px.line(df_permutation_count.reset_index(), x = "Group Size", y = "Count", title="Unique Permuations")