In [1]:
import pandas as pd
import numpy as np

## DF Final Experiment Clients

In [2]:
# load df_final_experiment_clients.txt
data_clients = pd.read_csv('../files_for_project/df_final_experiment_clients.txt', sep=",", header=None)
data_clients.columns = ["client_id", "variation"]

In [3]:
data_clients = data_clients.drop(index=0)

In [4]:
data_clients = data_clients.rename(columns={'Variation': 'variation'})

In [5]:
# display the first few rows of the dataset
data_clients.head()

Unnamed: 0,client_id,variation
1,9988021,Test
2,8320017,Test
3,4033851,Control
4,1982004,Test
5,9294070,Control


In [6]:
# retrieving the number of rows and columns in the dataframe
data_clients.shape

(70609, 2)

In [7]:
# displaying the data types of each column in the dataframe
data_clients.dtypes

client_id    object
variation    object
dtype: object

In [8]:
# retrieving the unique data types present in the dataframe columns
list(set(data_clients.dtypes.tolist()))

[dtype('O')]

In [9]:
# separating between discrete and continuous variables, as discrete ones could potentially be treated as categorical.
# remember to adjust the threshold (in this case, < 20) based on your dataset's specific characteristics and domain knowledge.
potential_categorical_from_numerical = data_clients.select_dtypes("number").loc[:, data_clients.select_dtypes("number").nunique() < 20]
potential_categorical_from_numerical

1
2
3
4
5
...
70605
70606
70607
70608
70609


In [10]:
# retrieving column names with object (typically string) data types from the dataframe
data_clients.select_dtypes("object").columns

Index(['client_id', 'variation'], dtype='object')

In [11]:
# counting and sorting the unique values for each object (string) column in descending order
data_clients.select_dtypes("object").nunique().sort_values(ascending=False)

# all columns seem categorical, as there isn't a wide variability of values.

client_id    70609
variation        2
dtype: int64

In [12]:
# extracting columns with object (typically string) data types to create a categorical dataframe
# for demonstration purposes, let's consider the columns in potential_categorical_from_numerical as categorical variables.
data_clients_categorical = pd.concat([data_clients.select_dtypes("object"), potential_categorical_from_numerical], axis=1)
print(data_clients_categorical) 

      client_id variation
1       9988021      Test
2       8320017      Test
3       4033851   Control
4       1982004      Test
5       9294070   Control
...         ...       ...
70605   2443347       NaN
70606   8788427       NaN
70607    266828       NaN
70608   1266421       NaN
70609   9895983       NaN

[70609 rows x 2 columns]


In [13]:
# verifying that the total number of columns in the dataframe is the sum of object (string) and numerical columns
len(data_clients.columns) == len(data_clients.select_dtypes("object").columns) + len(data_clients.select_dtypes("number").columns)

True

In [14]:
# checking for missing data
data_clients.isnull().sum().sort_values(ascending=False)
data_clients.shape

(70609, 2)

In [15]:
# identifying columns in the dataframe where over 80% of the values are missing
data_clients.columns[data_clients.isnull().mean() > 0.8]

Index([], dtype='object')

In [16]:
# filtering out columns in the dataframe where more than 80% of the values are missing
data_clients = data_clients[data_clients.columns[data_clients.isnull().mean() < 0.8]]
data_clients.shape

(70609, 2)

In [17]:
# removing the "client_id" column from the dataframe
# data_clients.drop("client_id", inplace=True, axis=1)

In [18]:
# frequency table for "variation"
frequency_table = data_clients["variation"].value_counts()

# calculating the proportion of each unique value in the "variation"
proportion_table = data_clients["variation"].value_counts(normalize=True)

frequency_table, proportion_table

(variation
 Test       26968
 Control    23532
 Name: count, dtype: int64,
 variation
 Test       0.53402
 Control    0.46598
 Name: proportion, dtype: float64)

In [19]:
# creating a crosstab table for the 'variation' column, counting occurrences for each unique value
my_table = pd.crosstab(index = data_clients_categorical["variation"],  # Make a crosstab
                              columns="count")      # Name the count column
my_table

col_0,count
variation,Unnamed: 1_level_1
Control,23532
Test,26968


In [20]:
# calculating the proportions for each value in 'my_table' and rounding the results to two decimal places
(my_table/my_table.sum()).round(2)

col_0,count
variation,Unnamed: 1_level_1
Control,0.47
Test,0.53


In [None]:
data_clients.to_csv('../files_for_project/df_final_experiment_clients.csv', index=False)