# Creating subsets

### Imports

In [19]:
import pandas as pd
import matplotlib.pyplot as pyplot
import datetime

In [20]:
# Load the cleaned dataset. nrows=None puts no cap on the number of rows read;
# set it to an integer to load only the first rows while experimenting.
data = pd.read_csv('cleaned_data.csv', nrows=None)

In [21]:
# Row count of the full dataset; used as the denominator for the "original" percentages.
original_length = data.shape[0]

# Number of rows to draw for the subset.
# Other candidate sizes: 7500, 5000, 2500, 2000, 1500, 1000, 500, (300), 200, 100, 50
subset_length = 10000

print(original_length)

1710660


# Create subsets and check if they are good enough

## Saving the "expected" values: the percentage of rows for each call type

In [22]:
# Share (in percent of all rows) of each call type (A, B, C) in the original dataset.
# These are the reference values a candidate subset is compared against.
call_type_column = data['CALL_TYPE']
(
    original_call_type_a_percentage,
    original_call_type_b_percentage,
    original_call_type_c_percentage,
) = (
    (int((call_type_column == call_type).sum()) / original_length) * 100
    for call_type in ('A', 'B', 'C')
)

for call_type_percentage in (
    original_call_type_a_percentage,
    original_call_type_b_percentage,
    original_call_type_c_percentage,
):
    print(call_type_percentage)



21.32329042591748
47.81066956613237
30.866040007950147


### Function to check whether CALL_TYPE is distributed as in the original dataset

In [23]:
def call_type_accepted(subset_call_type_a_percentage, subset_call_type_b_percentage,
                       subset_call_type_c_percentage, tolerance=1/3):
    """Check whether a subset's CALL_TYPE shares match the original dataset.

    The three positional arguments are the shares (in percent) of rows with
    CALL_TYPE 'A', 'B' and 'C' in the candidate subset. Each one is compared
    with the corresponding module-level ``original_call_type_*_percentage``.

    Args:
        subset_call_type_a_percentage: percent of subset rows with CALL_TYPE 'A'.
        subset_call_type_b_percentage: percent of subset rows with CALL_TYPE 'B'.
        subset_call_type_c_percentage: percent of subset rows with CALL_TYPE 'C'.
        tolerance: largest allowed absolute difference, in percentage points,
            for any single call type. The default of 1/3 is the value that
            was previously hard-coded.

    Returns:
        True if all three shares are within ``tolerance`` of the original
        shares, otherwise False.
    """
    pairs = (
        (original_call_type_a_percentage, subset_call_type_a_percentage),
        (original_call_type_b_percentage, subset_call_type_b_percentage),
        (original_call_type_c_percentage, subset_call_type_c_percentage),
    )
    return all(abs(original - candidate) <= tolerance for original, candidate in pairs)

# TIMESTAMP

In [24]:
def get_info_from_timestamps(subset):
    """Count how many rows fall in each hour of day, weekday and month.

    Args:
        subset: object whose ``subset['TIMESTAMP']`` is an iterable of POSIX
            timestamps accepted by ``datetime.datetime.fromtimestamp``
            (for example a DataFrame with a TIMESTAMP column).

    Returns:
        A tuple ``(hours_dict, days_dict, months_dict)`` of counters:
        hours keyed 0-23, days keyed by English weekday name in
        Monday..Sunday order, months keyed 1-12. Every key is always
        present; keys with no matching rows have the value 0.

    Note:
        ``fromtimestamp`` converts to the local time zone of the machine
        running the notebook, so the counts can differ between machines
        configured with different time zones.
    """
    weekday_names = (
        "Monday", "Tuesday", "Wednesday", "Thursday",
        "Friday", "Saturday", "Sunday",
    )
    hours_dict = {hour: 0 for hour in range(24)}
    days_dict = {name: 0 for name in weekday_names}
    months_dict = {month: 0 for month in range(1, 13)}

    for timestamp in subset['TIMESTAMP']:
        moment = datetime.datetime.fromtimestamp(timestamp)
        hours_dict[moment.hour] += 1
        # weekday() is 0 for Monday .. 6 for Sunday. Indexing a fixed name
        # table avoids strftime("%A"), whose output follows the active locale
        # and would raise KeyError for non-English day names.
        days_dict[weekday_names[moment.weekday()]] += 1
        months_dict[moment.month] += 1
    return hours_dict, days_dict, months_dict

### Saving the timestamp-info from original data

In [25]:
# Count the rows per hour, weekday and month of TIMESTAMP in the original dataset.
# get_info_from_timestamps loops over every row in Python, so on the full
# dataset this cell may take a while.
original_hours, original_weekdays, original_months = get_info_from_timestamps(data)

### Create method to print histogram for timestamps

In [26]:
def print_histogram_timestamps(hours_dict, days_dict, months_dict):
    """Show bar charts of the given counts next to the original dataset's.

    For hours, weekdays and months (in that order) a chart of the passed-in
    counter is shown first, directly followed by the chart of the matching
    module-level counter (``original_hours``, ``original_weekdays``,
    ``original_months``). Each chart is its own figure.

    Args:
        hours_dict: counts keyed by hour.
        days_dict: counts keyed by weekday name.
        months_dict: counts keyed by month.
    """
    charts = (
        ("HOURS", hours_dict),
        ("ORIGINAL HOURS", original_hours),
        ("DAYS", days_dict),
        ("ORIGINAL DAYS", original_weekdays),
        ("MONTHS", months_dict),
        ("ORIGINAL MONTHS", original_months),
    )
    for chart_title, counts in charts:
        labels = list(counts.keys())
        heights = list(counts.values())
        pyplot.bar(labels, heights)
        pyplot.title(chart_title)
        pyplot.show()

### Convert original timestamps-dicts into percentage dicts

In [27]:
# Convert the original hour / weekday / month counts into percentages of all
# rows in the original dataset. Key order follows the count dicts.
original_hours_percentage_dict = {
    hour: (count / original_length) * 100
    for hour, count in original_hours.items()
}

original_weekdays_percentage_dict = {
    weekday: (count / original_length) * 100
    for weekday, count in original_weekdays.items()
}

original_months_percentage_dict = {
    month: (count / original_length) * 100
    for month, count in original_months.items()
}

In [28]:
# Show the reference percentages for hours, weekdays and months, one dict per line.
for percentage_dict in (
    original_hours_percentage_dict,
    original_weekdays_percentage_dict,
    original_months_percentage_dict,
):
    print(percentage_dict)

{0: 2.9285188172985865, 1: 2.900927127541417, 2: 2.915892111816492, 3: 2.8807010159821353, 4: 2.787052950323267, 5: 3.5662843580840144, 6: 3.299837489623888, 7: 3.0753042685279364, 8: 3.1736873487425905, 9: 4.815217518384717, 10: 5.789402920510212, 11: 5.576151894590392, 12: 5.183671799188618, 13: 4.924415138016905, 14: 4.816620485660506, 15: 5.493201454409409, 16: 5.2815287666748505, 17: 5.378567336583541, 18: 5.100136789309389, 19: 4.725719897583389, 20: 4.327277191259514, 21: 4.086200647703225, 22: 3.699624706253726, 23: 3.2740579659312776}
{'Monday': 13.3881075140589, 'Tuesday': 13.737797107549133, 'Wednesday': 13.536295932564041, 'Thursday': 14.373341283481231, 'Friday': 15.907485999555727, 'Saturday': 15.09844153718448, 'Sunday': 13.95853062560649}
{1: 7.656460079735307, 2: 7.590286789893959, 3: 8.12534343469772, 4: 7.9996024926051925, 5: 9.462312791554137, 6: 8.969228251084377, 7: 8.51414074100055, 8: 7.351606982100475, 9: 8.62982708428326, 10: 8.973378695941916, 11: 8.182748179

### Creating functions for checking whether hours, weekdays and months are distributed as in the original dataset

In [29]:
def weekdays_accepted(subset_weekdays_dict, tolerance=1/7):
    """Check whether a subset's weekday distribution matches the original.

    Each count is turned into a percentage of the module-level
    ``subset_length`` and compared with the same weekday's entry in
    ``original_weekdays_percentage_dict``.

    Args:
        subset_weekdays_dict: row counts of the subset keyed by weekday name.
        tolerance: largest allowed absolute difference, in percentage points,
            for any single weekday. The default of 1/7 is the value that was
            previously hard-coded (about 14 rows for a 10000-row subset).

    Returns:
        False as soon as one weekday deviates by more than ``tolerance``,
        otherwise True.
    """
    for weekday, count in subset_weekdays_dict.items():
        subset_percentage = (count / subset_length) * 100
        if abs(original_weekdays_percentage_dict[weekday] - subset_percentage) > tolerance:
            return False
    return True
            
def months_accepted(subset_months_dict, tolerance=1/12):
    """Check whether a subset's month distribution matches the original.

    Each count is turned into a percentage of the module-level
    ``subset_length`` and compared with the same month's entry in
    ``original_months_percentage_dict``.

    Args:
        subset_months_dict: row counts of the subset keyed by month.
        tolerance: largest allowed absolute difference, in percentage points,
            for any single month. The default of 1/12 is the value that was
            previously hard-coded (about 8 rows for a 10000-row subset).

    Returns:
        False as soon as one month deviates by more than ``tolerance``,
        otherwise True.
    """
    for month, count in subset_months_dict.items():
        subset_percentage = (count / subset_length) * 100
        if abs(original_months_percentage_dict[month] - subset_percentage) > tolerance:
            return False
    return True

def hours_accepted(subset_hours_dict, tolerance=1/24):
    """Check whether a subset's hour-of-day distribution matches the original.

    Each count is turned into a percentage of the module-level
    ``subset_length`` and compared with the same hour's entry in
    ``original_hours_percentage_dict``.

    Args:
        subset_hours_dict: row counts of the subset keyed by hour.
        tolerance: largest allowed absolute difference, in percentage points,
            for any single hour. The default of 1/24 is the value that was
            previously hard-coded (about 4 rows for a 10000-row subset).

    Returns:
        False as soon as one hour deviates by more than ``tolerance``,
        otherwise True.
    """
    for hour, count in subset_hours_dict.items():
        subset_percentage = (count / subset_length) * 100
        if abs(original_hours_percentage_dict[hour] - subset_percentage) > tolerance:
            return False
    return True

###

### Printing the histograms and calculating the percentage-change of each variable, both for the original dataset and the subset.

CALL_TYPE:

In [30]:
def print_histogram(column, subset):
    """Show a histogram of ``column`` for the original dataset, then for ``subset``.

    Both frames are sorted by ``column`` before plotting. The original
    dataset is read from the module-level ``data``. Each histogram is its
    own figure.

    Args:
        column: name of the column to plot; also used as the x-axis label and
            appended to each figure title.
        subset: DataFrame containing ``column``.
    """
    # Sort both frames up front so a missing column fails before anything is drawn.
    panels = (
        (data.sort_values(by=column), 'Original dataset. '),
        (subset.sort_values(by=column), 'Subset1, '),
    )
    for frame, title_prefix in panels:
        pyplot.hist(frame[column], bins='auto')
        pyplot.xlabel(column)
        pyplot.ylabel('number of rows')
        pyplot.title(title_prefix + column)
        pyplot.show()

In [31]:
# Rejection sampling: draw random subsets of subset_length rows until one
# passes, in order, the CALL_TYPE, weekday, month and hour checks. The accepted
# subset is written to CSV and plotted against the original dataset.
#
# NOTE(review): in the recorded run of this cell, 40+ attempts went by without
# a single subset passing the weekday check before the kernel was interrupted.
# The search is therefore bounded; if it gives up, loosen the tolerances in the
# *_accepted functions or raise max_attempts.
random_seed = 0      # base seed; attempt k samples with random_state=random_seed + k
max_attempts = 1000  # upper bound so the cell cannot run forever

subset_ok = False
counter = 1
while not subset_ok and counter <= max_attempts:
    # Create the candidate subset. Seeding each attempt makes a rerun draw the
    # same sequence of candidates.
    subset = data.sample(n=subset_length, random_state=random_seed + counter)

    print("Starter forsøk " + str(counter))
    counter += 1

    # CALL_TYPE: share of each call type in the candidate, in percent.
    subset_call_type_a_percentage = (int((subset['CALL_TYPE'] == 'A').sum()) / subset_length) * 100
    subset_call_type_b_percentage = (int((subset['CALL_TYPE'] == 'B').sum()) / subset_length) * 100
    subset_call_type_c_percentage = (int((subset['CALL_TYPE'] == 'C').sum()) / subset_length) * 100

    if not call_type_accepted(subset_call_type_a_percentage, subset_call_type_b_percentage, subset_call_type_c_percentage):
        continue
    print("CALL_TYPE is accepted.")

    # TIMESTAMP: only computed for candidates that passed the CALL_TYPE check.
    subset_hours_dict, subset_weekdays_dict, subset_months_dict = get_info_from_timestamps(subset)

    if not weekdays_accepted(subset_weekdays_dict):
        continue
    print("Weekdays accepted")

    if not months_accepted(subset_months_dict):
        continue
    print("Months accepted")

    if not hours_accepted(subset_hours_dict):
        continue
    print("Hours accepted")

    subset_ok = True
    # Derive the file name from subset_length so that changing the size does
    # not overwrite a file named after a different size.
    subset.to_csv('subset-' + str(subset_length) + '.csv', index=False)
    print_histogram("CALL_TYPE", subset)
    # Reuse the counts computed above instead of parsing the timestamps again.
    hours, days, months = subset_hours_dict, subset_weekdays_dict, subset_months_dict
    print_histogram_timestamps(hours, days, months)

if not subset_ok:
    print("No acceptable subset found in " + str(max_attempts) + " attempts; "
          "loosen the tolerances or increase max_attempts.")


Starter forsøk 1
Starter forsøk 2
Starter forsøk 3


Starter forsøk 4
Starter forsøk 5
Starter forsøk 6
CALL_TYPE is accepted.
Starter forsøk 7
CALL_TYPE is accepted.
Starter forsøk 8
Starter forsøk 9
Starter forsøk 10
Starter forsøk 11
Starter forsøk 12
Starter forsøk 13
Starter forsøk 14
Starter forsøk 15
Starter forsøk 16
CALL_TYPE is accepted.
Starter forsøk 17
CALL_TYPE is accepted.
Starter forsøk 18
Starter forsøk 19
CALL_TYPE is accepted.
Starter forsøk 20
CALL_TYPE is accepted.
Starter forsøk 21
Starter forsøk 22
Starter forsøk 23
Starter forsøk 24
Starter forsøk 25
CALL_TYPE is accepted.
Starter forsøk 26
Starter forsøk 27
Starter forsøk 28
CALL_TYPE is accepted.
Starter forsøk 29
Starter forsøk 30
Starter forsøk 31
Starter forsøk 32
CALL_TYPE is accepted.
Starter forsøk 33
CALL_TYPE is accepted.
Starter forsøk 34
Starter forsøk 35
Starter forsøk 36
Starter forsøk 37
Starter forsøk 38
Starter forsøk 39
CALL_TYPE is accepted.
Starter forsøk 40
Starter forsøk 41
Starter forsøk 42
CALL_TYPE is accepted.
Starter forsøk 43
Starter fo

KeyboardInterrupt: 