In [21]:
import pandas as pd

# Load the prepared dataset; `df` is the frame the cells below read from.
df = pd.read_csv('prepared_data.csv')


# Find the modes / means of the prepared data to use as default form values

In [22]:
# DataFrame.mode() yields one row per tied mode; row 0 keeps a single
# most-frequent value per column, giving a Series indexed by column name.
mode_values = df.mode().iloc[0]

In [23]:
# Columns treated as numeric counts/amounts. The VALUES are the names that
# `modify_mode` matches against the DataFrame's columns; the keys look like
# human-readable labels for the same fields (presumably form labels -- confirm).
numeric_cols = dict([
    ("Financial Aid Offered Amount", "Financial Aid Offered Amount"),
    ("Counselor Incoming Text Count", "incoming_text_count"),
    ("Counselor Outgoing Text Count", "outgoing_text_count"),
    ("Phone Successful Count", "phone_successful_count"),
    ("Phone Unsuccessful Count", "phone_unsuccessful_count"),
    ("Phone Voicemail Count", "phone_voicemail_count"),
    ("Events Attended Count", "Events Attended Count"),
])


In [24]:
def modify_mode(column, value):
    """Turn a column's mode into the value used as its default.

    Args:
        column: Name of the column in the module-level ``df``.
        value: The mode computed for that column.

    Returns:
        * ``value == 0.0`` and ``column`` is one of ``numeric_cols.values()``:
          the mean of the column's non-zero entries, rounded to one decimal.
        * ``value == 0.0`` for any other column: ``0.0``.
        * ``'Y'`` -> ``1.0`` and ``'N'`` -> ``0.0``.
        * Anything else is returned unchanged.
    """
    if value == 0.0:
        if column not in numeric_cols.values():
            return 0.0
        # A zero mode says little for a count/amount column, so fall back to
        # the average over the rows that actually have a non-zero value.
        series = df[column]
        non_zero = series[series != 0.0]
        return round(non_zero.mean(), 1)
    if value == 'Y':
        return 1.0
    if value == 'N':
        return 0.0
    return value

In [25]:
# Run every column's mode through `modify_mode`; the result is a Series
# indexed by column name, like `mode_values` itself.
modified_modes = (
    mode_values.index
    .to_series()
    .apply(lambda name: modify_mode(name, mode_values[name]))
)

In [26]:
# Show the adjusted per-column defaults.
modified_modes

State                                                               OK
Country                                                            USA
Gender                                                               F
Ethnicity                                                 Not Declared
Origin Source                                              Falls Creek
Student Type                                       First-Time Freshman
Major                                                        Undecided
Financial Aid Offered Amount                                   24724.1
Athlete                                                   Not Declared
Sport                                                     Not Declared
Raley College Tag Exists                                           1.0
Recruiting Territory                                     Raley College
Counselor                                                          C11
ID                                    00004c31ce07c22148ee37acd0f814b9
incomi

# Data Exploration

In [27]:
# Tally the "Enrolled" column: non-null rows, rows equal to "N", rows equal
# to "Y", then confirm that "N" + "Y" accounts for every non-null row.
col = "Enrolled"
total_enrolled = int(df[col].count())
total_no = int((df[col] == "N").sum())
total_yes = int((df[col] == "Y").sum())
print(total_enrolled, total_no, total_yes, sep="\n")
print(f"Math check: {total_enrolled == (total_no + total_yes)}")

63712
62800
912
Math check: True


In [28]:
# Tally the "Raley College Tag Exists" column. The variable names are carried
# over from the previous cell: here `total_enrolled` is the number of non-null
# rows in this column, `total_no` counts "Not Declared" and `total_yes`
# counts "Y". The final line checks those two values cover every non-null row.
col = "Raley College Tag Exists"
total_enrolled = int(df[col].count())
total_no = int((df[col] == "Not Declared").sum())
total_yes = int((df[col] == "Y").sum())
print(total_enrolled, total_no, total_yes, sep="\n")
print(f"Math check: {total_enrolled == (total_no + total_yes)}")

63712
24660
39052
Math check: True


In [29]:
# Break "Recruiting Territory" down by distinct value: print the row count for
# each value (in order of first appearance) and confirm that the per-value
# counts add up to the number of non-null rows in the column.
col = "Recruiting Territory"
acc = 0
total = df[col].count()
types = df[col].unique()
# The loop variable is deliberately NOT called `type`: a notebook loop variable
# stays bound in the kernel after the cell finishes, so it would shadow the
# built-in `type()` and break any later cell that calls it.
for territory in types:
    # Summing the boolean mask counts the rows equal to this value.
    count = (df[col] == territory).sum()
    acc += count
    print(f"Type: {territory}, count: {count}")
print(f"Math check: {total == acc}")

Type: T01, count: 2434
Type: Athlete, count: 462
Type: P02, count: 1531
Type: Homeschool, count: 274
Type: T04, count: 4482
Type: T03, count: 2994
Type: T02, count: 2212
Type: T06, count: 2888
Type: T07, count: 5034
Type: T05, count: 2733
Type: P01, count: 554
Type: Graduate, count: 3
Type: T00, count: 650
Type: Raley College, count: 37461
Math check: True
