In [2]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# save dataset as data
data = pd.read_csv("[Fillout] METU and Career Expectation Survey results.csv")

In [4]:
# Rename column headers
data.rename(columns = {
    "Submission ID": "submission_id",
    "Last updated": "last_updated",
    "Submission started": "submission_started",
    "Status": "status",
    "Current step": "current_step",
    "How old are you?": "age",
    "What is your gender?": "gender",
    "What is your department?": "department",
    "What is your current academic grade?": "academic_year",
    "Where do you currently live?": "current_residence",
    "Where do you prefer to study the most?": "study_preference",
    "What is your GPA?": "gpa",
    "On average, how many hours of sleep do you get per night?": "sleep_hours",
    "On average, how many hours per week do you spend on sports or physical activities?(Please enter a number)": "sport_hours",
    "How many cigarettes do you smoke on average per day?(If you don't smoke, please put 0)": "cigarettes_per_day",
    "How many cups of coffee do you drink per day?(Please enter a number)": "coffee_cups",
    "Please rate your department satisfaction": "department_satisfaction",
    "What are the biggest obstacles to your academic and/or professional development?": "development_obstacles",
    "How long do you expect it will take you to find a job after graduation?": "job_search_duration",
    "Do you plan to pursue further education after graduation?": "higher_education",
    "What type of work arrangement do you prefer after graduation?": "work_style",
    "Where do you plan to work after graduation?": "country_preference",
    "Do you plan to work in the public sector or the private sector after graduation?": "sector_preference",
    "Do you plan to work in your department’s field or in a different sector after graduation?": "field_preference",
    "In which city would you like to work after graduation?": "city_preference",
    "What is your expected monthly salary after graduation? (in thousands of Turkish Liras, e.g., 30 for 30.000 TRY)": "salary_expectation",
    "Errors": "errors",
    "Url": "url",
    "Network ID": "network_id"
}, inplace = True)

In [5]:
# drop unnecessary columns
df = data.drop(['last_updated', 'submission_started', 'status',
           'current_step', 'errors', 'url', 'network_id'], axis=1) 

In [6]:
# Data is ordered from the last survey to the first. Flip the data
df = df.sort_index(ascending=False).reset_index(drop=True).drop("submission_id", axis=1)
# Now the data is ordered from the first survey to the last.

In [7]:
df.groupby("department")["gender"].value_counts()

department                                 gender           
Aerospace Engineering\n                    Female                2
                                           Male\n                2
Chemical Engineering\n                     Female                1
                                           Male\n                1
Civil Engineering                          Male\n               13
                                           Female                4
Computer Engineering                       Male\n                7
                                           Female                1
Electrical and Electronics Engineering\n   Male\n                7
                                           Female                4
                                           Prefer not to say     1
Environmental Engineering                  Female                1
Food Engineering\n                         Female                2
Industrial Engineering                     Male\n               11
 

In [8]:
df.columns

Index(['age', 'gender', 'department', 'academic_year', 'current_residence',
       'study_preference', 'gpa', 'sleep_hours', 'sport_hours',
       'cigarettes_per_day', 'coffee_cups', 'department_satisfaction',
       'development_obstacles', 'job_search_duration', 'higher_education',
       'work_style', 'country_preference', 'sector_preference',
       'field_preference', 'city_preference', 'salary_expectation'],
      dtype='object')

In [9]:
# Check the dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      90 non-null     int64  
 1   gender                   90 non-null     object 
 2   department               90 non-null     object 
 3   academic_year            90 non-null     object 
 4   current_residence        90 non-null     object 
 5   study_preference         90 non-null     object 
 6   gpa                      90 non-null     object 
 7   sleep_hours              90 non-null     object 
 8   sport_hours              88 non-null     float64
 9   cigarettes_per_day       89 non-null     float64
 10  coffee_cups              87 non-null     float64
 11  department_satisfaction  90 non-null     int64  
 12  development_obstacles    76 non-null     object 
 13  job_search_duration      90 non-null     object 
 14  higher_education         90 

In [10]:
# Correct the data types
conversion_dic = {
    "gender": "category",
    "department": "category",
    "academic_year": "category",
    "current_residence": "category",
    "study_preference": "category",
    "gpa": "category",
    "sleep_hours": "category",
    "job_search_duration": "category",
    "higher_education": "category",
    "work_style": "category",
    "country_preference": "category",
    "sector_preference": "category",
    "field_preference": "category",
}

df = df.astype(conversion_dic)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      90 non-null     int64   
 1   gender                   90 non-null     category
 2   department               90 non-null     category
 3   academic_year            90 non-null     category
 4   current_residence        90 non-null     category
 5   study_preference         90 non-null     category
 6   gpa                      90 non-null     category
 7   sleep_hours              90 non-null     category
 8   sport_hours              88 non-null     float64 
 9   cigarettes_per_day       89 non-null     float64 
 10  coffee_cups              87 non-null     float64 
 11  department_satisfaction  90 non-null     int64   
 12  development_obstacles    76 non-null     object  
 13  job_search_duration      90 non-null     category
 14  higher_educa

In [11]:
anomality_dic = {} 
for i, col in enumerate(df.select_dtypes(["category", "object"])):
    if df[col].str.contains("\n").any():
        anomality_dic[col] = True
    else:
        anomality_dic[col] = False

anomality_dic

{'gender': True,
 'department': True,
 'academic_year': False,
 'current_residence': True,
 'study_preference': True,
 'gpa': False,
 'sleep_hours': True,
 'development_obstacles': True,
 'job_search_duration': True,
 'higher_education': False,
 'work_style': False,
 'country_preference': True,
 'sector_preference': False,
 'field_preference': False,
 'city_preference': False}

In [12]:
for key, value in anomality_dic.items():
    if value == True & (key != "development_obstacles" | key != "city_preference"):
        df[key] = df[key].str.rstrip("\n").astype("category")

In [13]:
# Check if the dataframe is clean of \n's
check_list = []

for i, col in enumerate(df.select_dtypes(["category", "object"])):
    if col == "development_obstacles": # devolepment_obstacle is not important
        continue
    if df[col].str.contains("\n").any():
        check_list.append(True)
    else:
        check_list.append(False)

any(check_list) 

False

In [14]:
df.to_csv("Cleaned Data.csv")

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype   
---  ------                   --------------  -----   
 0   age                      90 non-null     int64   
 1   gender                   90 non-null     category
 2   department               90 non-null     category
 3   academic_year            90 non-null     category
 4   current_residence        90 non-null     category
 5   study_preference         90 non-null     category
 6   gpa                      90 non-null     category
 7   sleep_hours              90 non-null     category
 8   sport_hours              88 non-null     float64 
 9   cigarettes_per_day       89 non-null     float64 
 10  coffee_cups              87 non-null     float64 
 11  department_satisfaction  90 non-null     int64   
 12  development_obstacles    76 non-null     category
 13  job_search_duration      90 non-null     category
 14  higher_educa

In [19]:
df["city_preference"].cat.categories

AttributeError: Can only use .cat accessor with a 'category' dtype