# Data Preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
import matplotlib
plt.style.use('ggplot')
from matplotlib.pyplot import figure
import numpy as np
import seaborn as sns

In [None]:
def covert_to_lower(df):
    """Lower-case the string values of every text column of ``df`` in place.

    Columns with ``object`` dtype (or a pandas string dtype) are processed;
    every other column is left untouched.  Only values that are real ``str``
    instances are changed, so numbers, ``None``/``NaN`` and other objects
    stored in a mixed column are preserved.  (``Series.str.lower()`` would
    replace such non-string values with ``NaN`` and raises outright on an
    object column that holds no strings at all.)

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    for name in df.columns:
        column = df[name]
        # Check the dtype (not the values) so mixed object columns still qualify.
        if not pd.api.types.is_string_dtype(column.dtype):
            continue
        df[name] = column.map(
            lambda value: value.lower() if isinstance(value, str) else value
        )
        

def rename_drop_cols(df):
    """Tidy up merge-suffixed columns of ``df`` in place.

    ``_x`` and ``_y`` are the default suffixes ``pandas.merge`` appends to
    overlapping column names.  Columns whose name *ends with* ``_y`` are
    dropped, and columns whose name *ends with* ``_x`` are renamed with that
    suffix removed.  Matching only the suffix keeps unrelated columns that
    merely contain ``_x``/``_y`` somewhere in their name (for example
    ``my_year``) from being deleted or truncated.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    cols_to_delete = []
    rename_map = {}
    for name in df.columns:
        # Non-string labels (e.g. integer column names) cannot carry a suffix.
        if not isinstance(name, str):
            continue
        if name.endswith('_y'):
            cols_to_delete.append(name)
        elif name.endswith('_x'):
            rename_map[name] = name[:-len('_x')]

    df.drop(columns=cols_to_delete, inplace=True)
    df.rename(columns=rename_map, inplace=True)



def splitdata(df,col,new_col,delimeter):
    """Store in ``df[new_col]`` the part of ``df[col]`` before ``delimeter``.

    For string values the text up to (not including) the first occurrence of
    ``delimeter`` is kept; a string without the delimiter is copied as is.
    Values that are not strings (``None``/``NaN``, numbers, ...) are copied
    through unchanged instead of raising ``AttributeError``.

    Args:
        df: DataFrame to modify in place.
        col: Name of the existing column to read.
        new_col: Name of the column to create or overwrite.
        delimeter: Non-empty separator string.

    Returns:
        None.
    """
    def _prefix(value):
        if not isinstance(value, str):
            return value
        # partition() yields the whole string when the separator is absent.
        return value.partition(delimeter)[0]

    df[new_col] = df[col].apply(_prefix)


def setdefaultmissingvalues(df):
    """Fill missing values in ``df`` in place with a per-dtype default.

    * float columns (any precision)  -> ``0.0``
    * datetime columns (any unit/tz) -> forward fill from the previous row
      (a missing value in the first row has nothing to copy and stays missing)
    * text columns (``object`` or pandas string dtype) -> ``'_MISSING_'``

    Columns of any other dtype are left untouched.  The dtype is tested with
    the ``pandas.api.types`` predicates rather than by comparing against the
    exact names ``'float64'`` / ``'O'`` / ``'datetime64[ns]'``, which would
    silently skip e.g. ``float32`` or timezone-aware datetime columns.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None.
    """
    types = pd.api.types
    for name in df.columns:
        dtype = df[name].dtype
        if types.is_float_dtype(dtype):
            df[name] = df[name].fillna(0.0)
        elif types.is_datetime64_any_dtype(dtype):
            df[name] = df[name].ffill()
        elif types.is_string_dtype(dtype):
            df[name] = df[name].fillna('_MISSING_')

def plotdata(dataframe):
    """Visualise and report how much data is missing in each column.

    Draws a seaborn bar chart with one bar per column of ``dataframe`` whose
    height is that column's number of null values, shows the figure, and then
    prints one ``<column> - <percent>%`` line per column with the rounded
    percentage of null values.

    Args:
        dataframe: DataFrame to inspect; it is not modified.

    Returns:
        None.
    """
    null_counts = dataframe.isnull().sum()
    summary = pd.DataFrame({'name': null_counts.index, 'count': null_counts.values})

    # Bar chart of the null count per column, column names on the x axis.
    sns.set(style="whitegrid", color_codes=True)
    sns.barplot(data=summary, x='name', y='count', hue=summary['count'])
    plt.xticks(rotation=90)
    plt.show()

    # Percentage of missing entries per column, rounded to a whole number.
    for column_name in dataframe.columns:
        fraction_missing = np.mean(dataframe[column_name].isnull())
        percent_missing = round(fraction_missing * 100)
        print('{} - {}%'.format(column_name, percent_missing))

In [None]:
# loading data
# All project workbooks live in the same folder; the folder is named once so
# the notebook can be pointed at another location with a single edit.
DATA_DIR = 'C:/Users/deshp/Desktop/study material/DS/project/MAST90106/project-data'

uom_space_df = pd.read_excel(DATA_DIR + '/uom-space.xlsx')
rm_category_type_df = pd.read_excel(DATA_DIR + '/rm-category-type-py-compatible.xlsx')
em_location_df = pd.read_excel(DATA_DIR + '/em-location.xlsx')
av_equipment_df = pd.read_excel(DATA_DIR + '/av-equipment.xlsx')
timetable_2020_df = pd.read_excel(DATA_DIR + '/2020-timetable-v2.xlsx')


In [None]:
# covert_to_lower(uom_space_df)
# covert_to_lower(rm_category_type_df)
# covert_to_lower(em_location_df)
# covert_to_lower(timetable_2020_df)
# covert_to_lower(av_equipment_df)

In [None]:
# Preview the first rows of the UoM space table.
uom_space_df.head()

In [None]:
# Chart the null count per column of the UoM space table and print the
# percentage of missing values for each column.
plotdata(uom_space_df)

In [None]:
# Replace missing values (Room Name, Room Condition, Room Capacity) with defaults:
# 0 for Room Capacity and '_MISSING_' for Room Condition and Room Name.
# Excluding data such as Room Name, which has over 90% missing values, would make the result skewed or less accurate.
# TODO: discuss with the other team members to get their take on this.

setdefaultmissingvalues(uom_space_df)

In [None]:
# Clean the room category data.
# Start by previewing the first rows of the room category/type table.
rm_category_type_df.head()

In [None]:
plotdata(rm_category_type_df)
# The room type data is clean, so it needs no further cleaning.
# Because this table is joined to the UoM space data for the analysis,
# we check for missing data on the join between the room category and UoM space datasets instead.

In [None]:
# Normalise the text columns, then left-join the room category lookup onto
# the space table on 'Room Type'. The key is cast to string on the space
# table and stripped of surrounding whitespace on both tables.
uom_space_df['Room Type'] = (
    uom_space_df['Room Type'].astype('str').str.strip()
)
rm_category_type_df['Room Type'] = (
    rm_category_type_df['Room Type'].str.strip()
)
rm_category_type_df['Room Category'] = (
    rm_category_type_df['Room Category'].str.strip()
)
rm_category_type_df['Room Type Definition'] = (
    rm_category_type_df['Room Type Definition'].str.lower().str.strip()
)
joined_uom_space_df = uom_space_df.merge(
    rm_category_type_df,
    how='left',
    on=['Room Type'],
)
# Show (rows, columns) of the joined table.
joined_uom_space_df.shape


In [None]:
# Remove the '_y' columns and strip the '_x' suffix from column names of the
# joined table, then re-check the missing values per column.
rename_drop_cols(joined_uom_space_df)
plotdata(joined_uom_space_df)

In [None]:
# Preview the first rows of the em-location table.
em_location_df.head()

In [None]:
# Chart and print the missing values per column of the em-location table.
plotdata(em_location_df)

In [None]:
# Derive 'Generic Room Code' as the part of 'Room Code' before the first '.'.
splitdata(em_location_df,'Room Code','Generic Room Code','.')
# Remove '_y' columns and strip the '_x' suffix from the column names.
rename_drop_cols(em_location_df)

In [None]:
# Preview the first 50 rows of the AV equipment table.
av_equipment_df.head(50)
# Open question: does a floor code of 0.1 mean the basement floor?


In [None]:
# Chart and print the missing values per column of the AV equipment table.
plotdata(av_equipment_df)

In [None]:
# Convert 'Scheduled Finish as end time' to a numeric dtype and preview the table.
# (The columns that are not needed are deleted from the df in the next cell.)
timetable_2020_df['Scheduled Finish as end time'] = pd.to_numeric(timetable_2020_df['Scheduled Finish as end time'])
timetable_2020_df.head()

In [None]:
# Delete the timetable columns that are not used, in a single call.
# Unlike four separate `del` statements, this either removes all of them or,
# if a name is missing, raises KeyError before the frame is changed.
timetable_2020_df.drop(
    columns=[
        'Name of Children',
        'Name of Allocated Locations of Children',
        'Scheduled?',
        'Booking?',
    ],
    inplace=True,
)

In [None]:
# Chart and print the missing values per column of the 2020 timetable.
plotdata(timetable_2020_df)

In [None]:
# Insert default values in place of the missing entries. This data will be used for predicting the use of toilets
# depending on the class times, so we cannot afford to lose it.
# Open question: what is the impact of removing the null data (9204) from the timetable dataset (131857)?
setdefaultmissingvalues(timetable_2020_df)