In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

from collections import Counter
from collections import defaultdict

In [2]:
# Name of the target (label) column used throughout the notebook.
class_name = 'Occupancy'

# Load the training split; the file is comma-separated, which is read_csv's default.
df = pd.read_csv(r'../data/datatraining.txt')
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
1,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
3,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
4,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
5,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [0]:
# Parse the 'date' column into pandas datetimes using an explicit format string.
df['date'] =  pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

In [0]:
# Split the parsed timestamp into a time-of-day column and a calendar-date column.
df['Time'] = df['date'].dt.time
df['Date'] = df['date'].dt.date

In [0]:
# Convert each time-of-day value to its string form and parse it as a timedelta.
df['Time'] = pd.to_timedelta(df['Time'].map(str))

In [0]:
# Convert the 'Date' column (date objects produced above) to pandas datetimes.
df['Date'] =  pd.to_datetime(df['Date'], format='%Y-%m-%d')

In [0]:
# Schema and completeness overview of the frame.
print('Types of columns\n', df.dtypes)
print('\nNumber of columns', len(df.columns))
# DataFrame.get_dtype_counts() was deprecated in pandas 0.25 and removed in 1.0;
# counting the values of df.dtypes gives the same per-dtype tally on any version.
print('\nCount of column types\n', df.dtypes.value_counts())
print('\nAll values\n', df.count())
print('\nNull values for columns\n', df.isna().sum())

In [0]:
# Drop the raw timestamp now that 'Time' and 'Date' have been derived from it.
# Rebinding with errors='ignore' keeps the cell re-runnable: the previous
# in-place drop raised KeyError when executed a second time.
df = df.drop(columns='date', errors='ignore')

In [0]:
# Display the frame after the 'date' column has been dropped.
df

In [0]:
def prepare_dataset(df, class_name):
    """Run the full preparation pipeline on ``df`` for the target ``class_name``.

    Steps, in order: impute missing values, record the numeric column names,
    keep a deep copy of the imputed frame (``rdf``), encode the frame with
    ``one_hot_encoding``, then derive the "real" (pre-encoding) feature names
    and the map linking them to the encoded feature names.

    Returns:
        A 7-tuple ``(encoded_df, feature_names, class_values, numeric_columns,
        rdf, real_feature_names, features_map)``. ``rdf`` is restricted to the
        real feature columns followed by the class column(s).
    """
    imputed = remove_missing_values(df)
    numeric_columns = get_numeric_columns(imputed)
    rdf = imputed.copy(deep=True)

    encoded, feature_names, class_values = one_hot_encoding(imputed, class_name)
    real_feature_names = get_real_feature_names(rdf, numeric_columns, class_name)

    # The trailing column(s) of rdf are the target: class_values when a list was
    # passed as class_name, otherwise the single class column.
    if isinstance(class_name, list):
        target_columns = class_values
    else:
        target_columns = [class_name]
    rdf = rdf[real_feature_names + target_columns]

    features_map = get_features_map(feature_names, real_feature_names)
    return (encoded, feature_names, class_values, numeric_columns,
            rdf, real_feature_names, features_map)

def remove_missing_values(df):
    """Fill missing values column by column and return the same frame.

    Columns reported as numeric by ``df._get_numeric_data()`` are filled with
    the column mean; every other column is filled with its most frequent value
    (the first entry of ``mode()``). Columns without missing values are left
    untouched, as are non-numeric columns that contain no value at all (they
    have no mode to fill with).

    ``df`` is modified in place and also returned.
    """
    numeric_columns = df._get_numeric_data().columns
    missing_counts = df.isna().sum()

    for column_name in missing_counts[missing_counts > 0].index:
        column = df[column_name]
        if column_name in numeric_columns:
            fill_value = column.mean()
        else:
            modes = column.mode()
            if modes.empty:
                # Entirely missing column: nothing sensible to impute.
                continue
            fill_value = modes.values[0]
        # Assign the filled column back. ``df[col].fillna(..., inplace=True)``
        # operates on a temporary under pandas copy-on-write and would leave
        # ``df`` unchanged.
        df[column_name] = column.fillna(fill_value)
    return df

def get_numeric_columns(df):
    """Return the names of the columns pandas reports as numeric, as a list."""
    return [name for name in df._get_numeric_data().columns]

def get_real_feature_names(rdf, numeric_columns, class_name):
    """List the feature columns of ``rdf``: numeric ones first, then the rest.

    The column equal to ``class_name`` is excluded. Within each group the
    order of ``rdf.columns`` is kept.
    """
    numeric_part, other_part = [], []
    for column in rdf.columns:
        if column == class_name:
            continue
        bucket = numeric_part if column in numeric_columns else other_part
        bucket.append(column)
    return numeric_part + other_part

def one_hot_encoding(df, class_name):
    """One-hot encode the features of ``df`` and integer-encode its class column.

    Every column other than ``class_name`` is passed through
    ``pd.get_dummies`` with ``'='`` as the prefix separator (so a categorical
    column ``c`` yields columns such as ``c=x``). The class column is mapped to
    integer codes following the sorted order of its unique values.

    Returns:
        A tuple ``(encoded_df, feature_names, class_values)`` where
        ``encoded_df`` holds the encoded feature columns followed by the
        encoded class column, ``feature_names`` lists the encoded feature
        column names, and ``class_values`` is the sorted list of original
        class labels (a label's position in it is its integer code).
    """
    feature_columns = [c for c in df.columns if c != class_name]
    dfX = pd.get_dummies(df[feature_columns], prefix_sep='=')

    class_name_map = {v: k for k, v in enumerate(sorted(df[class_name].unique()))}
    dfY = df[class_name].map(class_name_map)

    # dfX and dfY both carry df's index, so a plain column-wise concat is already
    # aligned. The former ``join_axes=[dfX.index]`` argument was removed in
    # pandas 1.0 and made this call raise TypeError.
    df = pd.concat([dfX, dfY], axis=1)

    feature_names = list(dfX.columns)
    class_values = sorted(class_name_map)
    return df, feature_names, class_values

def get_features_map(feature_names, real_feature_names):
    """Group encoded feature names under the index of their originating feature.

    Both lists are walked in parallel. An encoded name is attached to the
    current real feature when it equals that feature's name or starts with it;
    otherwise the walk moves on to the next real feature.

    Returns:
        A ``defaultdict(dict)`` keyed by the real feature's position. Each
        value maps the encoded feature name to that same position.
    """
    features_map = defaultdict(dict)
    enc_pos, real_pos = 0, 0
    n_encoded, n_real = len(feature_names), len(real_feature_names)

    while enc_pos < n_encoded and real_pos < n_real:
        encoded = feature_names[enc_pos]
        real = real_feature_names[real_pos]

        if encoded == real or encoded.startswith(real):
            features_map[real_pos][encoded] = real_pos
            enc_pos += 1
            # An exact match means the feature was not expanded, so its single
            # encoded column is consumed and the real feature is done as well.
            if encoded == real:
                real_pos += 1
        else:
            real_pos += 1
    return features_map

In [0]:
# Run the preparation pipeline; note that df is rebound to the encoded frame.
res = prepare_dataset(df, class_name)
(df, feature_names, class_values, numeric_columns,
 rdf, real_feature_names, features_map) = res
df.head()

In [0]:
# One histogram per sensor variable; every plot uses int(log2(n)) + 1 bins.
n = len(df)
var = ['Temperature', 'Humidity', 'HumidityRatio', 'Light', 'CO2']
n_bins = int(np.log2(n)) + 1
for column in var:
    fig = plt.figure()
    df[column].hist(bins=n_bins)
    fig.suptitle(column)

In [0]:
# Bar chart of how many rows fall into each Occupancy class.
plt.figure()
df['Occupancy'].value_counts().plot.bar(title='Occupancy')

Una tra Humidity e HumidityRatio va eliminata a causa dell'alta correlazione tra le due (si veda la matrice di correlazione nella cella seguente). Abbiamo deciso di eliminare HumidityRatio perché presenta una correlazione più alta con le altre variabili e anche per una questione di interpretabilità: l'umidità relativa è una misura di cui anche un non esperto ha sicuramente già sentito parlare. Inoltre, HumidityRatio è una misura derivata da Humidity e Temperature.

In [0]:
# Pairwise correlation of the numeric columns only. The frame still holds the
# datetime 'Date' and timedelta 'Time' columns; recent pandas versions no longer
# drop non-numeric columns silently in corr(), so select them explicitly.
df[get_numeric_columns(df)].corr()

In [0]:
# Remove HumidityRatio. Rebinding with errors='ignore' keeps the cell
# re-runnable: `del df[...]` raised KeyError when executed a second time.
df = df.drop(columns='HumidityRatio', errors='ignore')

In [0]:
# Box plot for every int64/float64 column; fliers are drawn as green diamonds.
gd = {'markerfacecolor': 'g', 'marker': 'D'}
for column in df:
    if df[column].dtype not in ('int64', 'float64'):
        continue
    fig, ax = plt.subplots()
    ax.set_title('Outliers of ' + column)
    ax.boxplot(df[column], flierprops=gd)

In [0]:
# Per-column count of non-null values among rows whose CO2 reading is at least 1000.
df[df['CO2'].ge(1000)].count()

In [0]:
# Per-column count of non-null values among rows whose Light reading is at least 800.
df[df['Light'].ge(800)].count()

NameError: ignored

In [0]:
# Keep only the rows whose Light reading is at most 800.
# NOTE(review): the count cell above uses `>= 800`, so a reading of exactly 800
# is counted there but kept here — confirm which boundary is intended.
df = df[df['Light'].le(800)]

NameError: ignored