# WDL 2021 - Final

# Introduction

# Data pre-processing

## Import libraries and customize seaborn

In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import pickle
import requests
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from math import floor, sin, cos, sqrt, atan2, radians, asin
#from keras.layers import Dense, Dropout, Flatten
#from keras.layers.convolutional import Conv1D, MaxPooling1D
#from keras.models import Sequential
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, MinMaxScaler
#import folium
from itertools import combinations
#import networkx as nx

# Show up to 100 rows when a DataFrame is displayed
pd.set_option('display.max_rows', 100)
# Use seaborn's "whitegrid" theme for every plot in the notebook
sns.set_theme(style="whitegrid")
# Default figure size (width, height); passed as `figsize` by bar_plot
fig_dim = (16,9)

# Directory holding the san_salvario_<year>.csv files and the pickled frames
base = 'Data/noise_data/csv_format'
#base = '/home/bsilva/Desktop'
#base = r'C:\Users\Carolina Alves\OneDrive - Universidade de Aveiro\WDL competition\Stage 3/'

## Auxiliary functions

In [None]:
# bar plot function
def bar_plot(_x, _y, _data):
    """Draw a seaborn bar plot of ``_y`` against ``_x`` on a new figure.

    Parameters
    ----------
    _x, _y : str
        Names of the columns of ``_data`` used for the x and y axes.
    _data : pandas.DataFrame
        Frame that contains both columns.

    The figure is created with the notebook-wide ``fig_dim`` size and the
    x tick labels are rotated by 90 degrees. Nothing is returned.
    """
    fig, axs = plt.subplots(figsize=fig_dim)
    # Draw on the axes created above instead of relying on pyplot's
    # implicit "current axes" state.
    sns.barplot(x=_x, y=_y, data=_data, ax=axs)
    plt.xticks(rotation=90)
    plt.draw()
    
# Encodes categorical features as ordinal integer codes
def encode_categorical_features(_df, features):
    """Return a fitted ``OrdinalEncoder`` and an encoded copy of ``_df``.

    Parameters
    ----------
    _df : pandas.DataFrame
        Input frame; it is copied, not modified.
    features : list
        Columns to replace with their ordinal codes.

    Returns
    -------
    tuple
        ``(encoder, encoded_frame)`` where the ``features`` columns of
        ``encoded_frame`` hold the encoder's codes cast to ``int``.
    """
    encoder = OrdinalEncoder()
    encoded = _df.copy()
    codes = encoder.fit_transform(encoded[features])
    encoded[features] = codes.astype(int)

    return encoder, encoded

def correlation_matrix(df):
    """Draw an annotated heatmap of the pairwise correlations in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric (and boolean) columns are correlated.

    Only numeric/boolean columns are used: since pandas 2.0,
    ``DataFrame.corr()`` no longer drops non-numeric columns silently and
    raises on them instead. Nothing is returned.
    """
    cor = df.select_dtypes(include=["number", "bool"]).corr()
    fig, ax = plt.subplots(figsize=(6,5))
    # Draw on the axes created above rather than on the implicit current axes.
    sns.heatmap(cor, annot=True, ax=ax)

## Loading the dataset

In [19]:
# Opening and saving the original dataset
# We converted the file from csv to binary in order to read it faster
# You do not need to run this cell
def saving_dfs_as_binary_files(base, years=range(2016, 2022)):
    """Concatenate the yearly CSV files and store them as one pickle.

    Parameters
    ----------
    base : str
        Directory containing ``san_salvario_<year>.csv`` (``;``-separated);
        the pickle is written to ``<base>/df_raw``.
    years : iterable of int, optional
        Years to load, in order. Defaults to 2016 through 2021.

    The row index of each yearly file is kept as read, so index values
    repeat across years in the stored frame.
    """
    # Read every file first and concatenate once, instead of growing the
    # frame inside the loop.
    yearly_frames = [
        pd.read_csv(base+'/'+"san_salvario_" + str(year) + ".csv", sep=";")
        for year in years
    ]
    df = pd.concat(yearly_frames)

    # Context manager guarantees the file is flushed and closed.
    with open(base+'/'+"df_raw", "wb") as raw_file:
        pickle.dump(df, raw_file)
        
# Rebuilds <base>/df_raw from the yearly CSV files found under `base`
saving_dfs_as_binary_files(base)

In [42]:
# Opening data
def get_sensors_data(base_dir=None):
    """Load the pickled raw sensor readings and return a typed frame.

    Parameters
    ----------
    base_dir : str, optional
        Directory containing the ``df_raw`` pickle. Defaults to the
        notebook-level ``base`` path.

    Returns
    -------
    pandas.DataFrame
        Columns ``date, year, month, day, hour, C1..C5``. ``C1``-``C5`` are
        converted from decimal-comma strings to floats, ``date`` is parsed
        from ``Data`` (``%d-%m-%Y``), and ``hour`` is ``Ora`` with the
        ``:00`` suffix removed, as an integer.
    """
    if base_dir is None:
        base_dir = base

    # NOTE: pickle can execute arbitrary code on load; only open files that
    # were produced locally. The context manager closes the file handle.
    with open(base_dir+'/'+"df_raw", "rb") as raw_file:
        df = pickle.load(raw_file)

    # regex=False: these are literal replacements, and the default of
    # `regex` differs between pandas versions.
    for col in ['C1', 'C2', 'C3', 'C4', 'C5']:
        df[col] = df[col].str.replace(',', '.', regex=False).astype(float)

    df['date'] = pd.to_datetime(df["Data"], format='%d-%m-%Y')
    df['year'] = df["date"].dt.year
    df['month'] = df["date"].dt.month
    df['day'] = df["date"].dt.day
    df['hour'] = df['Ora'].str.replace(':00', '', regex=False).astype(int)

    return df[['date', 'year', 'month', 'day', 'hour', 'C1', 'C2', 'C3', 'C4', 'C5']]
    
# Load the sensor readings and display the resulting frame
df = get_sensors_data()
df

Unnamed: 0,date,year,month,day,hour,C1,C2,C3,C4,C5
0,2016-06-01,2016,6,1,0,61.1,58.8,63.7,60.4,60.2
1,2016-06-01,2016,6,1,1,57.2,56.1,60.4,57.7,58.4
2,2016-06-01,2016,6,1,2,52.5,53.4,52.9,56.7,56.5
3,2016-06-01,2016,6,1,3,50.6,46.6,45.7,47.9,55.6
4,2016-06-01,2016,6,1,4,47.4,46.2,45.1,45.8,56.8
...,...,...,...,...,...,...,...,...,...,...
4147,2021-06-22,2021,6,22,19,54.4,56.2,58.8,,
4148,2021-06-22,2021,6,22,20,54.7,53.9,58.0,,
4149,2021-06-22,2021,6,22,21,53.6,48.8,62.5,,
4150,2021-06-22,2021,6,22,22,54.5,55.6,63.9,,


In [5]:
# (rows, columns) of the sensor frame.
# NOTE(review): the stored output below reports 7 columns, but
# get_sensors_data() returns 10 — re-run this cell on a fresh kernel.
df.shape

(8760, 7)

## Data types, uniques and NaN information 

**Checking type of data, number of unique values and the presence of missing values and zeros**

In [None]:
def eda_to_df(df):
    """Print a fixed-width summary table with one row per column of ``df``.

    For every column the table lists its dtype, the number of unique values
    (as counted by ``Series.unique``), whether it contains NaN, the count and
    percentage of NaN values, and the count and percentage of values equal
    to 0. The table is printed; nothing is returned.
    """
    rule = "+" + ("-"*112) + "+"
    row_template = "|{:^25s}|{:^10s}|{:^10s}|{:^10s}|{:^15s}|{:^10s}|{:^15s}|{:^10s}|"
    titles = ("Column", "Type", "Uniques", "NaN?", "Number of NaN" ,"%NaN", "Number of 0s" ,"%0s")

    print(rule)
    print(row_template.format(*titles))
    print(rule)
    for name in df.columns:
        series = df[name]
        n_rows = len(series)
        missing = series.isnull()
        n_missing = missing.sum()
        n_zeros = (series == 0).sum()
        cells = [
            name,                                  # Column
            series.dtypes,                         # Type
            len(series.unique()),                  # Uniques
            missing.values.any(),                  # NaN?
            n_missing,                             # Number of NaNs
            round((n_missing/n_rows)*100, 5),      # %NaN
            n_zeros,                               # Number of 0's
            round((n_zeros/n_rows)*100, 5),        # %0's
        ]
        # The template only accepts strings, so stringify every cell.
        print(row_template.format(*map(str, cells)))
    print(rule)
    
    

# Print the per-column summary for the current `df`
eda_to_df(df)

**TODO:** add the analysis of the summary table above.

**Removing columns if needed**

In [None]:
#df = df.drop(['Importe', 'DescripcionImporte'], axis=1)

**Removing NaN if needed**

In [None]:
# Keep only the rows that have a value in 'Id_Aparcamiento_Destino'.
# NOTE(review): this column is not among those returned by
# get_sensors_data(), so this looks like a leftover from another dataset —
# confirm the intended column before running.
df = df[df['Id_Aparcamiento_Destino'].notna()]

In [None]:
# Saving cleaned dataframe
# NOTE(review): `df_clean_` and `path` are not defined in the visible part of
# this notebook (the data directory is called `base` elsewhere) — confirm.
# The context manager makes sure the pickle file is flushed and closed.
with open(path+'/'+"df_clean", "wb") as clean_file:
    pickle.dump(df_clean_, clean_file)

# Exploratory Data Analysis

**TODO (review):** the cell below was kept from an earlier project in case similar processing is needed here.

In [None]:
def get_loans_clean(base):
    """Load ``<base>/df_clean`` and return the loans frame with derived fields.

    Parameters
    ----------
    base : str
        Directory that contains the ``df_clean`` pickle.

    Returns
    -------
    pandas.DataFrame
        The nine loan columns renamed to English, station IDs cast to int,
        loan timestamps parsed, plus year/month/day/hour of start and end,
        the loan duration in hours and the start time floored to the hour.
    """
    # Original column -> English name. The same bike can have multiple tags
    # over the year.
    column_names = {
        "Id_Historico_Prestamo": "Loan ID",
        "Id_Usuario": "User ID",
        "Id_Tag_Bicicleta": "Tag ID",
        "Fecha_Prestamo": "Start loan",
        "Fecha_Devolucion": "End loan",
        "Id_Aparcamiento_Origen": "ID start station",
        "Posicion_Origen": "Position start",
        "Id_Aparcamiento_Destino": "ID end station",
        "Posicion_Destino": "Position end",
    }

    # Selecting with the mapping's keys keeps only these columns, in this order.
    loans = pd.read_pickle(base + '/'+'df_clean')[list(column_names)]
    loans = loans.rename(columns=column_names)

    for station_col in ('ID end station', 'ID start station'):
        loans[station_col] = loans[station_col].astype(int)

    timestamp_format = '%Y-%m-%d %H:%M:%S.%f'
    for loan_col in ("Start loan", "End loan"):
        loans[loan_col] = pd.to_datetime(loans[loan_col], format=timestamp_format)

    started = loans["Start loan"]
    ended = loans["End loan"]

    # Auxiliary calendar fields: <part>_prestamo from the start of the loan,
    # <part>_devolucion from its end.
    for suffix, stamps in (("prestamo", started), ("devolucion", ended)):
        for part in ("year", "month", "day", "hour"):
            loans[part + "_" + suffix] = getattr(stamps.dt, part)

    loans['duration_hour'] = (ended - started).dt.total_seconds() / (60*60)
    loans['Fecha_Prestamo_htruncate'] = started.dt.floor('h')

    return loans

# NOTE(review): this rebinds `df`, which earlier in the notebook holds the
# sensor readings, to the loans frame.
df = get_loans_clean(base)
df.head()

# Another chapter

# Noise mitigation measures

- have the police carry out patrols / organise enforcement
- set maximum occupancy limits for venues

# Conclusions

## Scalability and Impact

## Future Work

# References