# Question 1


In [129]:
import pandas as pd
from datetime import datetime, timedelta
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from sklearn.metrics import mean_absolute_error
import re

# Load the raw parking-lot records. The Model class below reads the
# 'timestamp', 'vehicle_no' and 'camera_id' columns from this frame.
parkingData = pd.read_csv('parkingLot.csv')

Finding irregularities in timestamps

In [140]:
class Model:
    """Clean raw parking-lot camera records and derive per-camera daily counts.

    The wrapped frame must provide ``timestamp``, ``vehicle_no`` and
    ``camera_id`` columns. The cleaning steps are independent of each other
    and each one replaces ``self.data`` with the filtered result.
    """

    # First hour of the day (inclusive) that counts as working time.
    OPENING_HOUR = 5
    # Valid plate: "MH", two uppercase letters, four digits -- and nothing
    # after that. The trailing anchor matters because ``str.match`` only
    # anchors at the start of the string.
    PLATE_PATTERN = r'^MH[A-Z]{2}\d{4}$'

    def __init__(self, data):
        """Parse the ``timestamp`` column and sort the records chronologically.

        Note: no copy is taken, so ``data`` itself is modified in place
        (its ``timestamp`` column is converted to datetimes and its rows are
        re-ordered).

        Args:
            data: DataFrame of raw camera records.
        """
        self.data = data
        self.data['timestamp'] = pd.to_datetime(self.data['timestamp'])
        self.data.sort_values(by='timestamp', inplace=True)

    def filterOutOfTime(self):
        """Drop rows recorded outside working hours (5am up to midnight).

        Only the lower bound needs checking: every time of day is before
        midnight, and comparing against 23:59:59 would wrongly discard
        timestamps carrying fractional seconds in the last second of the day.
        Rows whose timestamp is missing (NaT) fail the comparison and are
        dropped as well.
        """
        in_hours = self.data['timestamp'].dt.hour >= self.OPENING_HOUR
        # Explicit copy so later column assignments act on an independent
        # frame instead of a slice (avoids SettingWithCopyWarning).
        self.data = self.data[in_hours].copy()

    def correctPlate(self):
        """Drop rows whose ``vehicle_no`` is not exactly MH + 2 uppercase + 4 digits.

        Missing plate values are treated as invalid and removed.
        """
        valid_plate = self.data['vehicle_no'].str.match(self.PLATE_PATTERN, na=False)
        self.data = self.data[valid_plate].copy()

    def dailyCounts(self):
        """Count records per camera per calendar day.

        Adds a ``date`` column to ``self.data`` and stores the result in
        ``self.daily_counts``: a dict mapping each ``camera_id`` to a frame
        with columns ``camera_id``, ``date`` and ``vehicle_count``.
        """
        self.data['date'] = self.data['timestamp'].dt.date
        per_camera_day = (
            self.data.groupby(['camera_id', 'date'])
            .size()
            .reset_index(name='vehicle_count')
        )

        self.daily_counts = {
            camera_id: counts.reset_index(drop=True)
            for camera_id, counts in per_camera_day.groupby('camera_id')
        }

    def split(self, metric, test_ratio = 0.2):
        """Split ``self.data[metric]`` positionally into train and test parts.

        Rows keep their current order (chronological after ``__init__``), so
        the test part is the trailing ``test_ratio`` share of the rows.

        Args:
            metric: Column label to extract from ``self.data``.
            test_ratio: Fraction of rows reserved for testing.

        Returns:
            Tuple ``(train, test)`` holding the leading and trailing rows of
            the selected column.
        """
        train_size = int(len(self.data) * (1 - test_ratio))
        values = self.data[metric]

        return values.iloc[:train_size], values.iloc[train_size:]

# Run the cleaning pipeline. Model works on parkingData itself (no copy is
# taken), so the timestamp parsing and sorting also apply to parkingData.
model = Model(parkingData)
# Keep only records captured during working hours (from 5am onwards).
model.filterOutOfTime()
# Discard records whose vehicle_no does not follow the expected MH plate format.
model.correctPlate()
# Populate model.daily_counts with one daily-count table per camera_id.
model.dailyCounts()



The relevant data cleaning is now complete.