## DPS BATCH#21 AI Engineering Track Challenge ##

In [1]:
"""Importing all the required libraries & dependencies"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib

In [2]:
"""Loading the dataset from the CSV file"""
# Location of the CSV export, read into the working DataFrame `df`.
file_path = "monatszahlen2307_verkehrsunfaelle_10_07_23_nosum.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
"""Displaying the first few rows of the DataFrame (df) to check if the dataset loaded correctly"""

# Preview the top five rows as a quick sanity check of the load.
df.head(n=5)

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT,VORJAHRESWERT,VERAEND_VORMONAT_PROZENT,VERAEND_VORJAHRESMONAT_PROZENT,ZWOELF_MONATE_MITTELWERT
0,Alkoholunfälle,insgesamt,2023,202301,,,,,
1,Alkoholunfälle,insgesamt,2023,202302,,,,,
2,Alkoholunfälle,insgesamt,2023,202303,,,,,
3,Alkoholunfälle,insgesamt,2023,202304,,,,,
4,Alkoholunfälle,insgesamt,2023,202305,,,,,


In [4]:
"""Filtering the dataset to keep only the records up to and including the year 2020 (JAHR <= 2020)"""

# Keep rows whose JAHR is 2020 or earlier (the comparison is inclusive).
df = df.loc[df['JAHR'] <= 2020]

In [5]:
"""Displaying the first few rows of the filtered DataFrame (df) to check the changes made in the cell above"""

# Top five rows after the year filter.
df.head(n=5)

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT,VORJAHRESWERT,VERAEND_VORMONAT_PROZENT,VERAEND_VORJAHRESMONAT_PROZENT,ZWOELF_MONATE_MITTELWERT
36,Alkoholunfälle,insgesamt,2020,202001,28.0,22.0,-20.0,27.27,37.0
37,Alkoholunfälle,insgesamt,2020,202002,40.0,28.0,42.86,42.86,38.0
38,Alkoholunfälle,insgesamt,2020,202003,27.0,34.0,-32.5,-20.59,37.0
39,Alkoholunfälle,insgesamt,2020,202004,26.0,36.0,-3.7,-27.78,36.0
40,Alkoholunfälle,insgesamt,2020,202005,40.0,39.0,53.85,2.56,36.0


## Selecting Important Columns in the DataFrame

Columns:
- 'MONATSZAHL'
- 'AUSPRAEGUNG'
- 'JAHR'
- 'MONAT'
- 'WERT'

Making the DataFrame include only these important columns.

In [6]:
# Columns retained for the rest of the notebook; everything else is dropped.
features = [
    'MONATSZAHL',
    'AUSPRAEGUNG',
    'JAHR',
    'MONAT',
    'WERT',
]
df = df.loc[:, features]



## Renaming columns in the DataFrame for better clarity.

Column Renaming:
- 'MONATSZAHL' to 'Category'
- 'AUSPRAEGUNG' to 'AccidentType'
- 'JAHR' to 'Year'
- 'MONAT' to 'Month'
- 'WERT' to 'Value'

In [7]:
# Swap the German source headers for the English names used by the later cells.
df = df.rename(
    columns={
        'MONATSZAHL': 'Category',
        'AUSPRAEGUNG': 'AccidentType',
        'JAHR': 'Year',
        'MONAT': 'Month',
        'WERT': 'Value',
    }
)

In [8]:
"""Displaying the first few rows of the renamed DataFrame (df) to check the changes made in the cell above"""
# Top five rows, now with the English column names.
df.head(n=5)

Unnamed: 0,Category,AccidentType,Year,Month,Value
36,Alkoholunfälle,insgesamt,2020,202001,28.0
37,Alkoholunfälle,insgesamt,2020,202002,40.0
38,Alkoholunfälle,insgesamt,2020,202003,27.0
39,Alkoholunfälle,insgesamt,2020,202004,26.0
40,Alkoholunfälle,insgesamt,2020,202005,40.0



### Extracting the month number from the 'Month' column

The 'Month' column initially contains values like "202001", where the format is the year followed by the month. Each value is parsed as a date and only the month number (1–12) is kept.



In [9]:
# 'Month' starts out as a YYYYMM code (e.g. 202001). Parse it as a date and keep
# only the month number (1-12).
#
# The guard makes this cell safe to re-run: once converted, the column holds
# values <= 12, which no longer match the '%Y%m' format, so a second parse
# attempt would fail instead of being a no-op.
if pd.to_numeric(df['Month']).max() > 12:
    df['Month'] = pd.to_datetime(df['Month'], format='%Y%m').dt.month

# Show the first rows to verify the conversion.
df.head()

Unnamed: 0,Category,AccidentType,Year,Month,Value
36,Alkoholunfälle,insgesamt,2020,1,28.0
37,Alkoholunfälle,insgesamt,2020,2,40.0
38,Alkoholunfälle,insgesamt,2020,3,27.0
39,Alkoholunfälle,insgesamt,2020,4,26.0
40,Alkoholunfälle,insgesamt,2020,5,40.0


In [11]:
"""Resetting the index of the DataFrame."""

# Replace the leftover index from the year filter with a fresh index starting
# at 0. The old index is discarded (drop=True) rather than kept as a column.
# Rebinding `df` instead of using inplace=True leaves the previous object
# untouched.
df = df.reset_index(drop=True)

In [12]:
"""Displaying the first few rows of the DataFrame to verify the changes"""
# Top five rows, confirming the index now starts at 0.
df.head(n=5)

Unnamed: 0,Category,AccidentType,Year,Month,Value
0,Alkoholunfälle,insgesamt,2020,1,28.0
1,Alkoholunfälle,insgesamt,2020,2,40.0
2,Alkoholunfälle,insgesamt,2020,3,27.0
3,Alkoholunfälle,insgesamt,2020,4,26.0
4,Alkoholunfälle,insgesamt,2020,5,40.0



### Displaying information about the DataFrame and checking for null values.

In [13]:
# Per-column dtype and non-null summary.
df.info()

# Tally the missing entries in each column and print the result.
null_values = df.isna().sum()
print(null_values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1764 entries, 0 to 1763
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Category      1764 non-null   object 
 1   AccidentType  1764 non-null   object 
 2   Year          1764 non-null   int64  
 3   Month         1764 non-null   int32  
 4   Value         1764 non-null   float64
dtypes: float64(1), int32(1), int64(1), object(2)
memory usage: 62.1+ KB
Category        0
AccidentType    0
Year            0
Month           0
Value           0
dtype: int64
